Repository: triton-inference-server/backend Branch: main Commit: 3f57fb90bee1 Files: 68 Total size: 503.3 KB Directory structure: gitextract_l9oqlsxk/ ├── .clang-format ├── .github/ │ └── workflows/ │ └── pre-commit.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CMakeLists.txt ├── LICENSE ├── README.md ├── cmake/ │ ├── TritonBackendConfig.cmake.in │ └── define.cuda_architectures.cmake ├── docs/ │ ├── backend_platform_support_matrix.md │ └── python_based_backends.md ├── examples/ │ ├── README.md │ ├── backends/ │ │ ├── bls/ │ │ │ ├── CMakeLists.txt │ │ │ ├── README.md │ │ │ ├── cmake/ │ │ │ │ └── TritonBLSBackendConfig.cmake.in │ │ │ └── src/ │ │ │ ├── backend.cc │ │ │ ├── bls.cc │ │ │ ├── bls.h │ │ │ ├── bls_utils.cc │ │ │ ├── bls_utils.h │ │ │ └── libtriton_bls.ldscript │ │ ├── minimal/ │ │ │ ├── CMakeLists.txt │ │ │ ├── cmake/ │ │ │ │ └── TutorialMinimalBackendConfig.cmake.in │ │ │ └── src/ │ │ │ ├── libtriton_minimal.ldscript │ │ │ └── minimal.cc │ │ └── recommended/ │ │ ├── CMakeLists.txt │ │ ├── cmake/ │ │ │ └── TutorialRecommendedBackendConfig.cmake.in │ │ └── src/ │ │ ├── libtriton_recommended.ldscript │ │ └── recommended.cc │ ├── batching_strategies/ │ │ ├── single_batching/ │ │ │ ├── CMakeLists.txt │ │ │ ├── cmake/ │ │ │ │ └── triton-single-batching.cmake.in │ │ │ └── src/ │ │ │ ├── libtriton_singlebatching.ldscript │ │ │ └── single_batching.cc │ │ └── volume_batching/ │ │ ├── CMakeLists.txt │ │ ├── cmake/ │ │ │ └── triton-volume-batching.cmake.in │ │ └── src/ │ │ ├── libtriton_volumebatching.ldscript │ │ └── volume_batching.cc │ ├── clients/ │ │ ├── bls_client │ │ ├── minimal_client │ │ └── recommended_client │ └── model_repos/ │ ├── bls_models/ │ │ ├── addsub_onnx/ │ │ │ ├── 1/ │ │ │ │ └── model.onnx │ │ │ └── config.pbtxt │ │ ├── addsub_python/ │ │ │ ├── 1/ │ │ │ │ └── model.py │ │ │ └── config.pbtxt │ │ └── bls_fp32/ │ │ └── config.pbtxt │ ├── minimal_models/ │ │ ├── batching/ │ │ │ ├── 1/ │ │ │ │ └── .gitkeep │ │ │ └── config.pbtxt │ │ └── 
nonbatching/ │ │ ├── 1/ │ │ │ └── .gitkeep │ │ └── config.pbtxt │ └── recommended_models/ │ └── batching/ │ ├── 1/ │ │ └── .gitkeep │ └── config.pbtxt ├── include/ │ └── triton/ │ └── backend/ │ ├── backend_common.h │ ├── backend_input_collector.h │ ├── backend_memory.h │ ├── backend_model.h │ ├── backend_model_instance.h │ ├── backend_output_responder.h │ └── device_memory_tracker.h ├── pyproject.toml └── src/ ├── backend_common.cc ├── backend_input_collector.cc ├── backend_memory.cc ├── backend_model.cc ├── backend_model_instance.cc ├── backend_output_responder.cc ├── device_memory_tracker.cc ├── kernel.cu └── kernel.h ================================================ FILE CONTENTS ================================================ ================================================ FILE: .clang-format ================================================ --- BasedOnStyle: Google IndentWidth: 2 ColumnLimit: 80 ContinuationIndentWidth: 4 UseTab: Never MaxEmptyLinesToKeep: 2 SortIncludes: true CompactNamespaces: true ReflowComments: true DerivePointerAlignment: false PointerAlignment: Left AllowShortIfStatementsOnASingleLine: false AllowShortBlocksOnASingleLine: false AllowShortFunctionsOnASingleLine: Inline AlwaysBreakAfterReturnType: TopLevelDefinitions AlignAfterOpenBracket: AlwaysBreak BreakBeforeBraces: Custom BraceWrapping: AfterClass: false AfterControlStatement: false AfterEnum: false AfterFunction: true AfterNamespace: false AfterStruct: false AfterUnion: false BeforeCatch: true BinPackArguments: true BinPackParameters: true ConstructorInitializerAllOnOneLineOrOnePerLine: false IndentCaseLabels: true ================================================ FILE: .github/workflows/pre-commit.yml ================================================ # Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. name: pre-commit on: pull_request: jobs: pre-commit: runs-on: ubuntu-latest steps: - uses: actions/checkout@v5.0.0 - uses: actions/setup-python@v6.0.0 - uses: pre-commit/action@v3.0.1 ================================================ FILE: .gitignore ================================================ /build /.vscode *.so ================================================ FILE: .pre-commit-config.yaml ================================================ # Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
repos: - repo: https://github.com/PyCQA/isort rev: 5.12.0 hooks: - id: isort additional_dependencies: [toml] - repo: https://github.com/psf/black rev: 23.1.0 hooks: - id: black types_or: [python, cython] - repo: https://github.com/PyCQA/flake8 rev: 7.3.0 hooks: - id: flake8 args: ["--max-line-length=88", "--select=C,E,F,W,B,B950", "--extend-ignore=E203,E501"] types_or: [python, cython] - repo: https://github.com/pre-commit/mirrors-clang-format rev: v16.0.5 hooks: - id: clang-format types_or: [c, c++, cuda, proto, textproto, java] args: ["-fallback-style=none", "-style=file", "-i"] - repo: https://github.com/codespell-project/codespell rev: v2.2.4 hooks: - id: codespell additional_dependencies: [tomli] args: ["--toml", "pyproject.toml"] exclude: (?x)^(.*stemmer.*|.*stop_words.*|^CHANGELOG.md$) # More details about these pre-commit hooks here: # https://pre-commit.com/hooks.html - repo: https://github.com/pre-commit/pre-commit-hooks rev: v6.0.0 hooks: - id: check-case-conflict - id: check-executables-have-shebangs - id: check-merge-conflict - id: check-json - id: check-toml - id: check-yaml - id: check-shebang-scripts-are-executable - id: end-of-file-fixer types_or: [c, c++, cuda, proto, textproto, java, python] - id: mixed-line-ending - id: requirements-txt-fixer - id: trailing-whitespace ================================================ FILE: CMakeLists.txt ================================================ # Copyright 2020-2025, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmake_minimum_required(VERSION 3.31.8) project(tritonbackend LANGUAGES C CXX) # # Options # option(TRITON_ENABLE_GPU "Enable GPU support in backend utilities" ON) option(TRITON_ENABLE_MALI_GPU "Enable Arm MALI GPU support in backend utilities" OFF) option(TRITON_ENABLE_STATS "Include statistics collections in backend utilities" ON) # Default OFF unless backend explicitly request to use provided implementation option(TRITON_ENABLE_MEMORY_TRACKER "Include device memory tracker in backend utilities" OFF) set(TRITON_REPO_ORGANIZATION "https://github.com/triton-inference-server" CACHE STRING "Git repository to pull from") set(TRITON_COMMON_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/common repo") set(TRITON_CORE_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/core repo") # # Setting C++ standard # set(TRITON_MIN_CXX_STANDARD 17 CACHE STRING "The minimum C++ standard whose features are requested to build this target.") if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release) endif() 
if(TRITON_ENABLE_MEMORY_TRACKER AND NOT TRITON_ENABLE_GPU) message(WARNING "TRITON_ENABLE_MEMORY_TRACKER=ON requires TRITON_ENABLE_GPU=ON, TRITON_ENABLE_MEMORY_TRACKER will be disabled") set(TRITON_ENABLE_MEMORY_TRACKER OFF CACHE BOOL "Device memory tracker disabled" FORCE) endif() # # Dependencies # include(FetchContent) FetchContent_Declare( repo-common GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/common.git GIT_TAG ${TRITON_COMMON_REPO_TAG} GIT_SHALLOW ON ) FetchContent_Declare( repo-core GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/core.git GIT_TAG ${TRITON_CORE_REPO_TAG} GIT_SHALLOW ON ) FetchContent_MakeAvailable(repo-common repo-core) # # CUDA # if(${TRITON_ENABLE_GPU}) find_package(CUDAToolkit REQUIRED) set(CMAKE_CUDA_RUNTIME_LIBRARY Shared) message(STATUS "Using CUDA ${CUDAToolkit_VERSION}") if(CUDAToolkit_VERSION VERSION_GREATER "10.1" OR CUDAToolkit_VERSION VERSION_EQUAL "10.1") add_definitions(-DTRITON_ENABLE_CUDA_GRAPH=1) else() message(WARNING "CUDA ${CUDAToolkit_VERSION} does not support CUDA graphs.") endif() endif() # TRITON_ENABLE_GPU # # Backend library containing useful source and utilities # set(SRC_FILES "src/backend_common.cc" "src/backend_input_collector.cc" "src/backend_memory.cc" "src/backend_model_instance.cc" "src/backend_model.cc" "src/backend_output_responder.cc" ) if(${TRITON_ENABLE_GPU}) set(SRC_FILES ${SRC_FILES} "src/kernel.h") if(${TRITON_ENABLE_MEMORY_TRACKER}) set(SRC_FILES ${SRC_FILES} "src/device_memory_tracker.cc") endif() # TRITON_ENABLE_MEMORY_TRACKER endif() # TRITON_ENABLE_GPU add_library( triton-backend-utils ${SRC_FILES} ) if(${TRITON_ENABLE_GPU}) add_library( kernel_library_new src/kernel.cu src/kernel.h ) enable_language(CUDA) include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/define.cuda_architectures.cmake) set_target_properties(kernel_library_new PROPERTIES LANGUAGE CUDA) set_target_properties(kernel_library_new PROPERTIES CUDA_ARCHITECTURES "${CUDA_ARCHITECTURES}") set_target_properties(kernel_library_new PROPERTIES
POSITION_INDEPENDENT_CODE ON) set_target_properties(kernel_library_new PROPERTIES LINKER_LANGUAGE CUDA) target_compile_features(kernel_library_new PUBLIC cxx_std_${TRITON_MIN_CXX_STANDARD}) set_target_properties(kernel_library_new PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON) endif() # TRITON_ENABLE_GPU add_library( TritonBackend::triton-backend-utils ALIAS triton-backend-utils ) target_include_directories( triton-backend-utils PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include> PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src ) if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") message("Using MSVC as compiler, default target on Windows 10. " "If the target system is not Windows 10, please update _WIN32_WINNT " "to corresponding value.") endif() target_compile_features(triton-backend-utils PRIVATE cxx_std_${TRITON_MIN_CXX_STANDARD}) target_compile_options( triton-backend-utils PRIVATE $<$<OR:$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>,$<CXX_COMPILER_ID:GNU>>: -Wall -Wextra -Wno-unused-parameter> $<$<CXX_COMPILER_ID:MSVC>:/Wall /D_WIN32_WINNT=0x0A00 /EHsc /Zc:preprocessor> ) # TRITON_ENABLE_GPU exposed in header so set PUBLIC if(${TRITON_ENABLE_GPU}) target_compile_definitions( triton-backend-utils PUBLIC TRITON_ENABLE_GPU=1 ) if(${TRITON_ENABLE_MEMORY_TRACKER}) target_compile_definitions( triton-backend-utils PUBLIC TRITON_ENABLE_MEMORY_TRACKER=1 ) endif() # TRITON_ENABLE_MEMORY_TRACKER endif() # TRITON_ENABLE_GPU # TRITON_ENABLE_MALI_GPU exposed in header so set PUBLIC if(${TRITON_ENABLE_MALI_GPU}) target_compile_definitions( triton-backend-utils PUBLIC TRITON_ENABLE_MALI_GPU=1 ) endif() # TRITON_ENABLE_MALI_GPU # TRITON_ENABLE_STATS exposed in header so set PUBLIC if(${TRITON_ENABLE_STATS}) target_compile_definitions( triton-backend-utils PUBLIC TRITON_ENABLE_STATS=1 ) endif() # TRITON_ENABLE_STATS set_target_properties( triton-backend-utils PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS TRUE POSITION_INDEPENDENT_CODE ON OUTPUT_NAME tritonbackendutils ) target_link_libraries( triton-backend-utils PUBLIC triton-core-backendapi # from repo-core triton-core-serverapi # from repo-core triton-common-async-work-queue # from
repo-common triton-common-json # from repo-common ) if(${TRITON_ENABLE_GPU}) target_link_libraries( triton-backend-utils PUBLIC CUDA::cudart PRIVATE kernel_library_new ) if(${TRITON_ENABLE_MEMORY_TRACKER}) target_link_libraries( triton-backend-utils PUBLIC CUDA::cupti ) endif() # TRITON_ENABLE_MEMORY_TRACKER endif() # TRITON_ENABLE_GPU # # Install # include(GNUInstallDirs) set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/TritonBackend) install( TARGETS triton-backend-utils EXPORT triton-backend-targets LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} ) if(${TRITON_ENABLE_GPU}) install( TARGETS kernel_library_new EXPORT triton-backend-targets LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} ) endif() # TRITON_ENABLE_GPU install( DIRECTORY include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} ) install( EXPORT triton-backend-targets FILE TritonBackendTargets.cmake NAMESPACE TritonBackend:: DESTINATION ${INSTALL_CONFIGDIR} ) include(CMakePackageConfigHelpers) configure_package_config_file( ${CMAKE_CURRENT_LIST_DIR}/cmake/TritonBackendConfig.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/TritonBackendConfig.cmake INSTALL_DESTINATION ${INSTALL_CONFIGDIR} ) install( FILES ${CMAKE_CURRENT_BINARY_DIR}/TritonBackendConfig.cmake DESTINATION ${INSTALL_CONFIGDIR} ) # # Export from build tree # export( EXPORT triton-backend-targets FILE ${CMAKE_CURRENT_BINARY_DIR}/TritonBackendTargets.cmake NAMESPACE TritonBackend:: ) export(PACKAGE TritonBackend) ================================================ FILE: LICENSE ================================================ Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of NVIDIA CORPORATION nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ================================================ FILE: README.md ================================================ [![License](https://img.shields.io/badge/License-BSD3-lightgrey.svg)](https://opensource.org/licenses/BSD-3-Clause) # Triton Inference Server Backend A Triton *backend* is the implementation that executes a model. A backend can be a wrapper around a deep-learning framework, like PyTorch, TensorFlow, TensorRT or ONNX Runtime. Or a backend can be custom C/C++ logic performing any operation (for example, image pre-processing). This repo contains documentation on Triton backends and also source, scripts and utilities for creating Triton backends. You do not need to use anything provided in this repo to create a Triton backend but you will likely find its contents useful. 
## Frequently Asked Questions Full documentation is included below but these shortcuts can help you get started in the right direction. ### Where can I ask general questions about Triton and Triton backends? Be sure to read all the information below as well as the [general Triton documentation](https://github.com/triton-inference-server/server#triton-inference-server) available in the main [server](https://github.com/triton-inference-server/server) repo. If you don't find your answer there you can ask questions on the main Triton [issues page](https://github.com/triton-inference-server/server/issues). ### Where can I find all the backends that are available for Triton? Anyone can develop a Triton backend, so it isn't possible for us to know about all available backends. But the Triton project does provide a set of supported backends that are tested and updated with each Triton release. **TensorRT**: The TensorRT backend is used to execute TensorRT models. The [tensorrt_backend](https://github.com/triton-inference-server/tensorrt_backend) repo contains the source for the backend. **ONNX Runtime**: The ONNX Runtime backend is used to execute ONNX models. The [onnxruntime_backend](https://github.com/triton-inference-server/onnxruntime_backend) repo contains the documentation and source for the backend. **TensorFlow**: The TensorFlow backend is used to execute TensorFlow models in both GraphDef and SavedModel formats. The same backend is used to execute both TensorFlow 1 and TensorFlow 2 models. The [tensorflow_backend](https://github.com/triton-inference-server/tensorflow_backend) repo contains the documentation and source for the backend. **PyTorch**: The PyTorch backend is used to execute PyTorch models in both TorchScript and PyTorch 2.0 formats. The [pytorch_backend](https://github.com/triton-inference-server/pytorch_backend) repo contains the documentation and source for the backend. 
**OpenVINO**: The OpenVINO backend is used to execute [OpenVINO](https://docs.openvinotoolkit.org/latest/index.html) models. The [openvino_backend](https://github.com/triton-inference-server/openvino_backend) repo contains the documentation and source for the backend. **Python**: The Python backend allows you to write your model logic in Python. For example, you can use this backend to execute pre/post processing code written in Python, or to execute a PyTorch Python script directly (instead of first converting it to TorchScript and then using the PyTorch backend). The [python_backend](https://github.com/triton-inference-server/python_backend) repo contains the documentation and source for the backend. **DALI**: [DALI](https://github.com/NVIDIA/DALI) is a collection of highly optimized building blocks and an execution engine that accelerates the pre-processing of the input data for deep learning applications. The DALI backend allows you to execute your DALI pipeline within Triton. The [dali_backend](https://github.com/triton-inference-server/dali_backend) repo contains the documentation and source for the backend. **FIL**: The FIL ([Forest Inference Library](https://github.com/rapidsai/cuml/tree/branch-21.10/python/cuml/fil)) backend is used to execute a variety of tree-based ML models, including XGBoost models, LightGBM models, Scikit-Learn random forest models, and cuML random forest models. The [fil_backend](https://github.com/triton-inference-server/fil_backend) repo contains the documentation and source for the backend. **TensorRT-LLM**: The TensorRT-LLM backend allows you to serve [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) models with Triton Server. Check out the [Triton TRT-LLM user guide](https://github.com/triton-inference-server/server/blob/main/docs/getting_started/trtllm_user_guide.md) for more information. 
The [tensorrtllm_backend](https://github.com/triton-inference-server/tensorrtllm_backend) repo contains the documentation and source for the backend. **vLLM**: The vLLM backend is designed to run [supported models](https://vllm.readthedocs.io/en/latest/models/supported_models.html) on a [vLLM engine](https://github.com/vllm-project/vllm/blob/main/vllm/engine/async_llm_engine.py). This backend depends on [python_backend](https://github.com/triton-inference-server/python_backend) to load and serve models. The [vllm_backend](https://github.com/triton-inference-server/vllm_backend) repo contains the documentation and source for the backend. **Important Note!** Not all the above backends are supported on every platform supported by Triton. Look at the [Backend-Platform Support Matrix](docs/backend_platform_support_matrix.md) to learn about the same. ### How can I develop my own Triton backend? First you probably want to ask on the main Triton [issues page](https://github.com/triton-inference-server/server/issues) to make sure you are not duplicating a backend that already exists. Then follow the [tutorial](examples/README.md) to learn how to create your first simple Triton backend and incrementally improve it to add more features. You should also read the complete documentation on [Triton backends](#backends). ### Can I add (or remove) a backend to an existing Triton installation? Yes. See [Backend Shared Library](#backend-shared-library) for general information about how the shared library implementing a backend is managed by Triton, and [Triton with Unsupported and Custom Backends](https://github.com/triton-inference-server/server/blob/main/docs/customization_guide/compose.md#triton-with-unsupported-and-custom-backends) for documentation on how to add your backend to the released Triton Docker image. For a standard install the globally available backends are in /opt/tritonserver/backends. ### What about backends developed using the "legacy custom backend" API. 
The legacy custom API is removed from Triton. If you have custom backends that you developed using this older API you must port them to the new [Triton Backend API](#triton-backend-api). ## Backends A Triton *backend* is the implementation that executes a model. A backend can be a wrapper around a deep-learning framework, like PyTorch, TensorFlow, TensorRT, ONNX Runtime or OpenVINO. A backend can also implement any functionality you want as long as it adheres to the [backend API](#triton-backend-api). Triton uses this API to send requests to the backend for execution and the backend uses the API to communicate with Triton. Every model must be associated with a backend. A model's backend is specified in the model's configuration using the `backend` setting. For using TensorRT backend, the value of this setting should be `tensorrt`. Similarly, for using PyTorch, ONNX and TensorFlow backends, the `backend` field should be set to `pytorch`, `onnxruntime` or `tensorflow` respectively. For all other backends, `backend` must be set to the name of the backend. Some backends may also check the `platform` setting for categorizing the model, for example, in TensorFlow backend, `platform` should be set to `tensorflow_savedmodel` or `tensorflow_graphdef` according to the model format. Please refer to the specific backend repository on whether `platform` is used. ### Backend Shared Library Each backend must be implemented as a shared library and the name of the shared library must be *libtriton_\<backend-name\>.so*. For example, if the name of the backend is "mybackend", a model indicates that it uses the backend by setting the model configuration 'backend' setting to "mybackend", and Triton looks for *libtriton_mybackend.so* as the shared library that implements the backend. The [tutorial](examples/README.md) shows examples of how to build your backend logic into the appropriate shared library.
For a model, *M* that specifies backend *B*, Triton searches for the backend shared library in the following places, in this order: * \<model_repository\>/M/\<version_directory\>/libtriton_B.so * \<model_repository\>/M/libtriton_B.so * \<backend_directory\>/B/libtriton_B.so Where \<backend_directory\> is by default /opt/tritonserver/backends. The --backend-directory flag can be used to override the default. Typically you will install your backend into the global backend directory. For example, if using Triton Docker images you can follow the instructions in [Triton with Unsupported and Custom Backends](https://github.com/triton-inference-server/server/blob/main/docs/customization_guide/compose.md#triton-with-unsupported-and-custom-backends). Continuing the example of a backend named "mybackend", you would install into the Triton image as: ``` /opt/ tritonserver/ backends/ mybackend/ libtriton_mybackend.so ... # other files needed by mybackend ``` Starting from 24.01, the default backend shared library name can be changed by providing the `runtime` setting in the model configuration. For example, ``` runtime: "my_backend_shared_library_name.so" ``` A model may choose a specific runtime implementation provided by the backend. ### Triton Backend API A Triton backend must implement the C interface defined in [tritonbackend.h](https://github.com/triton-inference-server/core/tree/main/include/triton/core/tritonbackend.h). The following abstractions are used by the API. #### TRITONBACKEND_Backend A TRITONBACKEND_Backend object represents the backend itself. The same backend object is shared across all models that use the backend. The associated API, like TRITONBACKEND_BackendName, is used to get information about the backend and to associate a user-defined state with the backend. A backend can optionally implement TRITONBACKEND_Initialize and TRITONBACKEND_Finalize to get notification of when the backend object is created and destroyed (for more information see [backend lifecycles](#backend-lifecycles)).
#### TRITONBACKEND_Model A TRITONBACKEND_Model object represents a model. Each model loaded by Triton is associated with a TRITONBACKEND_Model. Each model can use the TRITONBACKEND_ModelBackend API to get the backend object representing the backend that is used by the model. The same model object is shared across all instances of that model. The associated API, like TRITONBACKEND_ModelName, is used to get information about the model and to associate a user-defined state with the model. Most backends will implement TRITONBACKEND_ModelInitialize and TRITONBACKEND_ModelFinalize to initialize the backend for a given model and to manage the user-defined state associated with the model (for more information see [backend lifecycles](#backend-lifecycles)). The backend must take into account threading concerns when implementing TRITONBACKEND_ModelInitialize and TRITONBACKEND_ModelFinalize. Triton will not perform multiple simultaneous calls to these functions for a given model; however, if a backend is used by multiple models Triton may simultaneously call the functions with a different thread for each model. As a result, the backend must be able to handle multiple simultaneous calls to the functions. Best practice for backend implementations is to use only function-local and model-specific user-defined state in these functions, as is shown in the [tutorial](examples/README.md). #### TRITONBACKEND_ModelInstance A TRITONBACKEND_ModelInstance object represents a model *instance*. Triton creates one or more instances of the model based on the *instance_group* settings specified in the model configuration. Each of these instances is associated with a TRITONBACKEND_ModelInstance object. The only function that the backend must implement is TRITONBACKEND_ModelInstanceExecute. The TRITONBACKEND_ModelInstanceExecute function is called by Triton to perform inference/computation on a batch of inference requests. 
Most backends will also implement TRITONBACKEND_ModelInstanceInitialize and TRITONBACKEND_ModelInstanceFinalize to initialize the backend for a given model instance and to manage the user-defined state associated with the model (for more information see [backend lifecycles](#backend-lifecycles)). A backend can optionally implement TRITONBACKEND_ModelInstanceReady. This function is called by the Triton server's ready endpoint to check whether a model instance is ready to handle requests. The function returns `nullptr` (indicating success) if the instance is ready, or a `TRITONSERVER_Error` if the instance is not ready. The backend must take into account threading concerns when implementing TRITONBACKEND_ModelInstanceInitialize, TRITONBACKEND_ModelInstanceFinalize and TRITONBACKEND_ModelInstanceExecute. Triton will not perform multiple simultaneous calls to these functions for a given model instance; however, if a backend is used by a model with multiple instances or by multiple models Triton may simultaneously call the functions with a different thread for each model instance. As a result, the backend must be able to handle multiple simultaneous calls to the functions. Best practice for backend implementations is to use only function-local and model-specific user-defined state in these functions, as is shown in the [tutorial](examples/README.md). #### TRITONBACKEND_Request A TRITONBACKEND_Request object represents an inference request made to the model. The backend takes ownership of the request object(s) in TRITONBACKEND_ModelInstanceExecute and must release each request by calling TRITONBACKEND_RequestRelease. However, the ownership of request object is returned back to Triton in case TRITONBACKEND_ModelInstanceExecute returns an error. See [Inference Requests and Responses](#inference-requests-and-responses) for more information about request lifecycle. 
The Triton Backend API allows the backend to get information about the request as well as the input and request output tensors of the request. Each request input is represented by a TRITONBACKEND_Input object. #### TRITONBACKEND_Response A TRITONBACKEND_Response object represents a response sent by the backend for a specific request. The backend uses the response API to set the name, shape, datatype and tensor values for each output tensor included in the response. The response can indicate either a failed or a successful request. See [Inference Requests and Responses](#inference-requests-and-responses) for more information about request-response lifecycle. #### TRITONBACKEND_BackendAttribute A `TRITONBACKEND_BackendAttribute` allows a backend to set certain attributes which are queried by Triton to inform certain feature support, preferred configurations, and other types of backend-specific behavior. When initializing a backend, Triton will query the `TRITONBACKEND_GetBackendAttribute` function if implemented by the backend. This function is optional to implement, but is generally used to call the related `TRITONBACKEND_BackendAttribute` APIs for setting backend-specific attributes. Some of the relevant BackendAttribute setter APIs are listed below: - `TRITONBACKEND_BackendSetExecutionPolicy` - `TRITONBACKEND_BackendAttributeAddPreferredInstanceGroup` - Defines a priority list of instance groups to prefer for this backend if a model config doesn't explicitly define any instance groups. - `TRITONBACKEND_BackendAttributeSetParallelModelInstanceLoading` - Defines whether the backend can safely handle concurrent calls to `TRITONBACKEND_ModelInstanceInitialize` or not. - Loading model instances in parallel can improve server startup times for large instance counts. - By default, this attribute is set to false, meaning that parallel instance loading is disabled for all backends unless explicitly enabled. 
- The following official backends currently support loading model instances in parallel: - Python - ONNXRuntime The full list of `TRITONBACKEND_BackendAttribute` related APIs are defined in [tritonbackend.h](https://github.com/triton-inference-server/core/blob/main/include/triton/core/tritonbackend.h). ### Backend Lifecycles A backend must carefully manage the lifecycle of the backend itself, the models and model instances that use the backend and the inference requests that execute on the model instances using the backend. #### Backend and Model Backend, model and model instance initialization is triggered when Triton loads a model. * If the model requires a backend that is not already in use by an already loaded model, then: * Triton [loads the shared library](#backend-shared-library) that implements the backend required by the model. * Triton creates the TRITONBACKEND_Backend object that represents the backend. * Triton calls TRITONBACKEND_Initialize if it is implemented in the backend shared library. TRITONBACKEND_Initialize should not return until the backend is completely initialized. If TRITONBACKEND_Initialize returns an error, Triton will report that the model failed to load. * Triton creates the TRITONBACKEND_Model object that represents the model. Triton calls TRITONBACKEND_ModelInitialize if it is implemented in the backend shared library. TRITONBACKEND_ModelInitialize should not return until the backend is completely initialized for the model. If TRITONBACKEND_ModelInitialize returns an error, Triton will show that the model failed to load. * For each model instance specified for the model in the model configuration: * Triton creates the TRITONBACKEND_ModelInstance object that represents the model instance. * Triton calls TRITONBACKEND_ModelInstanceInitialize if it is implemented in the backend shared library. TRITONBACKEND_ModelInstanceInitialize should not return until the backend is completely initialized for the instance. 
If TRITONBACKEND_ModelInstanceInitialize returns an error, Triton will show that the model failed to load. Backend, model and model instance finalization is triggered when Triton unloads a model. * For each model instance: * Triton calls TRITONBACKEND_ModelInstanceFinalize if it is implemented in the backend shared library. TRITONBACKEND_ModelInstanceFinalize should not return until the backend is completely finalized, including stopping any threads created for the model instance and freeing any user-defined state created for the model instance. * Triton destroys the TRITONBACKEND_ModelInstance object that represents the model instance. * Triton calls TRITONBACKEND_ModelFinalize if it is implemented in the backend shared library. TRITONBACKEND_ModelFinalize should not return until the backend is completely finalized, including stopping any threads created for the model and freeing any user-defined state created for the model. * Triton destroys the TRITONBACKEND_Model object that represents the model. * Even if no other loaded model requires the backend, Triton does not finalize and unload the backend until the tritonserver process is exiting. When the tritonserver process exits: * Triton calls TRITONBACKEND_Finalize if it is implemented in the backend shared library. TRITONBACKEND_Finalize should not return until the backend is completely finalized, including stopping any threads created for the backend and freeing any user-defined state created for the backend. * Triton destroys the TRITONBACKEND_Backend object that represents the backend. #### Inference Requests and Responses Triton calls TRITONBACKEND_ModelInstanceExecute to execute inference requests on a model instance. Each call to TRITONBACKEND_ModelInstanceExecute communicates a batch of requests to execute and the instance of the model that should be used to execute those requests. 
The backend should not allow the caller thread to return from TRITONBACKEND_ModelInstanceExecute until that instance is ready to handle another set of requests. Typically this means that the TRITONBACKEND_ModelInstanceExecute function will create responses and release the requests before returning. However, in case TRITONBACKEND_ModelInstanceExecute returns an error, the ownership of requests is transferred back to Triton which will then be responsible for releasing them. Therefore, in the case where TRITONBACKEND_ModelInstanceExecute returns an error, the backend must not retain references to the requests or access them in any way. For more detailed description of request/response lifetimes, study the documentation of TRITONBACKEND_ModelInstanceExecute in [tritonbackend.h](https://github.com/triton-inference-server/core/blob/main/include/triton/core/tritonbackend.h). ##### Single Response Most backends will create a single response for each request. For that kind of backend, executing a single inference request requires the following steps: * Create a response for the request using TRITONBACKEND_ResponseNew. * For each request input tensor use TRITONBACKEND_InputProperties to get shape and datatype of the input as well as the buffer(s) containing the tensor contents. * For each output tensor which the request expects to be returned, use TRITONBACKEND_ResponseOutput to create the output tensor of the required datatype and shape. Use TRITONBACKEND_OutputBuffer to get a pointer to the buffer where the tensor's contents should be written. * Use the inputs to perform the inference computation that produces the requested output tensor contents into the appropriate output buffers. * Optionally set parameters in the response. * Send the response using TRITONBACKEND_ResponseSend. * Release the request using TRITONBACKEND_RequestRelease. 
For a batch of requests the backend should attempt to combine the execution of the individual requests as much as possible to increase performance. ##### Decoupled Responses It is also possible for a backend to send multiple responses for a request. A backend may also send responses out-of-order relative to the order that the request batches are executed. Such backends are called *decoupled* backends. The decoupled backends use one `ResponseFactory` object per request to create and send any number of responses for the request. They must send at least one final response per request (even if it is a flags-only response). You can send a flags-only response with TRITONBACKEND_ResponseFactorySendFlags. For this kind of backend, executing a single inference request typically requires the following steps: 1. For each request input tensor, use TRITONBACKEND_InputProperties to get shape and datatype of the input as well as the buffer(s) containing the tensor contents. 2. Create a `ResponseFactory` object for the request using TRITONBACKEND_ResponseFactoryNew. 3. Create a response from the `ResponseFactory` object using TRITONBACKEND_ResponseNewFromFactory. As long as you have the `ResponseFactory` object, you can continue creating responses. 4. For each output tensor which the request expects to be returned, use TRITONBACKEND_ResponseOutput to create the output tensor of the required datatype and shape. Use TRITONBACKEND_OutputBuffer to get a pointer to the buffer where the tensor's contents should be written. 5. Use the inputs to perform the inference computation that produces the requested output tensor contents into the appropriate output buffers. 6. Optionally set parameters in the response. 7. Send the response using TRITONBACKEND_ResponseSend. 8. Repeat steps 3-7 until there are no more responses. 9. 
Send the last response for a request using either TRITONBACKEND_ResponseSend with a TRITONSERVER_ResponseCompleteFlag or after all responses have been sent for a request using TRITONBACKEND_ResponseFactorySendFlags. This is required for every request. 10. Release the request using TRITONBACKEND_RequestRelease. ###### Special Cases The decoupled API is powerful and supports various special cases: * The model can also send responses out-of-order relative to the order in which it received requests. * The backend can copy out the contents of the input buffer(s) if request is to be released before the contents are completely consumed to generate responses. After copy, the request can be released anytime before exiting TRITONBACKEND_ModelInstanceExecute. The copies and `ResponseFactory` object can be passed to a separate thread in backend. This means main caller thread can exit from TRITONBACKEND_ModelInstanceExecute and the backend can still continue generating responses as long as it holds `ResponseFactory` object. The [repeat example](examples/README.md) demonstrates full power of what can be achieved from decoupled API. Study documentation of these TRITONBACKEND_* functions in [tritonbackend.h](https://github.com/triton-inference-server/core/blob/main/include/triton/core/tritonbackend.h) for more details on these APIs. Read [Decoupled Backends and Models](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/decoupled_models.md) for more details on how to host a decoupled model. ## Build the Backend Utilities The source in this repo builds into a single "backend utilities" library that is useful when building backends. You don't need to use these utilities but they will be helpful for most backends. Typically you don't need to build this repo directly but instead you can include it in the build of your backend as is shown in the CMakeLists.txt files of the [tutorial examples](examples/README.md). To build and install in a local directory use the following commands. 
``` $ mkdir build $ cd build $ cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install .. $ make install ``` The following required Triton repositories will be pulled and used in the build. By default the "main" branch/tag will be used for each repo but the listed CMake argument can be used to override. * triton-inference-server/common: -DTRITON_COMMON_REPO_TAG=[tag] * triton-inference-server/core: -DTRITON_CORE_REPO_TAG=[tag] See the [CMakeLists.txt](CMakeLists.txt) file for other build options. ## Python-based Backends Triton also provides an option to create [Python-based backends](docs/python_based_backends.md). These backends should implement the [`TritonPythonModel` interface](https://github.com/triton-inference-server/python_backend#usage), which could be re-used as a backend by multiple models. While the only required function is `execute`, you may find it helpful to enhance your implementation by adding `initialize`, `finalize`, and any other helper functions. For examples, please refer to the [vLLM backend](https://github.com/triton-inference-server/vllm_backend), which provides a common python script to serve models supported by vLLM. ================================================ FILE: cmake/TritonBackendConfig.cmake.in ================================================ # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. 
# * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. include(CMakeFindDependencyMacro) get_filename_component( TRITONBACKEND_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH ) list(APPEND CMAKE_MODULE_PATH ${TRITONBACKEND_CMAKE_DIR}) if(NOT TARGET TritonBackend::triton-backend-utils) include("${TRITONBACKEND_CMAKE_DIR}/TritonBackendTargets.cmake") endif() set(TRITONBACKEND_LIBRARIES TritonBackend::triton-backend-utils) ================================================ FILE: cmake/define.cuda_architectures.cmake ================================================ # Copyright 2025-2026, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. 
# * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. function(set_cuda_architectures_list) # Check if CUDA_ARCH_LIST environment variable is set if(DEFINED ENV{CUDA_ARCH_LIST}) # Parse CUDA_ARCH_LIST: split by spaces, skip PTX, validate each code set(raw_input "$ENV{CUDA_ARCH_LIST}") string(REGEX REPLACE "PTX" "" raw_input "${raw_input}") string(REPLACE " " ";" arch_list "${raw_input}") set(cuda_arch_result_list "") foreach(arch IN LISTS arch_list) string(STRIP "${arch}" arch) if(arch STREQUAL "") continue() endif() # Normalize: remove dots so 10.0 -> 100, 12.0 -> 120 string(REGEX REPLACE "\\." 
"" arch_num "${arch}") if(NOT arch_num MATCHES "^[0-9]+$") continue() endif() # Code >= 100 (10.x, 11.x, 12.x): use family code, no -real if(arch_num GREATER_EQUAL 100) math(EXPR arch_major "${arch_num} / 10") set(arch_entry "${arch_major}0f") else() set(arch_entry "${arch_num}-real") endif() list(APPEND cuda_arch_result_list "${arch_entry}") endforeach() # If last element is below 100 (has -real), leave it without -real list(LENGTH cuda_arch_result_list result_len) if(result_len GREATER 0) math(EXPR last_index "${result_len} - 1") list(GET cuda_arch_result_list ${last_index} last_entry) string(REGEX REPLACE "-real$" "" last_entry_stripped "${last_entry}") if(NOT last_entry_stripped STREQUAL last_entry) list(REMOVE_AT cuda_arch_result_list ${last_index}) list(APPEND cuda_arch_result_list "${last_entry_stripped}") endif() endif() list(JOIN cuda_arch_result_list ";" cuda_arch_input) set(CUDA_ARCHITECTURES "${cuda_arch_input}" PARENT_SCOPE) message(STATUS "CUDA_ARCH_LIST found, defined CUDA_ARCHITECTURES: $ENV{CUDA_ARCH_LIST}") else() # Set default value if CUDA_ARCH_LIST is not present set(CUDA_ARCHITECTURES "75-real;80-real;86-real;89-real;90-real;100f;120f" PARENT_SCOPE) message(STATUS "CUDA_ARCH_LIST not found, using default values for CUDA_ARCHITECTURES: ${CUDA_ARCHITECTURES}") endif() endfunction() # Call the function to validate and set CUDA_ARCHITECTURES set_cuda_architectures_list() message(STATUS "Defined CUDA_ARCHITECTURES: ${CUDA_ARCHITECTURES}") ================================================ FILE: docs/backend_platform_support_matrix.md ================================================ # Backend-Platform Support Matrix Even though Triton supports inference across various platforms such as cloud, data center, edge and embedded devices on NVIDIA GPUs, x86 and ARM CPU, or AWS Inferentia, it does so by relying on the backends. Note that not all Triton backends support every platform. 
The purpose of this document is to go over what all compute platforms are supported by each of these Triton backends. GPU in this document refers to Nvidia GPU. See [GPU, Driver, and CUDA Support Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html) to learn more about supported GPUs. ## Ubuntu 22.04 The table below describes target device(s) supported for inference by each backend on different platforms. | Backend | x86 | ARM-SBSA | | ------------ | --------- | ------------- | | TensorRT | :heavy_check_mark: GPU
:x: CPU | :heavy_check_mark: GPU
:x: CPU | | ONNX Runtime | :heavy_check_mark: GPU
:heavy_check_mark: CPU | :heavy_check_mark: GPU
:heavy_check_mark: CPU | | TensorFlow | :heavy_check_mark: GPU
:heavy_check_mark: CPU | :heavy_check_mark: GPU
:heavy_check_mark: CPU | | PyTorch | :heavy_check_mark: GPU
:heavy_check_mark: CPU | :heavy_check_mark: GPU
:heavy_check_mark: CPU | | OpenVINO | :x: GPU
:heavy_check_mark: CPU | :x: GPU
:x: CPU | | Python[^1] | :heavy_check_mark: GPU
:heavy_check_mark: CPU | :heavy_check_mark: GPU
:heavy_check_mark: CPU | | DALI | :heavy_check_mark: GPU
:heavy_check_mark: CPU | :heavy_check_mark: GPU[^2]
:heavy_check_mark: CPU[^2] | | FIL | :heavy_check_mark: GPU
:heavy_check_mark: CPU | Unsupported | | TensorRT-LLM | :heavy_check_mark: GPU
:x: CPU | :heavy_check_mark: GPU
:x: CPU | | vLLM | :heavy_check_mark: GPU
:heavy_check_mark: CPU | Unsupported | ## Windows 10 Only TensorRT and ONNX Runtime backends are supported on Windows. | Backend | x86 | ARM-SBSA | | ------------ | --------- | ------------- | | TensorRT | :heavy_check_mark: GPU
:x: CPU | :heavy_check_mark: GPU
:x: CPU | | ONNX Runtime | :heavy_check_mark: GPU
:heavy_check_mark: CPU | :heavy_check_mark: GPU
:heavy_check_mark: CPU | ## Jetson JetPack Following backends are currently supported on Jetson Jetpack: | Backend | Jetson | | ------------ | --------- | | TensorRT | :heavy_check_mark: GPU
:x: CPU | | ONNX Runtime | :heavy_check_mark: GPU
:heavy_check_mark: CPU | | TensorFlow | :heavy_check_mark: GPU
:heavy_check_mark: CPU | | PyTorch | :heavy_check_mark: GPU
:heavy_check_mark: CPU | | Python[^1] | :x: GPU
:heavy_check_mark: CPU | Look at the [Triton Inference Server Support for Jetson and JetPack](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/jetson.md). ## AWS Inferentia Currently, inference on AWS Inferentia is only supported via [python backend](https://github.com/triton-inference-server/python_backend#running-with-inferentia) where the deployed python script invokes AWS Neuron SDK. [^1]: The supported devices for Python Backend are mentioned with respect to Triton. The python script running in Python Backend can be used to execute inference on any hardware if there are available python APIs to do so. AWS inferentia is one such example. Triton core is largely unaware of the fact that inference will run on Inferentia. [^2]: In case of ARM-SBSA, some operations are not fully supported. ================================================ FILE: docs/python_based_backends.md ================================================ # Python-based Backends Python-based backend is a special type of Triton's backends, which does not require any C++ code. However, this type of backends depends on [Python backend](https://github.com/triton-inference-server/python_backend) and requires the following artifacts being present: `libtriton_python.so`, `triton_python_backend_stub`, and `triton_python_backend_utils.py`. ## Usage To implement and use a Python-based backend, make sure to follow these steps. * Implement the [`TritonPythonModel` interface](https://github.com/triton-inference-server/python_backend#usage), which could be re-used as a backend by multiple models. This script should be named `model.py`. * Create a folder for your custom backend under the backends directory (ex: /opt/tritonserver/backends) with the corresponding backend name, containing the `model.py`. For example, for a backend named `my_python_based_backend`, Triton would expect to find the full path `/opt/tritonserver/backends/my_python_based_backend/model.py`. 
* Make sure that `libtriton_python.so`, `triton_python_backend_stub`, and `triton_python_backend_utils.py` are present either under `/opt/tritonserver/backends/my_python_based_backend/` or `/opt/tritonserver/backends/python/`. When both locations contain mentioned artifacts, custom backend's artifacts will take priority over Python backend's artifacts. This way, if custom backends needs to use a different Python version than what is shipped by default, it can easily be done. Please, refer to [customization](#customization) section for more details. * Specify `my_python_based_backend` as a backend in `config.pbtxt` for any model, that should use this backend. ``` ... backend: "my_python_based_backend" ... ``` Since Triton uses Python backend under the hood, it is expected, to see `python` backend entry in server logs, even when Python backend is not explicitly used. ``` I1013 21:52:45.756456 18668 server.cc:619] +-------------------------+-------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------+ | Backend | Path | Config | +-------------------------+-------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------+ | python | /opt/tritonserver/backends/python/libtriton_python.so | {"cmdline":{"auto-complete-config":"true","backend-directory":"/opt/tritonserver/backends","min-compute-capability" | | | | :"6.000000","default-max-batch-size":"4"}} | | my_python_based_backend | /opt/tritonserver/backends/my_python_based_backend/model.py | {"cmdline":{"auto-complete-config":"true","backend-directory":"/opt/tritonserver/backends","min-compute-capability" | | | | :"6.000000","default-max-batch-size":"4"}} | 
+-------------------------+-------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------+ ``` ## Customization Python backend shipped in the NVIDIA GPU Cloud containers uses Python 3.10. Python backend is able to use the libraries that exist in the current Python environment. These libraries can be installed in a virtualenv, conda environment, or the global system Python, and will only be used if the Python version matches the Python version of the Python backend's stub executable (`triton_python_backend_stub`). For example, if you install a set of libraries in a Python 3.9 environment and your Python backend stub is compiled with Python 3.10 these libraries will *NOT* be available. You would need to [compile](https://github.com/triton-inference-server/python_backend#building-custom-python-backend-stub) the stub executable with Python 3.9. If you want to create a tar file that contains all your Python dependencies or you want to use different Python environments for each Python model you need to create a [Custom Execution Environment](https://github.com/triton-inference-server/python_backend#creating-custom-execution-environments) in Python backend. ## Background In some use cases, it is sufficient to implement [`TritonPythonModel` interface](https://github.com/triton-inference-server/python_backend#usage) only once and re-use it across multiple models. As an example, please refer to the [vLLM backend](https://github.com/triton-inference-server/vllm_backend), which provides a common python script to serve models supported by vLLM. Triton Inference Server can handle this special case and treats common `model.py` script as a Python-based backend. 
In the scenario, when model relies on a custom Python-based backend, Triton loads `libtriton_python.so` first; this ensures that Triton knows how to send requests to the backend for execution and the backend knows how to communicate with Triton. Then, Triton makes sure to use common `model.py` from the backend's repository, and not look for it in the model repository. While the only required function is `execute`, it is typically helpful to enhance your implementation by adding `initialize`, `finalize`, and any other helper functions. Users are also encouraged to make use of the [`auto_complete_config`](https://github.com/triton-inference-server/python_backend#auto_complete_config) function to define standardized input and output properties upfront. ================================================ FILE: examples/README.md ================================================ [![License](https://img.shields.io/badge/License-BSD3-lightgrey.svg)](https://opensource.org/licenses/BSD-3-Clause) # Triton Example Backends To learn how to create a Triton backend, and to see a best-practices baseline onto which you can add your own backend logic, follow the [Tutorial](#tutorial). Triton also provides a couple of example backends that demonstrate specific aspects of the backend API not covered by the [Tutorial](#tutorial). * The [*repeat*](https://github.com/triton-inference-server/repeat_backend) backend shows a more advanced example of how a backend can produce multiple responses per request. * The [*stateful*](https://github.com/triton-inference-server/stateful_backend) backend shows an example of how a backend can manage model state tensors on the server-side for the [sequence batcher](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/batcher.md#sequence-batcher) to avoid transferring state tensors between client and server. 
Triton also implements [Implicit State Management](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/architecture.md#implicit-state-management) which allows backends to behave in a stateless manner and leave the state management to Triton. ## Tutorial The [Triton Backend API](../README.md#triton-backend-api) exposes a large number of features. The backend utilities and classes provide many functions commonly used when creating a backend. But to create a functional backend it is not necessary to use most of the backend API or utilities. The tutorial starts with an implementation that shows a *minimal* backend and then adds on recommended and optional enhancements. The tutorial implementations follow best practices for Triton backends and so can be used as templates for your own backend. ### *Minimal* Triton Backend The source code for the *minimal* backend is contained in [minimal.cc](backends/minimal/src/minimal.cc). The source code contains extensive documentation describing the operation of the backend and the use of the [Triton Backend API](../README.md#triton-backend-api) and the backend utilities. Before reading the source code, make sure you understand the concepts associated with Triton backend abstractions [TRITONBACKEND_Backend](../README.md#tritonbackend_backend), [TRITONBACKEND_Model](../README.md#tritonbackend_model), and [TRITONBACKEND_ModelInstance](../README.md#tritonbackend_modelinstance). The *minimal* backend does not do any interesting operation, it simply copies a single input tensor to a single output tensor, but it does demonstrate the basic organization required for a Triton backend. The *minimal* backend is complete but for clarity leaves out some important aspects of writing a full-featured backend that are described in [*Recommended* Triton Backend](#recommended-triton-backend). When creating your own backend use the [*Recommended* Triton Backend](#recommended-triton-backend) as a starting point. 
#### Building the *Minimal* Backend [backends/minimal/CMakeLists.txt](backends/minimal/CMakeLists.txt) shows the recommended build and install script for a Triton backend. To build the *minimal* backend and install in a local directory use the following commands. ``` $ cd backends/minimal $ mkdir build $ cd build $ cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install .. $ make install ``` The following required Triton repositories will be pulled and used in the build. By default the "main" branch/tag will be used for each repo but the listed CMake argument can be used to override. * triton-inference-server/backend: -DTRITON_BACKEND_REPO_TAG=[tag] * triton-inference-server/core: -DTRITON_CORE_REPO_TAG=[tag] * triton-inference-server/common: -DTRITON_COMMON_REPO_TAG=[tag] If you are building on a release branch (or on a development branch that is based off of a release branch), then you must set these cmake arguments to point to that release branch as well. For example, if you are building the r21.10 identity_backend branch then you need to use the following additional cmake flags: ``` -DTRITON_BACKEND_REPO_TAG=r21.10 -DTRITON_CORE_REPO_TAG=r21.10 -DTRITON_COMMON_REPO_TAG=r21.10 ``` After building the install directory will contain a backends/minimal directory that contains the *minimal* backend. Instructions for adding this backend to the Triton server are described in [Backend Shared Library](../README.md#backend-shared-library). #### Running Triton with the *Minimal* Backend After adding the *minimal* backend to the Triton server as described in [Backend Shared Library](../README.md#backend-shared-library), you can run Triton and have it load the models in [model_repos/minimal_models](model_repos/minimal_models). 
Assuming you have created a *tritonserver* Docker image by adding the *minimal* backend to Triton, the following command will run Triton: ``` $ docker run --rm -it --net=host -v/path/to/model_repos/minimal_models:/models tritonserver --model-repository=/models ``` The console output will show similar to the following indicating that the *batching* and *nonbatching* models from the minimal_models repository have loaded correctly. Note that the model repository has two models that both use the *minimal* backend. A backend can support any number of different models. ``` I1215 23:46:00.250284 68 server.cc:589] +-------------+---------+--------+ | Model | Version | Status | +-------------+---------+--------+ | batching | 1 | READY | | nonbatching | 1 | READY | +-------------+---------+--------+ ``` The models are identical except that the *batching* model enabled the [dynamic batcher](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/batcher.md#dynamic-batcher) and supports batch sizes up to 8. Note that the *batching* model sets the [batch delay](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/batcher.md#delayed-batching) to 5 seconds so that the example client described below can demonstrate how the *minimal* backend receives a batch of requests. #### Testing the *Minimal* Backend The [clients](clients) directory holds example clients. The [minimal_client](clients/minimal_client) Python script demonstrates sending a couple of inference requests to the *minimal* backend. With Triton running as described in [Running Triton with the *Minimal* Backend](#running-triton-with-the-minimal-backend), execute the client: ``` $ clients/minimal_client ``` The minimal_client first sends a single request to nonbatching model. From the output you can see that the input value is returned in the output. 
``` ========= Sending request to nonbatching model: IN0 = [1 2 3 4] Response: {'model_name': 'nonbatching', 'model_version': '1', 'outputs': [{'name': 'OUT0', 'datatype': 'INT32', 'shape': [4], 'parameters': {'binary_data_size': 16}}]} OUT0 = [1 2 3 4] ``` In the Triton console output you can see the log message printed by the *minimal* backend that indicates that it received a batch containing the single request. ``` I1221 18:14:12.964836 86 minimal.cc:348] model nonbatching: requests in batch 1 I1221 18:14:12.964857 86 minimal.cc:356] batched IN0 value: [ 1, 2, 3, 4 ] ``` The minimal_client next sends 2 requests at the same time to the batching model. Triton will dynamically batch those requests into a single batch and send that single batch to the *minimal* backend. ``` ========= Sending request to batching model: IN0 = [[10 11 12 13]] Sending request to batching model: IN0 = [[20 21 22 23]] Response: {'model_name': 'batching', 'model_version': '1', 'outputs': [{'name': 'OUT0', 'datatype': 'INT32', 'shape': [1, 4], 'parameters': {'binary_data_size': 16}}]} OUT0 = [[10 11 12 13]] Response: {'model_name': 'batching', 'model_version': '1', 'outputs': [{'name': 'OUT0', 'datatype': 'INT32', 'shape': [1, 4], 'parameters': {'binary_data_size': 16}}]} OUT0 = [[20 21 22 23]] ``` In the Triton console output you can see the log message indicating that the *minimal* backend received a batch containing both requests. ``` I1221 18:14:17.965982 86 minimal.cc:348] model batching: requests in batch 2 I1221 18:14:17.966035 86 minimal.cc:356] batched IN0 value: [ 10, 11, 12, 13, 20, 21, 22, 23 ] ``` ### *Recommended* Triton Backend The source code for the *recommended* backend is contained in [recommended.cc](backends/recommended/src/recommended.cc). The source code contains extensive documentation describing the operation of the backend and the use of the [Triton Backend API](../README.md#triton-backend-api) and the backend utilities. 
Before reading the source code, make sure you understand the concepts associated with Triton backend abstractions [TRITONBACKEND_Backend](../README.md#tritonbackend_backend), [TRITONBACKEND_Model](../README.md#tritonbackend_model), and [TRITONBACKEND_ModelInstance](../README.md#tritonbackend_modelinstance). The *recommended* backend improves the [*minimal* backend](#minimal-triton-backend) to include the following features which should be present in any robust backend implementation: * Enhances the backend to support models with input/output tensors that have datatypes other than INT32. * Enhances the backend to support models with input/output tensors that have any shape. * Uses the Triton backend metric APIs to record statistics about requests executing in the backend. These metrics can then we queried using the Triton [metrics](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/metrics.md) and [statistics](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_statistics.md) APIs. * Additional error checking to ensure that the backend's version is compatible with Triton and that each model's configuration is compatible with the backend. As with the *minimal* backend, the *recommended* backend just returns the input tensor value in the output tensor. Because of the additions described above, the *recommended* backend can serve as a starting point for your backend. #### Building the *Recommended* Backend [backends/recommended/CMakeLists.txt](backends/recommended/CMakeLists.txt) shows the recommended build and install script for a Triton backend. Building and installing is the same as described in [Building the *Minimal* Backend](#building-the-minimal-backend). 
#### Running Triton with the *Recommended* Backend After adding the *recommended* backend to the Triton server as described in [Backend Shared Library](../README.md#backend-shared-library), you can run Triton and have it load the models in [model_repos/recommended_models](model_repos/recommended_models). Assuming you have created a *tritonserver* Docker image by adding the *recommended* backend to Triton, the following command will run Triton: ``` $ docker run --rm -it --net=host -v/path/to/model_repos/recommended_models:/models tritonserver --model-repository=/models ``` The console output will show similar to the following indicating that the *batching* model from the recommended_models repository have loaded correctly. ``` I1215 23:46:00.250284 68 server.cc:589] +-------------+---------+--------+ | Model | Version | Status | +-------------+---------+--------+ | batching | 1 | READY | +-------------+---------+--------+ ``` #### Testing the *Recommended* Backend The [clients](clients) directory holds example clients. The [recommended_client](clients/recommended_client) Python script demonstrates sending a couple of inference requests to the *recommended* backend. With Triton running as described in [Running Triton with the *Recommended* Backend](#running-triton-with-the-recommended-backend), execute the client: ``` $ clients/recommended_client ``` The recommended_client next sends 2 requests at the same time to the batching model, similar to what was done above with the *minimal* backend. Triton will dynamically batch those requests into a single batch and send that single batch to the *recommended* backend. In this model, batching is supported, the datatype is FP32 and the tensor shape is [ -1, 4, 4 ]. ``` ========= Sending request to batching model: input = [[[1. 1.1 1.2 1.3] [2. 2.1 2.2 2.3] [3. 3.1 3.2 3.3] [4. 4.1 4.2 4.3]]] Sending request to batching model: input = [[[10. 10.1 10.2 10.3] [20. 20.1 20.2 20.3] [30. 30.1 30.2 30.3] [40. 
40.1 40.2 40.3]]] Response: {'model_name': 'batching', 'model_version': '1', 'outputs': [{'name': 'OUTPUT', 'datatype': 'FP32', 'shape': [1, 4, 4], 'parameters': {'binary_data_size': 64}}]} OUTPUT = [[[1. 1.1 1.2 1.3] [2. 2.1 2.2 2.3] [3. 3.1 3.2 3.3] [4. 4.1 4.2 4.3]]] Response: {'model_name': 'batching', 'model_version': '1', 'outputs': [{'name': 'OUTPUT', 'datatype': 'FP32', 'shape': [1, 4, 4], 'parameters': {'binary_data_size': 64}}]} OUTPUT = [[[10. 10.1 10.2 10.3] [20. 20.1 20.2 20.3] [30. 30.1 30.2 30.3] [40. 40.1 40.2 40.3]]] ``` In the Triton console output you can see the log message indicating that the *recommended* backend received a batch containing both requests. ``` I1221 18:30:52.223226 127 recommended.cc:604] model batching: requests in batch 2 I1221 18:30:52.223313 127 recommended.cc:613] batched INPUT value: [ 1.000000, 1.100000, 1.200000, 1.300000, 2.000000, 2.100000, 2.200000, 2.300000, 3.000000, 3.100000, 3.200000, 3.300000, 4.000000, 4.100000, 4.200000, 4.300000, 10.000000, 10.100000, 10.200000, 10.300000, 20.000000, 20.100000, 20.200001, 20.299999, 30.000000, 30.100000, 30.200001, 30.299999, 40.000000, 40.099998, 40.200001, 40.299999 ] ``` Because the *recommended* backend can support models that have input/output tensors with any datatype and shape, you can edit the model configuration and the client to experiment with these options. To see the metrics collected for these two inference requests, use the following command to access Triton's metrics endpoint. ``` $ curl localhost:8002/metrics ``` The output will be metric values in Prometheus data format. The [metrics documentation](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/metrics.md) gives a description of these metric values. 
``` # HELP nv_inference_request_success Number of successful inference requests, all batch sizes # TYPE nv_inference_request_success counter nv_inference_request_success{model="batching",version="1"} 2.000000 # HELP nv_inference_request_failure Number of failed inference requests, all batch sizes # TYPE nv_inference_request_failure counter nv_inference_request_failure{model="batching",version="1"} 0.000000 # HELP nv_inference_count Number of inferences performed # TYPE nv_inference_count counter nv_inference_count{model="batching",version="1"} 2.000000 # HELP nv_inference_exec_count Number of model executions performed # TYPE nv_inference_exec_count counter nv_inference_exec_count{model="batching",version="1"} 1.000000 ... ``` You can also see the collected statistics using the [statistics endpoint](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_statistics.md). ``` $ curl localhost:8000/v2/models/batching/stats {"model_stats":[{"name":"batching","version":"1","last_inference":1640111452223,"inference_count":2,"execution_count":1,"inference_stats":{"success":{"count":2,"ns":9997025869},"fail":{"count":0,"ns":0},"queue":{"count":2,"ns":9996491319},"compute_input":{"count":2,"ns":95288},"compute_infer":{"count":2,"ns":232202},"compute_output":{"count":2,"ns":195850}},"batch_stats":[{"batch_size":2,"compute_input":{"count":1,"ns":47644},"compute_infer":{"count":1,"ns":116101},"compute_output":{"count":1,"ns":97925}}]}]} ``` ### *BLS* Triton Backend Please see the [documentation](backends/bls/README.md) of *BLS* Backend. ### Custom Batching When using the dynamic batcher, Triton allows you to set custom batching rules. These rules are added on top of the specified dynamic batcher behavior. To set them, you pass in a library that implements the custom batching API. Two example batching libraries are located in the [batching_strategies directory](batching_strategies). 
For this tutorial, you can use the [volume_batching](batching_strategies/volume_batching) example to set up a maximum byte volume per request. To build the library and install in a local directory, use the following commands: ``` $ cd batch_strategies/volume_batching $ mkdir build $ cd build $ cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install .. $ make install ``` Next, move the library to the desired location. You can pass the file location via the model configuration. If not specified, Triton will look for a library called `batchstrategy.so` in the model version, model, and backend directories, in that order. For ease, we'll pass it via the model configuration. Select a model to use this strategy with. Then, update the model configuration to have these fields: ``` dynamic_batching { } parameters: { key: "TRITON_BATCH_STRATEGY_PATH", value: {string_value: "/path/to/libtriton_volumebatching.so"}} parameters { key: "MAX_BATCH_VOLUME_BYTES" value: {string_value: "96"}} ``` You can update the path to the filepath of your library. You can also update the value of `MAX_BATCH_VOLUME_BYTES` to the maximum volume per batch for your use case. After starting Triton, you should see the scheduler apply a volume constraint per batch on top of default batching behavior for your model. This can be made more visible by setting a [max queue delay](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/batcher.md#delayed-batching) to give the scheduler more time for each batch to be completed. For example, you could set the delay to 100,000 microseconds. ### Enhancements This section describes several optional features that you can add to enhance the capabilities of your backend. 
#### Automatic Model Configuration Generation
For example, the following code can be run during TRITONBACKEND_ModelInitialize to write the location to the log. ``` // Can get location of the model artifacts. Normally we would need // to check the artifact type to make sure it was something we can // handle... but we are just going to log the location so we don't // need the check. We would use the location if we wanted to load // something from the model's repo. TRITONBACKEND_ArtifactType artifact_type; const char* clocation; RETURN_IF_ERROR( TRITONBACKEND_ModelRepository(model, &artifact_type, &clocation)); LOG_MESSAGE( TRITONSERVER_LOG_INFO, (std::string("Repository location: ") + clocation).c_str()); ``` The framework backends (for example, TensorRT, ONNXRuntime, TensorFlow, PyTorch) read the actual model file from the model repository using this API. See those backends for examples of how it can be used. ================================================ FILE: examples/backends/bls/CMakeLists.txt ================================================ # Copyright 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. 
# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmake_minimum_required(VERSION 3.31.8) project(tritonblsbackend LANGUAGES C CXX) # # Options # # Must include options required for this project as well as any # projects included in this one by FetchContent. # # GPU support is disabled by default because BLS backend doesn't # support GPUs. # option(TRITON_ENABLE_GPU "Enable GPU support in backend" OFF) option(TRITON_ENABLE_STATS "Include statistics collections in backend" ON) set(TRITON_REPO_ORGANIZATION "https://github.com/triton-inference-server" CACHE STRING "Git repository to pull from") set(TRITON_BACKEND_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/backend repo") set(TRITON_CORE_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/core repo") set(TRITON_COMMON_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/common repo") # # Setting C++ min standard # set(TRITON_MIN_CXX_STANDARD 17 CACHE STRING "The minimum C++ standard whose features are requested to build this target.") if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release) endif() # # Dependencies # # FetchContent's composability isn't very good. We must include the # transitive closure of all repos so that we can override the tag. 
# include(FetchContent) FetchContent_Declare( repo-common GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/common.git GIT_TAG ${TRITON_COMMON_REPO_TAG} GIT_SHALLOW ON ) FetchContent_Declare( repo-core GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/core.git GIT_TAG ${TRITON_CORE_REPO_TAG} GIT_SHALLOW ON ) FetchContent_Declare( repo-backend GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/backend.git GIT_TAG ${TRITON_BACKEND_REPO_TAG} GIT_SHALLOW ON ) FetchContent_MakeAvailable(repo-common repo-core repo-backend) # # Shared library implementing the Triton Backend API # configure_file(src/libtriton_bls.ldscript libtriton_bls.ldscript COPYONLY) add_library( triton-bls-backend SHARED src/backend.cc src/bls.h src/bls.cc src/bls_utils.h src/bls_utils.cc ) add_library( TritonBLSBackend::triton-bls-backend ALIAS triton-bls-backend ) target_include_directories( triton-bls-backend PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src ) target_compile_features(triton-bls-backend PRIVATE cxx_std_${TRITON_MIN_CXX_STANDARD}) target_compile_options( triton-bls-backend PRIVATE $<$,$,$>: -Wall -Wextra -Wno-unused-parameter -Wno-error=maybe-uninitialized> ) target_link_libraries( triton-bls-backend PRIVATE triton-core-serverstub # from repo-core triton-backend-utils # from repo-backend ) set_target_properties( triton-bls-backend PROPERTIES POSITION_INDEPENDENT_CODE ON OUTPUT_NAME triton_bls LINK_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libtriton_bls.ldscript LINK_FLAGS "-Wl,--version-script libtriton_bls.ldscript" ) # # Install # include(GNUInstallDirs) set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/TritonBLSBackend) install( TARGETS triton-bls-backend EXPORT triton-bls-backend-targets LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/bls ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/bls ) install( EXPORT triton-bls-backend-targets FILE TritonBLSBackendTargets.cmake NAMESPACE TritonBLSBackend:: DESTINATION ${INSTALL_CONFIGDIR} ) include(CMakePackageConfigHelpers) configure_package_config_file( 
${CMAKE_CURRENT_LIST_DIR}/cmake/TritonBLSBackendConfig.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/TritonBLSBackendConfig.cmake INSTALL_DESTINATION ${INSTALL_CONFIGDIR} ) install( FILES ${CMAKE_CURRENT_BINARY_DIR}/TritonBLSBackendConfig.cmake DESTINATION ${INSTALL_CONFIGDIR} ) # # Export from build tree # export( EXPORT triton-bls-backend-targets FILE ${CMAKE_CURRENT_BINARY_DIR}/TritonBLSBackendTargets.cmake NAMESPACE TritonBLSBackend:: ) export(PACKAGE TritonBLSBackend) ================================================ FILE: examples/backends/bls/README.md ================================================ # *BLS* Triton Backend The [*BLS*](../bls) backend demonstrates using in-process C-API to execute inferences within the backend. This backend serves as an example to backend developers for implementing their own custom pipeline in C++. For Python use cases, please refer to [Business Logic Scripting](https://github.com/triton-inference-server/python_backend/blob/main/README.md#business-logic-scripting) section in Python backend. The source code for the *bls* backend is contained in [src](./src). * [backend.cc](./src/backend.cc) contains the main backend implementation. The content of this file is not BLS specific. It only includes the required Triton backend functions that is standard for any backend implementation. The BLS logic is set off in the `TRITONBACKEND_ModelInstanceExecute` with lines `bls_executor.Execute(requests[r], &responses[r]);`. * [bls.h](./src/bls.h) is where the BLS (class `BLSExecutor`) of this example is located. You can refer to this file to see how to interact with Triton in-process C-API to build the custom execution pipeline. * [bls_utils.h](./src/bls_utils.h) is where all the utilities that are not BLS dependent are located. 
The source code contains extensive documentation describing the operation of the backend and the use of the [Triton Backend API](../../../README.md#triton-backend-api) and the [Triton Server API](https://github.com/triton-inference-server/server/blob/main/docs/customization_guide/inprocess_c_api.md). Before reading the source code, make sure you understand the concepts associated with Triton backend abstractions [TRITONBACKEND_Backend](../../../README.md#tritonbackend_backend), [TRITONBACKEND_Model](../../../README.md#tritonbackend_model), and [TRITONBACKEND_ModelInstance](../../../README.md#tritonbackend_modelinstance). The *bls* backend will send two requests on the 'addsub_python' and 'addsub_onnx' models. After the inference requests are completed, this backend will extract OUTPUT0 from the 'addsub_python' and OUTPUT1 from the 'addsub_onnx' model to construct the final inference response object using these tensors. There are some self-imposed limitations that were made for the simplicity of this example: 1. This backend does not support batching. 2. This backend does not support decoupled models. 3. This backend does not support GPU tensors. 4. The model configuration should be strictly set as the comments described in [backend.cc](./src/backend.cc). You can implement your custom backend that is not limited to the limitations mentioned above. ## Building the *BLS* Backend [backends/bls/CMakeLists.txt](CMakeLists.txt) shows the recommended build and install script for a Triton backend. Building and installing is the same as described in [Building the *Minimal* Backend](../../README.md#building-the-minimal-backend). ## Running Triton with the *BLS* Backend After adding the *bls* backend to the Triton server as described in [Backend Shared Library](../../../README.md#backend-shared-library), you can run Triton and have it load the models in [model_repos/bls_models](../../model_repos/bls_models). 
Assuming you have created a *tritonserver* Docker image by adding the *bls* backend to Triton, the following command will run Triton: ``` $ docker run --rm -it --net=host -v/path/to/model_repos/bls_models:/models tritonserver --model-repository=/models ``` The console output will show similar to the following indicating that the *bls_fp32*, *addsub_python* and *addsub_onnx* models from the bls_models repository have loaded correctly. ``` I0616 09:34:47.767433 19214 server.cc:629] +---------------+---------+--------+ | Model | Version | Status | +---------------+---------+--------+ | addsub_python | 1 | READY | | addsub_onnx | 1 | READY | | bls_fp32 | 1 | READY | +---------------+---------+--------+ ``` ## Testing the *BLS* Backend The [clients](../../clients) directory holds example clients. The [bls_client](../../clients/bls_client) Python script demonstrates sending an inference requests to the *bls* backend. With Triton running as described in [Running Triton with the *BLS* Backend](#running-triton-with-the-bls-backend), execute the client: ``` $ clients/bls_client ``` You should see an output similar to the output below: ``` INPUT0 ([0.42935285 0.51512766 0.43625894 ... 0.6670954 0.17747518 0.7976901 ]) + INPUT1 ([6.7752063e-01 2.4223252e-01 6.7743927e-01 ... 4.1531715e-01 2.5451833e-01 7.9097062e-01]) = OUTPUT0 ([1.1068735 0.75736016 1.1136982 ... 1.0824126 0.4319935 1.5886607 ]) INPUT0 ([0.42935285 0.51512766 0.43625894 ... 0.6670954 0.17747518 0.7976901 ]) - INPUT1 ([6.7752063e-01 2.4223252e-01 6.7743927e-01 ... 4.1531715e-01 2.5451833e-01 7.9097062e-01]) = OUTPUT1 ([-0.24816778 0.27289516 -0.24118033 ... 0.25177827 -0.07704315 0.00671947]) PASS ``` ================================================ FILE: examples/backends/bls/cmake/TritonBLSBackendConfig.cmake.in ================================================ # Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
include(CMakeFindDependencyMacro) get_filename_component( TRITONBLSBACKEND_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH ) list(APPEND CMAKE_MODULE_PATH ${TRITONBLSBACKEND_CMAKE_DIR}) if(NOT TARGET TritonBLSBackend::triton-bls-backend) include("${TRITONBLSBACKEND_CMAKE_DIR}/TritonBLSBackendTargets.cmake") endif() set(TRITONBLSBACKEND_LIBRARIES TritonBLSBackend::triton-bls-backend) ================================================ FILE: examples/backends/bls/src/backend.cc ================================================ // Copyright 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "bls.h" #include "triton/backend/backend_model.h" #include "triton/backend/backend_model_instance.h" // // Backend that demonstrates using in-process C-API to execute inferences // within the backend. // // Two particular models, 'addsub_python' and 'addsub_onnx', must be loaded on // the server for a successful inference execution on this backend. // // The model configuration should be set as follows in order to be in line with // the 'addsub_python' and 'addsub_onnx' models. This backend does not support // batching. These limitations are only for this specific backend. You can // implement your custom BLS backend with less limitations. // // Model Configuration: // - Input 'INPUT0' must have shape [16] and datatype must be TYPE_FP32. // // - Input 'INPUT1' must have shape [16] and datatype must be TYPE_FP32. // // - For each response, output 'OUTPUT0' must have shape [16] and // datatype TYPE_FP32. // // - For each response, output 'OUTPUT1' must have shape [16] and // datatype TYPE_FP32. // // This backend will send two requests on the 'addsub_python' and 'addsub_onnx' // models. After the inference requests are completed, this backend // will extract OUTPUT0 from the 'addsub_python' and OUTPUT1 from the // 'addsub_onnx' model to construct the final inference response object using // these tensors. 
namespace triton { namespace backend { namespace bls { // // ModelState // // State associated with a model that is using this backend. An object // of this class is created and associated with each // TRITONBACKEND_Model. // class ModelState : public BackendModel { public: static TRITONSERVER_Error* Create( TRITONBACKEND_Model* triton_model, ModelState** state); virtual ~ModelState() = default; // Validate that model configuration is supported by this backend. TRITONSERVER_Error* ValidateModelConfig(); private: ModelState(TRITONBACKEND_Model* triton_model) : BackendModel(triton_model) {} }; TRITONSERVER_Error* ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state) { try { *state = new ModelState(triton_model); } catch (const BackendModelException& ex) { RETURN_ERROR_IF_TRUE( ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, std::string("unexpected nullptr in BackendModelException")); RETURN_IF_ERROR(ex.err_); } return nullptr; // success } TRITONSERVER_Error* ModelState::ValidateModelConfig() { // We have the json DOM for the model configuration... common::TritonJson::WriteBuffer buffer; RETURN_IF_ERROR(model_config_.PrettyWrite(&buffer)); LOG_MESSAGE( TRITONSERVER_LOG_INFO, (std::string("model configuration:\n") + buffer.Contents()).c_str()); // max_batch_size must be 0 because this backend does not support // batching int64_t max_batch_size; RETURN_IF_ERROR(model_config_.MemberAsInt("max_batch_size", &max_batch_size)); RETURN_ERROR_IF_FALSE( max_batch_size == 0, TRITONSERVER_ERROR_INVALID_ARG, std::string("bls backend only supports models with max_batch_size == 0")); common::TritonJson::Value inputs, outputs; RETURN_IF_ERROR(model_config_.MemberAsArray("input", &inputs)); RETURN_IF_ERROR(model_config_.MemberAsArray("output", &outputs)); // There must be 2 inputs and 2 outputs. 
RETURN_ERROR_IF_FALSE( inputs.ArraySize() == 2, TRITONSERVER_ERROR_INVALID_ARG, std::string("expected 2 inputs, got ") + std::to_string(inputs.ArraySize())); RETURN_ERROR_IF_FALSE( outputs.ArraySize() == 2, TRITONSERVER_ERROR_INVALID_ARG, std::string("expected 2 outputs, got ") + std::to_string(outputs.ArraySize())); // Here we rely on the model configuration listing the inputs and // outputs in a specific order, which we shouldn't really require... common::TritonJson::Value input0, input1, output0, output1; RETURN_IF_ERROR(inputs.IndexAsObject(0, &input0)); RETURN_IF_ERROR(inputs.IndexAsObject(1, &input1)); RETURN_IF_ERROR(outputs.IndexAsObject(0, &output0)); RETURN_IF_ERROR(outputs.IndexAsObject(1, &output1)); // Check tensor names std::string in0_name, in1_name, out0_name, out1_name; RETURN_IF_ERROR(input0.MemberAsString("name", &in0_name)); RETURN_IF_ERROR(input1.MemberAsString("name", &in1_name)); RETURN_IF_ERROR(output0.MemberAsString("name", &out0_name)); RETURN_IF_ERROR(output1.MemberAsString("name", &out1_name)); RETURN_ERROR_IF_FALSE( in0_name == "INPUT0", TRITONSERVER_ERROR_INVALID_ARG, std::string("expected first input tensor name to be INPUT0, got ") + in0_name); RETURN_ERROR_IF_FALSE( in1_name == "INPUT1", TRITONSERVER_ERROR_INVALID_ARG, std::string("expected second input tensor name to be INPUT1, got ") + in1_name); RETURN_ERROR_IF_FALSE( out0_name == "OUTPUT0", TRITONSERVER_ERROR_INVALID_ARG, std::string("expected first output tensor name to be OUTPUT0, got ") + out0_name); RETURN_ERROR_IF_FALSE( out1_name == "OUTPUT1", TRITONSERVER_ERROR_INVALID_ARG, std::string("expected second output tensor name to be OUTPUT1, got ") + out1_name); // Check shapes std::vector in0_shape, in1_shape, out0_shape, out1_shape; RETURN_IF_ERROR(backend::ParseShape(input0, "dims", &in0_shape)); RETURN_IF_ERROR(backend::ParseShape(input1, "dims", &in1_shape)); RETURN_IF_ERROR(backend::ParseShape(output0, "dims", &out0_shape)); RETURN_IF_ERROR(backend::ParseShape(output1, 
"dims", &out1_shape)); RETURN_ERROR_IF_FALSE( in0_shape.size() == 1, TRITONSERVER_ERROR_INVALID_ARG, std::string("expected INPUT0 shape to have one dimension, got ") + backend::ShapeToString(in0_shape)); RETURN_ERROR_IF_FALSE( in1_shape.size() == 1, TRITONSERVER_ERROR_INVALID_ARG, std::string("expected INPUT1 shape to have one dimension, got ") + backend::ShapeToString(in1_shape)); RETURN_ERROR_IF_FALSE( out0_shape.size() == 1, TRITONSERVER_ERROR_INVALID_ARG, std::string("expected OUTPUT0 shape to have one dimension, got ") + backend::ShapeToString(out0_shape)); RETURN_ERROR_IF_FALSE( out1_shape.size() == 1, TRITONSERVER_ERROR_INVALID_ARG, std::string("expected OUTPUT1 shape to have one dimension, got ") + backend::ShapeToString(out1_shape)); // Check datatypes std::string in0_dtype, in1_dtype, out0_dtype, out1_dtype; RETURN_IF_ERROR(input0.MemberAsString("data_type", &in0_dtype)); RETURN_IF_ERROR(input1.MemberAsString("data_type", &in1_dtype)); RETURN_IF_ERROR(output0.MemberAsString("data_type", &out0_dtype)); RETURN_IF_ERROR(output1.MemberAsString("data_type", &out1_dtype)); RETURN_ERROR_IF_FALSE( in0_dtype == "TYPE_FP32", TRITONSERVER_ERROR_INVALID_ARG, std::string("expected INPUT0 datatype to be TYPE_FP32, got ") + in0_dtype); RETURN_ERROR_IF_FALSE( in1_dtype == "TYPE_FP32", TRITONSERVER_ERROR_INVALID_ARG, std::string("expected INPUT1 datatype to be TYPE_FP32, got ") + in1_dtype); RETURN_ERROR_IF_FALSE( out0_dtype == "TYPE_FP32", TRITONSERVER_ERROR_INVALID_ARG, std::string("expected OUTPUT0 datatype to be TYPE_FP32, got ") + out0_dtype); RETURN_ERROR_IF_FALSE( out1_dtype == "TYPE_FP32", TRITONSERVER_ERROR_INVALID_ARG, std::string("expected OUTPUT1 datatype to be TYPE_FP32, got ") + out1_dtype); return nullptr; // success } // // ModelInstanceState // // State associated with a model instance. An object of this class is // created and associated with each TRITONBACKEND_ModelInstance. 
// class ModelInstanceState : public BackendModelInstance { public: static TRITONSERVER_Error* Create( ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance, ModelInstanceState** state); virtual ~ModelInstanceState() = default; void ProcessRequests( TRITONBACKEND_Request** requests, const uint32_t request_count); private: ModelInstanceState( ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance) : BackendModelInstance(model_state, triton_model_instance) { } }; TRITONSERVER_Error* ModelInstanceState::Create( ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance, ModelInstanceState** state) { try { *state = new ModelInstanceState(model_state, triton_model_instance); } catch (const BackendModelInstanceException& ex) { RETURN_ERROR_IF_TRUE( ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, std::string("unexpected nullptr in BackendModelInstanceException")); RETURN_IF_ERROR(ex.err_); } return nullptr; // success } void ModelInstanceState::ProcessRequests( TRITONBACKEND_Request** requests, const uint32_t request_count) { uint64_t exec_start_ns = 0; SET_TIMESTAMP(exec_start_ns); for (size_t i = 0; i < request_count; i++) { // If we get a nullptr request then something is badly wrong. Fail // and release all requests. if (requests[i] == nullptr) { RequestsRespondWithError( requests, request_count, TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INTERNAL, std::string( "null request given to BLS backend for '" + Name() + "'") .c_str())); return; } } // At this point we accept ownership of 'requests', which means that // even if something goes wrong we must still return success from // this function. If something does go wrong in processing a // particular request then we send an error response just for the // specific request. 
std::vector responses; responses.reserve(request_count); for (size_t i = 0; i < request_count; i++) { TRITONBACKEND_Response* response; auto err = TRITONBACKEND_ResponseNew(&response, requests[i]); if (err == nullptr) { responses.emplace_back(response); } else { responses.emplace_back(nullptr); LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Fail to create response"); TRITONSERVER_ErrorDelete(err); } } ModelState* model_state = reinterpret_cast(Model()); // The way we collect these batch timestamps is not entirely // accurate. Normally, in a performant backend you would execute all // the requests at the same time, and so there would be a single // compute-start / compute-end time-range. But here we execute each // request separately so there is no single range. As a result we // just show the entire execute time as being the compute time as // well. uint64_t compute_start_ns = 0; SET_TIMESTAMP(compute_start_ns); // Create a BLSExecutor object. To separate from standard backend // implementation, the BLS logic is placed inside class BLSExecutor. BLSExecutor bls_executor(model_state->TritonServer()); for (size_t r = 0; r < request_count; r++) { bls_executor.Execute(requests[r], &responses[r]); } uint64_t compute_end_ns = 0; SET_TIMESTAMP(compute_end_ns); uint64_t exec_end_ns = 0; SET_TIMESTAMP(exec_end_ns); // Send all the responses that haven't already been sent because of // an earlier error. Note that the responses are not set to nullptr // here as we need that indication below to determine if the request // we successful or not. for (auto& response : responses) { if (response != nullptr) { LOG_IF_ERROR( TRITONBACKEND_ResponseSend( response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, nullptr), "failed to send BLS backend response"); } } // Report statistics for each request. 
for (uint32_t r = 0; r < request_count; ++r) { auto& request = requests[r]; LOG_IF_ERROR( TRITONBACKEND_ModelInstanceReportStatistics( TritonModelInstance(), request, (responses[r] != nullptr) /* success */, exec_start_ns, compute_start_ns, compute_end_ns, exec_end_ns), "failed reporting request statistics"); LOG_IF_ERROR( TRITONBACKEND_RequestRelease(request, TRITONSERVER_REQUEST_RELEASE_ALL), "failed releasing request"); } // Report the entire batch statistics. LOG_IF_ERROR( TRITONBACKEND_ModelInstanceReportBatchStatistics( TritonModelInstance(), 1 /*total_batch_size*/, exec_start_ns, compute_start_ns, compute_end_ns, exec_end_ns), "failed reporting batch request statistics"); LOG_MESSAGE( TRITONSERVER_LOG_VERBOSE, (std::string("TRITONBACKEND_ModelExecute: model ") + Name() + " released " + std::to_string(request_count) + " requests") .c_str()); } ///////////// extern "C" { // Implementing TRITONBACKEND_ModelInitialize is optional. The backend // should initialize any state that is intended to be shared across // all instances of the model. TRITONSERVER_Error* TRITONBACKEND_ModelInitialize(TRITONBACKEND_Model* model) { const char* cname; RETURN_IF_ERROR(TRITONBACKEND_ModelName(model, &cname)); std::string name(cname); uint64_t version; RETURN_IF_ERROR(TRITONBACKEND_ModelVersion(model, &version)); LOG_MESSAGE( TRITONSERVER_LOG_INFO, (std::string("TRITONBACKEND_ModelInitialize: ") + name + " (version " + std::to_string(version) + ")") .c_str()); // With each model we create a ModelState object and associate it // with the TRITONBACKEND_Model. ModelState* model_state; RETURN_IF_ERROR(ModelState::Create(model, &model_state)); RETURN_IF_ERROR( TRITONBACKEND_ModelSetState(model, reinterpret_cast(model_state))); // One of the primary things to do in ModelInitialize is to examine // the model configuration to ensure that it is something that this // backend can support. If not, returning an error from this // function will prevent the model from loading. 
RETURN_IF_ERROR(model_state->ValidateModelConfig()); return nullptr; // success } // Implementing TRITONBACKEND_ModelFinalize is optional unless state // is set using TRITONBACKEND_ModelSetState. The backend must free // this state and perform any other cleanup. TRITONSERVER_Error* TRITONBACKEND_ModelFinalize(TRITONBACKEND_Model* model) { void* vstate; RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vstate)); ModelState* model_state = reinterpret_cast(vstate); LOG_MESSAGE( TRITONSERVER_LOG_INFO, "TRITONBACKEND_ModelFinalize: delete model state"); delete model_state; return nullptr; // success } // Implementing TRITONBACKEND_ModelInstanceInitialize is optional. The // backend should initialize any state that is required for a model // instance. TRITONSERVER_Error* TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance) { const char* cname; RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceName(instance, &cname)); std::string name(cname); int32_t device_id; RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceDeviceId(instance, &device_id)); TRITONSERVER_InstanceGroupKind kind; RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceKind(instance, &kind)); LOG_MESSAGE( TRITONSERVER_LOG_INFO, (std::string("TRITONBACKEND_ModelInstanceInitialize: ") + name + " (" + TRITONSERVER_InstanceGroupKindString(kind) + " device " + std::to_string(device_id) + ")") .c_str()); // The instance can access the corresponding model as well... here // we get the model and from that get the model's state. TRITONBACKEND_Model* model; RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceModel(instance, &model)); void* vmodelstate; RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vmodelstate)); ModelState* model_state = reinterpret_cast(vmodelstate); // With each instance we create a ModelInstanceState object and // associate it with the TRITONBACKEND_ModelInstance. 
ModelInstanceState* instance_state; RETURN_IF_ERROR( ModelInstanceState::Create(model_state, instance, &instance_state)); RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceSetState( instance, reinterpret_cast(instance_state))); LOG_MESSAGE( TRITONSERVER_LOG_VERBOSE, (std::string("TRITONBACKEND_ModelInstanceInitialize: instance " "initialization successful ") + name + " (device " + std::to_string(device_id) + ")") .c_str()); return nullptr; // success } // Implementing TRITONBACKEND_ModelInstanceFinalize is optional unless // state is set using TRITONBACKEND_ModelInstanceSetState. The backend // must free this state and perform any other cleanup. TRITONSERVER_Error* TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance) { void* vstate; RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(instance, &vstate)); ModelInstanceState* instance_state = reinterpret_cast(vstate); LOG_MESSAGE( TRITONSERVER_LOG_INFO, "TRITONBACKEND_ModelInstanceFinalize: delete instance state"); delete instance_state; return nullptr; // success } // Implementing TRITONBACKEND_ModelInstanceExecute is required. TRITONSERVER_Error* TRITONBACKEND_ModelInstanceExecute( TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests, const uint32_t request_count) { // Triton will not call this function simultaneously for the same // 'instance'. But since this backend could be used by multiple // instances from multiple models the implementation needs to handle // multiple calls to this function at the same time (with different // 'instance' objects). Suggested practice for this is to use only // function-local and model-instance-specific state (obtained from // 'instance'), which is what we do here. 
ModelInstanceState* instance_state; RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState( instance, reinterpret_cast(&instance_state))); ModelState* model_state = reinterpret_cast(instance_state->Model()); LOG_MESSAGE( TRITONSERVER_LOG_VERBOSE, (std::string("model ") + model_state->Name() + ", instance " + instance_state->Name() + ", executing " + std::to_string(request_count) + " requests") .c_str()); instance_state->ProcessRequests(requests, request_count); return nullptr; // success } } // extern "C" }}} // namespace triton::backend::bls ================================================ FILE: examples/backends/bls/src/bls.cc ================================================ // Copyright 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "bls.h" namespace triton { namespace backend { namespace bls { BLSExecutor::BLSExecutor(TRITONSERVER_Server* server) : server_(server), model_executor_(server) { } TRITONSERVER_Error* BLSExecutor::PrepareInferenceRequest( TRITONBACKEND_Request* bls_request, TRITONSERVER_InferenceRequest** irequest, const std::string model_name) { // Get request_id, correlation_id, and flags from the current request // for preparing a new inference request that we will send to 'addsub_python' // or 'addsub_onnx' model later. const char* request_id; uint64_t correlation_id; uint32_t flags; RETURN_IF_ERROR(TRITONBACKEND_RequestId(bls_request, &request_id)); RETURN_IF_ERROR( TRITONBACKEND_RequestCorrelationId(bls_request, &correlation_id)); RETURN_IF_ERROR(TRITONBACKEND_RequestFlags(bls_request, &flags)); // Create an inference request object. The inference request object // is where we set the name of the model we want to use for // inference and the input tensors. RETURN_IF_ERROR(TRITONSERVER_InferenceRequestNew( irequest, server_, model_name.c_str(), -1 /* model_version */)); // Set request_id, correlation_id, and flags for the new request. 
RETURN_IF_ERROR(TRITONSERVER_InferenceRequestSetId(*irequest, request_id)); RETURN_IF_ERROR( TRITONSERVER_InferenceRequestSetCorrelationId(*irequest, correlation_id)); RETURN_IF_ERROR(TRITONSERVER_InferenceRequestSetFlags(*irequest, flags)); RETURN_IF_ERROR(TRITONSERVER_InferenceRequestSetReleaseCallback( *irequest, InferRequestComplete, nullptr /* request_release_userp */)); return nullptr; // success } TRITONSERVER_Error* BLSExecutor::PrepareInferenceInput( TRITONBACKEND_Request* bls_request, TRITONSERVER_InferenceRequest* irequest) { // Get the properties of the two inputs from the current request. // Then, add the two input tensors and append the input data to the new // request. uint32_t input_count; RETURN_IF_ERROR(TRITONBACKEND_RequestInputCount(bls_request, &input_count)); TRITONBACKEND_Input* input; const char* name; TRITONSERVER_DataType datatype; const int64_t* shape; uint32_t dims_count; size_t data_byte_size; TRITONSERVER_MemoryType data_memory_type; int64_t data_memory_id; const char* data_buffer; for (size_t count = 0; count < input_count; count++) { RETURN_IF_ERROR(TRITONBACKEND_RequestInputByIndex( bls_request, count /* index */, &input)); RETURN_IF_ERROR(TRITONBACKEND_InputProperties( input, &name, &datatype, &shape, &dims_count, nullptr, nullptr)); RETURN_IF_ERROR(TRITONBACKEND_InputBuffer( input, 0 /* idx */, reinterpret_cast(&data_buffer), &data_byte_size, &data_memory_type, &data_memory_id)); RETURN_IF_ERROR(TRITONSERVER_InferenceRequestAddInput( irequest, name, datatype, shape, dims_count)); RETURN_IF_ERROR(TRITONSERVER_InferenceRequestAppendInputData( irequest, name, &data_buffer[0], data_byte_size, data_memory_type, data_memory_id)); } return nullptr; // success } TRITONSERVER_Error* BLSExecutor::PrepareInferenceOutput( TRITONBACKEND_Request* bls_request, TRITONSERVER_InferenceRequest* irequest) { // Indicate the output tensors to be calculated and returned // for the inference request. 
uint32_t output_count; RETURN_IF_ERROR(TRITONBACKEND_RequestOutputCount(bls_request, &output_count)); const char* output_name; for (size_t count = 0; count < output_count; count++) { RETURN_IF_ERROR(TRITONBACKEND_RequestOutputName( bls_request, count /* index */, &output_name)); RETURN_IF_ERROR( TRITONSERVER_InferenceRequestAddRequestedOutput(irequest, output_name)); } return nullptr; // success } void BLSExecutor::Execute( TRITONBACKEND_Request* bls_request, TRITONBACKEND_Response** response) { // The names of the models that we will send internal requests on. std::vector model_names = {"addsub_python", "addsub_onnx"}; // Check if both models are valid before executing request. try { for (size_t i = 0; i < 2; i++) { // Check if the model is ready. bool is_ready = false; THROW_IF_TRITON_ERROR(TRITONSERVER_ServerModelIsReady( server_, model_names[i].c_str(), -1 /* model_version */, &is_ready)); if (!is_ready) { throw BLSBackendException( (std::string("Failed to execute the inference request. Model '") + model_names[i].c_str() + "' is not ready.") .c_str()); } // For simplicity, decoupled API is not supported in this BLS backend. You // can implement your own backend that supports decoupled models. uint32_t txn_flags; THROW_IF_TRITON_ERROR(TRITONSERVER_ServerModelTransactionProperties( server_, model_names[i].c_str(), -1 /* model_version */, &txn_flags, nullptr /* voidp */)); if ((txn_flags & TRITONSERVER_TXN_DECOUPLED) != 0) { throw BLSBackendException( std::string("Model '") + model_names[i].c_str() + "' is using the decoupled. This BLS Backend doesn't support models " "using the decoupled transaction policy."); } } } catch (const BLSBackendException& bls_exception) { LOG_MESSAGE(TRITONSERVER_LOG_ERROR, bls_exception.what()); RESPOND_AND_SET_NULL_IF_ERROR( response, TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INTERNAL, "Failed to send inference requests")); return; } // Prepare std::future for each model. 
Since this BLS backend // can handle requests in parallel, we will send all the inference // requests first and then retrieve them later. std::vector> futures(2); // The inference request object for sending internal requests. TRITONSERVER_InferenceRequest* irequest = nullptr; // For each inference request, the backend sends two requests on the // 'addsub_python' and 'addsub_onnx' models. try { for (size_t icount = 0; icount < 2; icount++) { // Initialize the inference request with required information. THROW_IF_TRITON_ERROR( PrepareInferenceRequest(bls_request, &irequest, model_names[icount])); THROW_IF_TRITON_ERROR(PrepareInferenceInput(bls_request, irequest)); THROW_IF_TRITON_ERROR(PrepareInferenceOutput(bls_request, irequest)); // Execute inference request. THROW_IF_TRITON_ERROR( model_executor_.AsyncExecute(irequest, &futures[icount])); } } catch (const BLSBackendException& bls_exception) { LOG_MESSAGE(TRITONSERVER_LOG_ERROR, bls_exception.what()); LOG_IF_ERROR( TRITONSERVER_InferenceRequestDelete(irequest), "Failed to delete inference request."); RESPOND_AND_SET_NULL_IF_ERROR( response, TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INTERNAL, "Failed to send inference requests")); return; } // If both internal requests are sent successfully, retrieve the output from // each request and construct the final response. ConstructFinalResponse(response, std::move(futures)); } void BLSExecutor::ConstructFinalResponse( TRITONBACKEND_Response** response, std::vector> futures) { // Prepare two TRITONSERVER_InferenceResponse* objects for 'addsub_python' and // 'addsub_onnx' respectively. 
std::vector completed_responses = { nullptr, nullptr}; const char* output_name; TRITONSERVER_DataType output_datatype; const int64_t* output_shape; uint64_t dims_count; size_t output_byte_size; TRITONSERVER_MemoryType output_memory_type; int64_t output_memory_id; const void* output_base; void* userp; for (size_t icount = 0; icount < 2; icount++) { // Retrieve the corresponding TRITONSERVER_InferenceResponse object from // 'futures'. The InferResponseComplete function sets the std::promise // so that this thread will block until the response is returned. completed_responses[icount] = futures[icount].get(); try { THROW_IF_TRITON_ERROR( TRITONSERVER_InferenceResponseError(completed_responses[icount])); } catch (const BLSBackendException& bls_exception) { LOG_MESSAGE(TRITONSERVER_LOG_ERROR, bls_exception.what()); if (completed_responses[icount] != nullptr) { LOG_IF_ERROR( TRITONSERVER_InferenceResponseDelete(completed_responses[icount]), "Failed to delete inference response."); } return; } // Retrieve outputs from 'completed_responses'. // Extract OUTPUT0 from the 'addsub_python' and OUTPUT1 from the // 'addsub_onnx' model to form the final inference response object. // Get all the information about the output tensor. RESPOND_AND_SET_NULL_IF_ERROR( response, TRITONSERVER_InferenceResponseOutput( completed_responses[icount], icount, &output_name, &output_datatype, &output_shape, &dims_count, &output_base, &output_byte_size, &output_memory_type, &output_memory_id, &userp)); // Create an output tensor in the final response with // the information retrieved above. TRITONBACKEND_Output* output; RESPOND_AND_SET_NULL_IF_ERROR( response, TRITONBACKEND_ResponseOutput( *response, &output, output_name, output_datatype, output_shape, dims_count)); // Get a buffer that holds the tensor data for the output. // We request a buffer in CPU memory but we have to handle any returned // type. If we get back a buffer in GPU memory we just fail the request. 
void* output_buffer; output_memory_type = TRITONSERVER_MEMORY_CPU; RESPOND_AND_SET_NULL_IF_ERROR( response, TRITONBACKEND_OutputBuffer( output, &output_buffer, output_byte_size, &output_memory_type, &output_memory_id)); if (output_memory_type == TRITONSERVER_MEMORY_GPU) { RESPOND_AND_SET_NULL_IF_ERROR( response, TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INTERNAL, "failed to create output buffer in CPU memory")); } // Fill the BLS output buffer with output data returned by internal // requests. memcpy(output_buffer, output_base, output_byte_size); LOG_IF_ERROR( TRITONSERVER_InferenceResponseDelete(completed_responses[icount]), "Failed to delete inference response."); } } }}} // namespace triton::backend::bls ================================================ FILE: examples/backends/bls/src/bls.h ================================================ // Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include "bls_utils.h" #include "triton/backend/backend_common.h" #include "triton/core/tritonbackend.h" #include "triton/core/tritonserver.h" namespace triton { namespace backend { namespace bls { // // BLSExecutor // // Includes the custom BLS logic for this backend. // This class shows how to utilize Triton in-process C-API to build the // execution pipeline. // class BLSExecutor { public: BLSExecutor(TRITONSERVER_Server* server); // Prepares the inference request that will be used internally. TRITONSERVER_Error* PrepareInferenceRequest( TRITONBACKEND_Request* bls_request, TRITONSERVER_InferenceRequest** irequest, const std::string model_name); // Prepares the input for the internal inference request. TRITONSERVER_Error* PrepareInferenceInput( TRITONBACKEND_Request* bls_request, TRITONSERVER_InferenceRequest* irequest); // Prepares the output for the internal inference request. TRITONSERVER_Error* PrepareInferenceOutput( TRITONBACKEND_Request* bls_request, TRITONSERVER_InferenceRequest* irequest); // Performs the whole BLS pipeline. void Execute( TRITONBACKEND_Request* bls_request, TRITONBACKEND_Response** response); // Constructs the final response. void ConstructFinalResponse( TRITONBACKEND_Response** response, std::vector> futures); private: // The server object that encapsulates all the functionality of the Triton // server and allows access to the Triton server API. 
TRITONSERVER_Server* server_; // The ModelExecutor object for executing inference request on a model. ModelExecutor model_executor_; }; }}} // namespace triton::backend::bls ================================================ FILE: examples/backends/bls/src/bls_utils.cc ================================================ // Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "bls_utils.h"

namespace triton { namespace backend { namespace bls {

// Allocates CPU memory for a result tensor of the internal inference
// request. Registered with TRITONSERVER_ResponseAllocatorNew below.
TRITONSERVER_Error*
CPUAllocator(
    TRITONSERVER_ResponseAllocator* allocator, const char* tensor_name,
    size_t byte_size, TRITONSERVER_MemoryType preferred_memory_type,
    int64_t preferred_memory_type_id, void* userp, void** buffer,
    void** buffer_userp, TRITONSERVER_MemoryType* actual_memory_type,
    int64_t* actual_memory_type_id)
{
  // For simplicity, this backend example always uses CPU memory regardless of
  // the preferred memory type. You can make the actual memory type and id that
  // we allocate be the same as preferred memory type. You can also provide a
  // customized allocator to support different preferred_memory_type, and reuse
  // memory buffer when possible.
  *actual_memory_type = TRITONSERVER_MEMORY_CPU;
  *actual_memory_type_id = preferred_memory_type_id;

  // If 'byte_size' is zero just return 'buffer' == nullptr, we don't
  // need to do any other book-keeping.
  if (byte_size == 0) {
    *buffer = nullptr;
    *buffer_userp = nullptr;
    LOG_MESSAGE(
        TRITONSERVER_LOG_VERBOSE,
        ("allocated " + std::to_string(byte_size) +
         " bytes for result tensor " + tensor_name)
            .c_str());
  } else {
    void* allocated_ptr = nullptr;
    *actual_memory_type = TRITONSERVER_MEMORY_CPU;
    allocated_ptr = malloc(byte_size);

    // Pass the tensor name with buffer_userp so we can show it when
    // releasing the buffer.
    if (allocated_ptr != nullptr) {
      *buffer = allocated_ptr;
      *buffer_userp = new std::string(tensor_name);
      LOG_MESSAGE(
          TRITONSERVER_LOG_VERBOSE,
          ("allocated " + std::to_string(byte_size) + " bytes in " +
           TRITONSERVER_MemoryTypeString(*actual_memory_type) +
           " for result tensor " + tensor_name)
              .c_str());
    }
  }

  return nullptr;  // Success
}

// Frees a buffer previously handed out by CPUAllocator. 'buffer_userp'
// carries the tensor name (a heap-allocated std::string) set above.
TRITONSERVER_Error*
ResponseRelease(
    TRITONSERVER_ResponseAllocator* allocator, void* buffer, void* buffer_userp,
    size_t byte_size, TRITONSERVER_MemoryType memory_type,
    int64_t memory_type_id)
{
  std::string* name = nullptr;
  if (buffer_userp != nullptr) {
    // NOTE(review): the <std::string*> template argument of this cast was
    // stripped by extraction and has been reconstructed from the
    // 'new std::string(tensor_name)' in CPUAllocator.
    name = reinterpret_cast<std::string*>(buffer_userp);
  } else {
    name = new std::string("");
  }

  std::stringstream ss;
  ss << buffer;
  std::string buffer_str = ss.str();

  LOG_MESSAGE(
      TRITONSERVER_LOG_VERBOSE,
      ("Releasing buffer " + buffer_str + " of size " +
       std::to_string(byte_size) + " in " +
       TRITONSERVER_MemoryTypeString(memory_type) + " for result '" + *name)
          .c_str());

  // Only CPU memory is ever allocated by CPUAllocator, so anything else
  // indicates a bug; log it rather than freeing with the wrong API.
  switch (memory_type) {
    case TRITONSERVER_MEMORY_CPU:
      free(buffer);
      break;
    default:
      LOG_MESSAGE(
          TRITONSERVER_LOG_ERROR,
          std::string(
              "error: unexpected buffer allocated in CUDA managed memory")
              .c_str());
      break;
  }

  delete name;

  return nullptr;  // Success
}

// Request-release callback: the internal request is no longer needed once
// Triton signals completion, so delete it here.
void
InferRequestComplete(
    TRITONSERVER_InferenceRequest* request, const uint32_t flags, void* userp)
{
  if (request != nullptr) {
    LOG_IF_ERROR(
        TRITONSERVER_InferenceRequestDelete(request),
        "Failed to delete inference request.");
  }
}

// Response callback: fulfills the promise created in AsyncExecute so the
// caller's future becomes ready.
void
InferResponseComplete(
    TRITONSERVER_InferenceResponse* response, const uint32_t flags, void* userp)
{
  // The following logic only works for non-decoupled models as for decoupled
  // models it may send multiple responses for a request or not send any
  // responses for a request. Need to modify this function if the model is
  // using decoupled API.
  if (response != nullptr) {
    // Send 'response' to the future.
    // NOTE(review): the template arguments of this promise/cast were stripped
    // by extraction; reconstructed from the promise allocated in AsyncExecute.
    std::promise<TRITONSERVER_InferenceResponse*>* p =
        reinterpret_cast<std::promise<TRITONSERVER_InferenceResponse*>*>(userp);
    p->set_value(response);
    delete p;
  }
}

ModelExecutor::ModelExecutor(TRITONSERVER_Server* server) : server_(server)
{
  // When triton needs a buffer to hold an output tensor, it will ask
  // us to provide the buffer. In this way we can have any buffer
  // management and sharing strategy that we want. To communicate to
  // triton the functions that we want it to call to perform the
  // allocations, we create a "response allocator" object. We pass
  // this response allocate object to triton when requesting
  // inference. We can reuse this response allocator object for any
  // number of inference requests.
  allocator_ = nullptr;
  THROW_IF_TRITON_ERROR(TRITONSERVER_ResponseAllocatorNew(
      &allocator_, CPUAllocator, ResponseRelease, nullptr /* start_fn */));
}

TRITONSERVER_Error*
ModelExecutor::AsyncExecute(
    TRITONSERVER_InferenceRequest* irequest,
    std::future<TRITONSERVER_InferenceResponse*>* future)
{
  // Perform inference by calling TRITONSERVER_ServerInferAsync. This
  // call is asynchronous and therefore returns immediately. The
  // completion of the inference and delivery of the response is done
  // by triton by calling the "response complete" callback functions
  // (InferResponseComplete in this case).
  auto p = new std::promise<TRITONSERVER_InferenceResponse*>();
  *future = p->get_future();

  RETURN_IF_ERROR(TRITONSERVER_InferenceRequestSetResponseCallback(
      irequest, allocator_, nullptr /* response_allocator_userp */,
      InferResponseComplete, reinterpret_cast<void*>(p)));
  RETURN_IF_ERROR(
      TRITONSERVER_ServerInferAsync(server_, irequest, nullptr /* trace */));

  return nullptr;  // success
}

}}}  // namespace triton::backend::bls


================================================
FILE: examples/backends/bls/src/bls_utils.h
================================================
// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include "triton/backend/backend_common.h" #include "triton/core/tritonbackend.h" #include "triton/core/tritonserver.h" namespace triton { namespace backend { namespace bls { #define THROW_IF_TRITON_ERROR(X) \ do { \ TRITONSERVER_Error* tie_err__ = (X); \ if (tie_err__ != nullptr) { \ throw BLSBackendException(TRITONSERVER_ErrorMessage(tie_err__)); \ } \ } while (false) // // BLSBackendException // // Exception thrown if error occurs in BLSBackend. 
// struct BLSBackendException : std::exception { BLSBackendException(const std::string& message) : message_(message) {} const char* what() const throw() { return message_.c_str(); } std::string message_; }; // Performs the allocations of output tensors. TRITONSERVER_Error* CPUAllocator( TRITONSERVER_ResponseAllocator* allocator, const char* tensor_name, size_t byte_size, TRITONSERVER_MemoryType preferred_memory_type, int64_t preferred_memory_type_id, void* userp, void** buffer, void** buffer_userp, TRITONSERVER_MemoryType* actual_memory_type, int64_t* actual_memory_type_id); // Callback functions for server inference. TRITONSERVER_Error* ResponseRelease( TRITONSERVER_ResponseAllocator* allocator, void* buffer, void* buffer_userp, size_t byte_size, TRITONSERVER_MemoryType memory_type, int64_t memory_type_id); void InferRequestComplete( TRITONSERVER_InferenceRequest* request, const uint32_t flags, void* userp); void InferResponseComplete( TRITONSERVER_InferenceResponse* response, const uint32_t flags, void* userp); // // ModelExecutor // // Execute inference request on a model. // class ModelExecutor { public: ModelExecutor(TRITONSERVER_Server* server); // Performs async inference request. TRITONSERVER_Error* AsyncExecute( TRITONSERVER_InferenceRequest* irequest, std::future* future); private: // The server object that encapsulates all the functionality of the Triton // server and allows access to the Triton server API. TRITONSERVER_Server* server_; // The allocator object that will be used for allocating output tensors. TRITONSERVER_ResponseAllocator* allocator_; }; }}} // namespace triton::backend::bls ================================================ FILE: examples/backends/bls/src/libtriton_bls.ldscript ================================================ # Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. { global: TRITONBACKEND_*; local: *; }; ================================================ FILE: examples/backends/minimal/CMakeLists.txt ================================================ # Copyright 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. 
# * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmake_minimum_required(VERSION 3.31.8) project(tutorialminimalbackend LANGUAGES C CXX) # # Options # # Must include options required for this project as well as any # projects included in this one by FetchContent. # # GPU support is disabled by default because minimal backend doesn't # use GPUs. 
#
option(TRITON_ENABLE_GPU "Enable GPU support in backend" OFF)
option(TRITON_ENABLE_STATS "Include statistics collections in backend" ON)
set(TRITON_REPO_ORGANIZATION "https://github.com/triton-inference-server" CACHE STRING "Git repository to pull from")
set(TRITON_COMMON_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/common repo")
set(TRITON_CORE_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/core repo")
set(TRITON_BACKEND_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/backend repo")

#
# Setting C++ min standard
#
set(TRITON_MIN_CXX_STANDARD 17 CACHE STRING "The minimum C++ standard whose features are requested to build this target.")

if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release)
endif()

#
# Dependencies
#
# FetchContent requires us to include the transitive closure of all
# repos that we depend on so that we can override the tags.
#
include(FetchContent)

FetchContent_Declare(
  repo-common
  GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/common.git
  GIT_TAG ${TRITON_COMMON_REPO_TAG}
  GIT_SHALLOW ON
)
FetchContent_Declare(
  repo-core
  GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/core.git
  GIT_TAG ${TRITON_CORE_REPO_TAG}
  GIT_SHALLOW ON
)
FetchContent_Declare(
  repo-backend
  GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/backend.git
  GIT_TAG ${TRITON_BACKEND_REPO_TAG}
  GIT_SHALLOW ON
)
FetchContent_MakeAvailable(repo-common repo-core repo-backend)

#
# The backend must be built into a shared library. Use an ldscript to
# hide all symbols except for the TRITONBACKEND API.
#
configure_file(src/libtriton_minimal.ldscript libtriton_minimal.ldscript COPYONLY)

add_library(
  triton-minimal-backend SHARED
  src/minimal.cc
)

add_library(
  TutorialMinimalBackend::triton-minimal-backend ALIAS triton-minimal-backend
)

target_include_directories(
  triton-minimal-backend
  PRIVATE
    ${CMAKE_CURRENT_SOURCE_DIR}/src
)

target_compile_features(triton-minimal-backend PRIVATE cxx_std_${TRITON_MIN_CXX_STANDARD})
# NOTE(review): the generator expressions below were mangled by extraction
# (the dump shows "$<$,$,$>:"); they are reconstructed to the standard Triton
# backend pattern -- per-compiler warning flags for GCC/Clang and MSVC.
target_compile_options(
  triton-minimal-backend PRIVATE
  $<$<OR:$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>,$<CXX_COMPILER_ID:GNU>>:
    -Wall -Wextra -Wno-unused-parameter -Wno-type-limits -Werror>
  $<$<CXX_COMPILER_ID:MSVC>:/Wall /D_WIN32_WINNT=0x0A00 /EHsc /Zc:preprocessor>
)

target_link_libraries(
  triton-minimal-backend
  PRIVATE
    triton-core-serverapi   # from repo-core
    triton-core-backendapi  # from repo-core
    triton-core-serverstub  # from repo-core
    triton-backend-utils    # from repo-backend
)

if(WIN32)
  set_target_properties(
    triton-minimal-backend PROPERTIES
    POSITION_INDEPENDENT_CODE ON
    OUTPUT_NAME triton_minimal
  )
else()
  set_target_properties(
    triton-minimal-backend PROPERTIES
    POSITION_INDEPENDENT_CODE ON
    OUTPUT_NAME triton_minimal
    LINK_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libtriton_minimal.ldscript
    LINK_FLAGS "-Wl,--version-script libtriton_minimal.ldscript"
  )
endif()

#
# Install
#
include(GNUInstallDirs)
set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/TutorialMinimalBackend)

install(
  TARGETS
    triton-minimal-backend
  EXPORT
    triton-minimal-backend-targets
  LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/minimal
  RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/minimal
)

install(
  EXPORT
    triton-minimal-backend-targets
  FILE
    TutorialMinimalBackendTargets.cmake
  NAMESPACE
    TutorialMinimalBackend::
  DESTINATION
    ${INSTALL_CONFIGDIR}
)

include(CMakePackageConfigHelpers)
configure_package_config_file(
  ${CMAKE_CURRENT_LIST_DIR}/cmake/TutorialMinimalBackendConfig.cmake.in
  ${CMAKE_CURRENT_BINARY_DIR}/TutorialMinimalBackendConfig.cmake
  INSTALL_DESTINATION ${INSTALL_CONFIGDIR}
)

install(
  FILES
  ${CMAKE_CURRENT_BINARY_DIR}/TutorialMinimalBackendConfig.cmake
DESTINATION ${INSTALL_CONFIGDIR} ) # # Export from build tree # export( EXPORT triton-minimal-backend-targets FILE ${CMAKE_CURRENT_BINARY_DIR}/TutorialMinimalBackendTargets.cmake NAMESPACE TutorialMinimalBackend:: ) export(PACKAGE TutorialMinimalBackend) ================================================ FILE: examples/backends/minimal/cmake/TutorialMinimalBackendConfig.cmake.in ================================================ # Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
include(CMakeFindDependencyMacro) get_filename_component( TUTORIALMINIMALBACKEND_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH ) list(APPEND CMAKE_MODULE_PATH ${TUTORIALMINIMALBACKEND_CMAKE_DIR}) if(NOT TARGET TutorialMinimalBackend::triton-minimal-backend) include("${TUTORIALMINIMALBACKEND_CMAKE_DIR}/TutorialMinimalBackendTargets.cmake") endif() set(TUTORIALMINIMALBACKEND_LIBRARIES TutorialMinimalBackend::triton-minimal-backend) ================================================ FILE: examples/backends/minimal/src/libtriton_minimal.ldscript ================================================ # Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. { global: TRITONBACKEND_*; local: *; }; ================================================ FILE: examples/backends/minimal/src/minimal.cc ================================================ // Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "triton/backend/backend_common.h" #include "triton/backend/backend_input_collector.h" #include "triton/backend/backend_model.h" #include "triton/backend/backend_model_instance.h" #include "triton/backend/backend_output_responder.h" #include "triton/core/tritonbackend.h" namespace triton { namespace backend { namespace minimal { // // Minimal backend that demonstrates the TRITONBACKEND API. This // backend works for any model that has 1 input called "IN0" with // INT32 datatype and shape [ 4 ] and 1 output called "OUT0" with // INT32 datatype and shape [ 4 ]. The backend supports both batching // and non-batching models. // // For each batch of requests, the backend returns the input tensor // value in the output tensor. // ///////////// // // ModelState // // State associated with a model that is using this backend. An object // of this class is created and associated with each // TRITONBACKEND_Model. ModelState is derived from BackendModel class // provided in the backend utilities that provides many common // functions. 
// class ModelState : public BackendModel { public: static TRITONSERVER_Error* Create( TRITONBACKEND_Model* triton_model, ModelState** state); virtual ~ModelState() = default; private: ModelState(TRITONBACKEND_Model* triton_model) : BackendModel(triton_model) {} }; TRITONSERVER_Error* ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state) { try { *state = new ModelState(triton_model); } catch (const BackendModelException& ex) { RETURN_ERROR_IF_TRUE( ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, std::string("unexpected nullptr in BackendModelException")); RETURN_IF_ERROR(ex.err_); } return nullptr; // success } extern "C" { // Triton calls TRITONBACKEND_ModelInitialize when a model is loaded // to allow the backend to create any state associated with the model, // and to also examine the model configuration to determine if the // configuration is suitable for the backend. Any errors reported by // this function will prevent the model from loading. // TRITONSERVER_Error* TRITONBACKEND_ModelInitialize(TRITONBACKEND_Model* model) { // Create a ModelState object and associate it with the // TRITONBACKEND_Model. If anything goes wrong with initialization // of the model state then an error is returned and Triton will fail // to load the model. ModelState* model_state; RETURN_IF_ERROR(ModelState::Create(model, &model_state)); RETURN_IF_ERROR( TRITONBACKEND_ModelSetState(model, reinterpret_cast(model_state))); return nullptr; // success } // Triton calls TRITONBACKEND_ModelFinalize when a model is no longer // needed. The backend should cleanup any state associated with the // model. This function will not be called until all model instances // of the model have been finalized. 
// TRITONSERVER_Error* TRITONBACKEND_ModelFinalize(TRITONBACKEND_Model* model) { void* vstate; RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vstate)); ModelState* model_state = reinterpret_cast(vstate); delete model_state; return nullptr; // success } } // extern "C" ///////////// // // ModelInstanceState // // State associated with a model instance. An object of this class is // created and associated with each // TRITONBACKEND_ModelInstance. ModelInstanceState is derived from // BackendModelInstance class provided in the backend utilities that // provides many common functions. // class ModelInstanceState : public BackendModelInstance { public: static TRITONSERVER_Error* Create( ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance, ModelInstanceState** state); virtual ~ModelInstanceState() = default; // Get the state of the model that corresponds to this instance. ModelState* StateForModel() const { return model_state_; } private: ModelInstanceState( ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance) : BackendModelInstance(model_state, triton_model_instance), model_state_(model_state) { } ModelState* model_state_; }; TRITONSERVER_Error* ModelInstanceState::Create( ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance, ModelInstanceState** state) { try { *state = new ModelInstanceState(model_state, triton_model_instance); } catch (const BackendModelInstanceException& ex) { RETURN_ERROR_IF_TRUE( ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, std::string("unexpected nullptr in BackendModelInstanceException")); RETURN_IF_ERROR(ex.err_); } return nullptr; // success } extern "C" { // Triton calls TRITONBACKEND_ModelInstanceInitialize when a model // instance is created to allow the backend to initialize any state // associated with the instance. 
// TRITONSERVER_Error* TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance) { // Get the model state associated with this instance's model. TRITONBACKEND_Model* model; RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceModel(instance, &model)); void* vmodelstate; RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vmodelstate)); ModelState* model_state = reinterpret_cast(vmodelstate); // Create a ModelInstanceState object and associate it with the // TRITONBACKEND_ModelInstance. ModelInstanceState* instance_state; RETURN_IF_ERROR( ModelInstanceState::Create(model_state, instance, &instance_state)); RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceSetState( instance, reinterpret_cast(instance_state))); return nullptr; // success } // Triton calls TRITONBACKEND_ModelInstanceFinalize when a model // instance is no longer needed. The backend should cleanup any state // associated with the model instance. // TRITONSERVER_Error* TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance) { void* vstate; RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(instance, &vstate)); ModelInstanceState* instance_state = reinterpret_cast(vstate); delete instance_state; return nullptr; // success } } // extern "C" ///////////// extern "C" { // When Triton calls TRITONBACKEND_ModelInstanceExecute it is required // that a backend create a response for each request in the batch. A // response may be the output tensors required for that request or may // be an error that is returned in the response. // TRITONSERVER_Error* TRITONBACKEND_ModelInstanceExecute( TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests, const uint32_t request_count) { // Triton will not call this function simultaneously for the same // 'instance'. But since this backend could be used by multiple // instances from multiple models the implementation needs to handle // multiple calls to this function at the same time (with different // 'instance' objects). 
Best practice for a high-performance // implementation is to avoid introducing mutex/lock and instead use // only function-local and model-instance-specific state. ModelInstanceState* instance_state; RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState( instance, reinterpret_cast(&instance_state))); ModelState* model_state = instance_state->StateForModel(); // 'responses' is initialized as a parallel array to 'requests', // with one TRITONBACKEND_Response object for each // TRITONBACKEND_Request object. If something goes wrong while // creating these response objects, the backend simply returns an // error from TRITONBACKEND_ModelInstanceExecute, indicating to // Triton that this backend did not create or send any responses and // so it is up to Triton to create and send an appropriate error // response for each request. RETURN_IF_ERROR is one of several // useful macros for error handling that can be found in // backend_common.h. std::vector responses; responses.reserve(request_count); for (uint32_t r = 0; r < request_count; ++r) { TRITONBACKEND_Request* request = requests[r]; TRITONBACKEND_Response* response; RETURN_IF_ERROR(TRITONBACKEND_ResponseNew(&response, request)); responses.push_back(response); } // At this point, the backend takes ownership of 'requests', which // means that it is responsible for sending a response for every // request. From here, even if something goes wrong in processing, // the backend must return 'nullptr' from this function to indicate // success. Any errors and failures must be communicated via the // response objects. // // To simplify error handling, the backend utilities manage // 'responses' in a specific way and it is recommended that backends // follow this same pattern. 
When an error is detected in the // processing of a request, an appropriate error response is sent // and the corresponding TRITONBACKEND_Response object within // 'responses' is set to nullptr to indicate that the // request/response has already been handled and no further processing // should be performed for that request. Even if all responses fail, // the backend still allows execution to flow to the end of the // function. RESPOND_AND_SET_NULL_IF_ERROR, and // RESPOND_ALL_AND_SET_NULL_IF_ERROR are macros from // backend_common.h that assist in this management of response // objects. // The backend could iterate over the 'requests' and process each // one separately. But for performance reasons it is usually // preferred to create batched input tensors that are processed // simultaneously. This is especially true for devices like GPUs // that are capable of exploiting the large amount parallelism // exposed by larger data sets. // // The backend utilities provide a "collector" to facilitate this // batching process. The 'collector's ProcessTensor function will // combine a tensor's value from each request in the batch into a // single contiguous buffer. The buffer can be provided by the // backend or 'collector' can create and manage it. In this backend, // there is not a specific buffer into which the batch should be // created, so use ProcessTensor arguments that cause collector to // manage it. BackendInputCollector collector( requests, request_count, &responses, model_state->TritonMemoryManager(), false /* pinned_enabled */, nullptr /* stream*/); // To instruct ProcessTensor to "gather" the entire batch of IN0 // input tensors into a single contiguous buffer in CPU memory, set // the "allowed input types" to be the CPU ones (see tritonserver.h // in the triton-inference-server/core repo for allowed memory // types). 
std::vector> allowed_input_types = {{TRITONSERVER_MEMORY_CPU_PINNED, 0}, {TRITONSERVER_MEMORY_CPU, 0}}; const char* input_buffer; size_t input_buffer_byte_size; TRITONSERVER_MemoryType input_buffer_memory_type; int64_t input_buffer_memory_type_id; RESPOND_ALL_AND_SET_NULL_IF_ERROR( responses, request_count, collector.ProcessTensor( "IN0", nullptr /* existing_buffer */, 0 /* existing_buffer_byte_size */, allowed_input_types, &input_buffer, &input_buffer_byte_size, &input_buffer_memory_type, &input_buffer_memory_type_id)); // Finalize the collector. If 'true' is returned, 'input_buffer' // will not be valid until the backend synchronizes the CUDA // stream or event that was used when creating the collector. For // this backend, GPU is not supported and so no CUDA sync should // be needed; so if 'true' is returned simply log an error. const bool need_cuda_input_sync = collector.Finalize(); if (need_cuda_input_sync) { LOG_MESSAGE( TRITONSERVER_LOG_ERROR, "'minimal' backend: unexpected CUDA sync required by collector"); } // 'input_buffer' contains the batched "IN0" tensor. The backend can // implement whatever logic is necessary to produce "OUT0". This // backend simply returns the IN0 value in OUT0 so no actual // computation is needed. LOG_MESSAGE( TRITONSERVER_LOG_INFO, (std::string("model ") + model_state->Name() + ": requests in batch " + std::to_string(request_count)) .c_str()); std::string tstr; IGNORE_ERROR(BufferAsTypedString( tstr, input_buffer, input_buffer_byte_size, TRITONSERVER_TYPE_INT32)); LOG_MESSAGE( TRITONSERVER_LOG_INFO, (std::string("batched IN0 value: ") + tstr).c_str()); const char* output_buffer = input_buffer; TRITONSERVER_MemoryType output_buffer_memory_type = input_buffer_memory_type; int64_t output_buffer_memory_type_id = input_buffer_memory_type_id; // This backend supports models that batch along the first dimension // and those that don't batch. For non-batch models the output shape // will be [ 4 ]. 
For batch models the output shape will be [ -1, 4 // ] and the backend "responder" utility below will set the // appropriate batch dimension value for each response. std::vector output_batch_shape; bool supports_first_dim_batching; RESPOND_ALL_AND_SET_NULL_IF_ERROR( responses, request_count, model_state->SupportsFirstDimBatching(&supports_first_dim_batching)); if (supports_first_dim_batching) { output_batch_shape.push_back(-1); } output_batch_shape.push_back(4); // Because the OUT0 values are concatenated into a single contiguous // 'output_buffer', the backend must "scatter" them out to the // individual response OUT0 tensors. The backend utilities provide // a "responder" to facilitate this scattering process. // The 'responders's ProcessTensor function will copy the portion of // 'output_buffer' corresponding to each request's output into the // response for that request. BackendOutputResponder responder( requests, request_count, &responses, model_state->TritonMemoryManager(), supports_first_dim_batching, false /* pinned_enabled */, nullptr /* stream*/); responder.ProcessTensor( "OUT0", TRITONSERVER_TYPE_INT32, output_batch_shape, output_buffer, output_buffer_memory_type, output_buffer_memory_type_id); // Finalize the responder. If 'true' is returned, the OUT0 // tensors' data will not be valid until the backend synchronizes // the CUDA stream or event that was used when creating the // responder. For this backend, GPU is not supported and so no // CUDA sync should be needed; so if 'true' is returned simply log // an error. const bool need_cuda_output_sync = responder.Finalize(); if (need_cuda_output_sync) { LOG_MESSAGE( TRITONSERVER_LOG_ERROR, "'minimal' backend: unexpected CUDA sync required by responder"); } // Send all the responses that haven't already been sent because of // an earlier error. 
for (auto& response : responses) { if (response != nullptr) { LOG_IF_ERROR( TRITONBACKEND_ResponseSend( response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, nullptr), "failed to send response"); } } // Done with the request objects so release them. for (uint32_t r = 0; r < request_count; ++r) { auto& request = requests[r]; LOG_IF_ERROR( TRITONBACKEND_RequestRelease(request, TRITONSERVER_REQUEST_RELEASE_ALL), "failed releasing request"); } return nullptr; // success } } // extern "C" }}} // namespace triton::backend::minimal ================================================ FILE: examples/backends/recommended/CMakeLists.txt ================================================ # Copyright 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmake_minimum_required(VERSION 3.31.8) project(tutorialrecommendedbackend LANGUAGES C CXX) # # Options # # Must include options required for this project as well as any # projects included in this one by FetchContent. # # GPU support is disabled by default because recommended backend # doesn't use GPUs. # option(TRITON_ENABLE_GPU "Enable GPU support in backend" OFF) option(TRITON_ENABLE_STATS "Include statistics collections in backend" ON) set(TRITON_REPO_ORGANIZATION "https://github.com/triton-inference-server" CACHE STRING "Git repository to pull from") set(TRITON_COMMON_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/common repo") set(TRITON_CORE_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/core repo") set(TRITON_BACKEND_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/backend repo") # # Setting C++ min standard # set(TRITON_MIN_CXX_STANDARD 17 CACHE STRING "The minimum C++ standard whose features are requested to build this target.") if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release) endif() # # Dependencies # # FetchContent requires us to include the transitive closure of all # repos that we depend on so that we can override the tags. 
# include(FetchContent) FetchContent_Declare( repo-common GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/common.git GIT_TAG ${TRITON_COMMON_REPO_TAG} GIT_SHALLOW ON ) FetchContent_Declare( repo-core GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/core.git GIT_TAG ${TRITON_CORE_REPO_TAG} GIT_SHALLOW ON ) FetchContent_Declare( repo-backend GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/backend.git GIT_TAG ${TRITON_BACKEND_REPO_TAG} GIT_SHALLOW ON ) FetchContent_MakeAvailable(repo-common repo-core repo-backend) # # The backend must be built into a shared library. Use an ldscript to # hide all symbols except for the TRITONBACKEND API. # configure_file(src/libtriton_recommended.ldscript libtriton_recommended.ldscript COPYONLY) add_library( triton-recommended-backend SHARED src/recommended.cc ) add_library( TutorialRecommendedBackend::triton-recommended-backend ALIAS triton-recommended-backend ) target_include_directories( triton-recommended-backend PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src ) target_compile_features(triton-recommended-backend PRIVATE cxx_std_${TRITON_MIN_CXX_STANDARD}) target_compile_options( triton-recommended-backend PRIVATE $<$,$,$>: -Wall -Wextra -Wno-unused-parameter -Wno-type-limits -Werror> $<$:/Wall /D_WIN32_WINNT=0x0A00 /EHsc /Zc:preprocessor> ) target_link_libraries( triton-recommended-backend PRIVATE triton-core-serverapi # from repo-core triton-core-backendapi # from repo-core triton-core-serverstub # from repo-core triton-backend-utils # from repo-backend ) if(WIN32) set_target_properties( triton-recommended-backend PROPERTIES POSITION_INDEPENDENT_CODE ON OUTPUT_NAME triton_recommended ) else() set_target_properties( triton-recommended-backend PROPERTIES POSITION_INDEPENDENT_CODE ON OUTPUT_NAME triton_recommended LINK_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libtriton_recommended.ldscript LINK_FLAGS "-Wl,--version-script libtriton_recommended.ldscript" ) endif() # # Install # include(GNUInstallDirs) set(INSTALL_CONFIGDIR 
${CMAKE_INSTALL_LIBDIR}/cmake/TutorialRecommendedBackend) install( TARGETS triton-recommended-backend EXPORT triton-recommended-backend-targets LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/recommended RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/recommended ) install( EXPORT triton-recommended-backend-targets FILE TutorialRecommendedBackendTargets.cmake NAMESPACE TutorialRecommendedBackend:: DESTINATION ${INSTALL_CONFIGDIR} ) include(CMakePackageConfigHelpers) configure_package_config_file( ${CMAKE_CURRENT_LIST_DIR}/cmake/TutorialRecommendedBackendConfig.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/TutorialRecommendedBackendConfig.cmake INSTALL_DESTINATION ${INSTALL_CONFIGDIR} ) install( FILES ${CMAKE_CURRENT_BINARY_DIR}/TutorialRecommendedBackendConfig.cmake DESTINATION ${INSTALL_CONFIGDIR} ) # # Export from build tree # export( EXPORT triton-recommended-backend-targets FILE ${CMAKE_CURRENT_BINARY_DIR}/TutorialRecommendedBackendTargets.cmake NAMESPACE TutorialRecommendedBackend:: ) export(PACKAGE TutorialRecommendedBackend) ================================================ FILE: examples/backends/recommended/cmake/TutorialRecommendedBackendConfig.cmake.in ================================================ # Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. 
# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. include(CMakeFindDependencyMacro) get_filename_component( TUTORIALRECOMMENDEDBACKEND_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH ) list(APPEND CMAKE_MODULE_PATH ${TUTORIALRECOMMENDEDBACKEND_CMAKE_DIR}) if(NOT TARGET TutorialRecommendedBackend::triton-recommended-backend) include("${TUTORIALRECOMMENDEDBACKEND_CMAKE_DIR}/TutorialRecommendedBackendTargets.cmake") endif() set(TUTORIALRECOMMENDEDBACKEND_LIBRARIES TutorialRecommendedBackend::triton-recommended-backend) ================================================ FILE: examples/backends/recommended/src/libtriton_recommended.ldscript ================================================ # Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. 
# * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. { global: TRITONBACKEND_*; local: *; }; ================================================ FILE: examples/backends/recommended/src/recommended.cc ================================================ // Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. 
// // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "triton/backend/backend_common.h" #include "triton/backend/backend_input_collector.h" #include "triton/backend/backend_model.h" #include "triton/backend/backend_model_instance.h" #include "triton/backend/backend_output_responder.h" #include "triton/core/tritonbackend.h" namespace triton { namespace backend { namespace recommended { // // Backend that demonstrates the TRITONBACKEND API. This backend works // for any model that has 1 input with any datatype and any shape and // 1 output with the same shape and datatype as the input. The backend // supports both batching and non-batching models. // // For each batch of requests, the backend returns the input tensor // value in the output tensor. // ///////////// extern "C" { // Triton calls TRITONBACKEND_Initialize when a backend is loaded into // Triton to allow the backend to create and initialize any state that // is intended to be shared across all models and model instances that // use the backend. The backend should also verify version // compatibility with Triton in this function. 
// TRITONSERVER_Error* TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend) { const char* cname; RETURN_IF_ERROR(TRITONBACKEND_BackendName(backend, &cname)); std::string name(cname); LOG_MESSAGE( TRITONSERVER_LOG_INFO, (std::string("TRITONBACKEND_Initialize: ") + name).c_str()); // Check the backend API version that Triton supports vs. what this // backend was compiled against. Make sure that the Triton major // version is the same and the minor version is >= what this backend // uses. uint32_t api_version_major, api_version_minor; RETURN_IF_ERROR( TRITONBACKEND_ApiVersion(&api_version_major, &api_version_minor)); LOG_MESSAGE( TRITONSERVER_LOG_INFO, (std::string("Triton TRITONBACKEND API version: ") + std::to_string(api_version_major) + "." + std::to_string(api_version_minor)) .c_str()); LOG_MESSAGE( TRITONSERVER_LOG_INFO, (std::string("'") + name + "' TRITONBACKEND API version: " + std::to_string(TRITONBACKEND_API_VERSION_MAJOR) + "." + std::to_string(TRITONBACKEND_API_VERSION_MINOR)) .c_str()); if ((api_version_major != TRITONBACKEND_API_VERSION_MAJOR) || (api_version_minor < TRITONBACKEND_API_VERSION_MINOR)) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_UNSUPPORTED, "triton backend API version does not support this backend"); } // The backend configuration may contain information needed by the // backend, such as tritonserver command-line arguments. This // backend doesn't use any such configuration but for this example // print whatever is available. TRITONSERVER_Message* backend_config_message; RETURN_IF_ERROR( TRITONBACKEND_BackendConfig(backend, &backend_config_message)); const char* buffer; size_t byte_size; RETURN_IF_ERROR(TRITONSERVER_MessageSerializeToJson( backend_config_message, &buffer, &byte_size)); LOG_MESSAGE( TRITONSERVER_LOG_INFO, (std::string("backend configuration:\n") + buffer).c_str()); // This backend does not require any "global" state but as an // example create a string to demonstrate. 
std::string* state = new std::string("backend state"); RETURN_IF_ERROR( TRITONBACKEND_BackendSetState(backend, reinterpret_cast(state))); return nullptr; // success } // Triton calls TRITONBACKEND_Finalize when a backend is no longer // needed. // TRITONSERVER_Error* TRITONBACKEND_Finalize(TRITONBACKEND_Backend* backend) { // Delete the "global" state associated with the backend. void* vstate; RETURN_IF_ERROR(TRITONBACKEND_BackendState(backend, &vstate)); std::string* state = reinterpret_cast(vstate); LOG_MESSAGE( TRITONSERVER_LOG_INFO, (std::string("TRITONBACKEND_Finalize: state is '") + *state + "'") .c_str()); delete state; return nullptr; // success } } // extern "C" ///////////// // // ModelState // // State associated with a model that is using this backend. An object // of this class is created and associated with each // TRITONBACKEND_Model. ModelState is derived from BackendModel class // provided in the backend utilities that provides many common // functions. // class ModelState : public BackendModel { public: static TRITONSERVER_Error* Create( TRITONBACKEND_Model* triton_model, ModelState** state); virtual ~ModelState() = default; // Name of the input and output tensor const std::string& InputTensorName() const { return input_name_; } const std::string& OutputTensorName() const { return output_name_; } // Datatype of the input and output tensor TRITONSERVER_DataType TensorDataType() const { return datatype_; } // Shape of the input and output tensor as given in the model // configuration file. This shape will not include the batch // dimension (if the model has one). const std::vector& TensorNonBatchShape() const { return nb_shape_; } // Shape of the input and output tensor, including the batch // dimension (if the model has one). This method cannot be called // until the model is completely loaded and initialized, including // all instances of the model. In practice, this means that backend // should only call it in TRITONBACKEND_ModelInstanceExecute. 
TRITONSERVER_Error* TensorShape(std::vector& shape); // Validate that this model is supported by this backend. TRITONSERVER_Error* ValidateModelConfig(); private: ModelState(TRITONBACKEND_Model* triton_model); std::string input_name_; std::string output_name_; TRITONSERVER_DataType datatype_; bool shape_initialized_; std::vector nb_shape_; std::vector shape_; }; ModelState::ModelState(TRITONBACKEND_Model* triton_model) : BackendModel(triton_model), shape_initialized_(false) { // Validate that the model's configuration matches what is supported // by this backend. THROW_IF_BACKEND_MODEL_ERROR(ValidateModelConfig()); } TRITONSERVER_Error* ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state) { try { *state = new ModelState(triton_model); } catch (const BackendModelException& ex) { RETURN_ERROR_IF_TRUE( ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, std::string("unexpected nullptr in BackendModelException")); RETURN_IF_ERROR(ex.err_); } return nullptr; // success } TRITONSERVER_Error* ModelState::TensorShape(std::vector& shape) { // This backend supports models that batch along the first dimension // and those that don't batch. For non-batch models the output shape // will be the shape from the model configuration. For batch models // the output shape will be the shape from the model configuration // prepended with [ -1 ] to represent the batch dimension. The // backend "responder" utility used below will set the appropriate // batch dimension value for each response. The shape needs to be // initialized lazily because the SupportsFirstDimBatching function // cannot be used until the model is completely loaded. 
if (!shape_initialized_) { bool supports_first_dim_batching; RETURN_IF_ERROR(SupportsFirstDimBatching(&supports_first_dim_batching)); if (supports_first_dim_batching) { shape_.push_back(-1); } shape_.insert(shape_.end(), nb_shape_.begin(), nb_shape_.end()); shape_initialized_ = true; } shape = shape_; return nullptr; // success } TRITONSERVER_Error* ModelState::ValidateModelConfig() { // If verbose logging is enabled, dump the model's configuration as // JSON into the console output. if (TRITONSERVER_LogIsEnabled(TRITONSERVER_LOG_VERBOSE)) { common::TritonJson::WriteBuffer buffer; RETURN_IF_ERROR(ModelConfig().PrettyWrite(&buffer)); LOG_MESSAGE( TRITONSERVER_LOG_VERBOSE, (std::string("model configuration:\n") + buffer.Contents()).c_str()); } // ModelConfig is the model configuration as a TritonJson // object. Use the TritonJson utilities to parse the JSON and // determine if the configuration is supported by this backend. common::TritonJson::Value inputs, outputs; RETURN_IF_ERROR(ModelConfig().MemberAsArray("input", &inputs)); RETURN_IF_ERROR(ModelConfig().MemberAsArray("output", &outputs)); // The model must have exactly 1 input and 1 output. RETURN_ERROR_IF_FALSE( inputs.ArraySize() == 1, TRITONSERVER_ERROR_INVALID_ARG, std::string("model configuration must have 1 input")); RETURN_ERROR_IF_FALSE( outputs.ArraySize() == 1, TRITONSERVER_ERROR_INVALID_ARG, std::string("model configuration must have 1 output")); common::TritonJson::Value input, output; RETURN_IF_ERROR(inputs.IndexAsObject(0, &input)); RETURN_IF_ERROR(outputs.IndexAsObject(0, &output)); // Record the input and output name in the model state. 
const char* input_name; size_t input_name_len; RETURN_IF_ERROR(input.MemberAsString("name", &input_name, &input_name_len)); input_name_ = std::string(input_name); const char* output_name; size_t output_name_len; RETURN_IF_ERROR( output.MemberAsString("name", &output_name, &output_name_len)); output_name_ = std::string(output_name); // Input and output must have same datatype std::string input_dtype, output_dtype; RETURN_IF_ERROR(input.MemberAsString("data_type", &input_dtype)); RETURN_IF_ERROR(output.MemberAsString("data_type", &output_dtype)); RETURN_ERROR_IF_FALSE( input_dtype == output_dtype, TRITONSERVER_ERROR_INVALID_ARG, std::string("expected input and output datatype to match, got ") + input_dtype + " and " + output_dtype); datatype_ = ModelConfigDataTypeToTritonServerDataType(input_dtype); // Input and output must have same shape. Reshape is not supported // on either input or output so flag an error is the model // configuration uses it. triton::common::TritonJson::Value reshape; RETURN_ERROR_IF_TRUE( input.Find("reshape", &reshape), TRITONSERVER_ERROR_UNSUPPORTED, std::string("reshape not supported for input tensor")); RETURN_ERROR_IF_TRUE( output.Find("reshape", &reshape), TRITONSERVER_ERROR_UNSUPPORTED, std::string("reshape not supported for output tensor")); std::vector input_shape, output_shape; RETURN_IF_ERROR(backend::ParseShape(input, "dims", &input_shape)); RETURN_IF_ERROR(backend::ParseShape(output, "dims", &output_shape)); RETURN_ERROR_IF_FALSE( input_shape == output_shape, TRITONSERVER_ERROR_INVALID_ARG, std::string("expected input and output shape to match, got ") + backend::ShapeToString(input_shape) + " and " + backend::ShapeToString(output_shape)); nb_shape_ = input_shape; return nullptr; // success } extern "C" { // Triton calls TRITONBACKEND_ModelInitialize when a model is loaded // to allow the backend to create any state associated with the model, // and to also examine the model configuration to determine if the // configuration is 
suitable for the backend. Any errors reported by // this function will prevent the model from loading. // TRITONSERVER_Error* TRITONBACKEND_ModelInitialize(TRITONBACKEND_Model* model) { // Create a ModelState object and associate it with the // TRITONBACKEND_Model. If anything goes wrong with initialization // of the model state then an error is returned and Triton will fail // to load the model. ModelState* model_state; RETURN_IF_ERROR(ModelState::Create(model, &model_state)); RETURN_IF_ERROR( TRITONBACKEND_ModelSetState(model, reinterpret_cast(model_state))); return nullptr; // success } // Triton calls TRITONBACKEND_ModelFinalize when a model is no longer // needed. The backend should cleanup any state associated with the // model. This function will not be called until all model instances // of the model have been finalized. // TRITONSERVER_Error* TRITONBACKEND_ModelFinalize(TRITONBACKEND_Model* model) { void* vstate; RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vstate)); ModelState* model_state = reinterpret_cast(vstate); delete model_state; return nullptr; // success } } // extern "C" ///////////// // // ModelInstanceState // // State associated with a model instance. An object of this class is // created and associated with each // TRITONBACKEND_ModelInstance. ModelInstanceState is derived from // BackendModelInstance class provided in the backend utilities that // provides many common functions. // class ModelInstanceState : public BackendModelInstance { public: static TRITONSERVER_Error* Create( ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance, ModelInstanceState** state); virtual ~ModelInstanceState() = default; // Get the state of the model that corresponds to this instance. 
ModelState* StateForModel() const { return model_state_; } private: ModelInstanceState( ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance) : BackendModelInstance(model_state, triton_model_instance), model_state_(model_state) { } ModelState* model_state_; }; TRITONSERVER_Error* ModelInstanceState::Create( ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance, ModelInstanceState** state) { try { *state = new ModelInstanceState(model_state, triton_model_instance); } catch (const BackendModelInstanceException& ex) { RETURN_ERROR_IF_TRUE( ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, std::string("unexpected nullptr in BackendModelInstanceException")); RETURN_IF_ERROR(ex.err_); } return nullptr; // success } extern "C" { // Triton calls TRITONBACKEND_ModelInstanceInitialize when a model // instance is created to allow the backend to initialize any state // associated with the instance. // TRITONSERVER_Error* TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance) { // Get the model state associated with this instance's model. TRITONBACKEND_Model* model; RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceModel(instance, &model)); void* vmodelstate; RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vmodelstate)); ModelState* model_state = reinterpret_cast(vmodelstate); // Create a ModelInstanceState object and associate it with the // TRITONBACKEND_ModelInstance. ModelInstanceState* instance_state; RETURN_IF_ERROR( ModelInstanceState::Create(model_state, instance, &instance_state)); RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceSetState( instance, reinterpret_cast(instance_state))); return nullptr; // success } // Triton calls TRITONBACKEND_ModelInstanceFinalize when a model // instance is no longer needed. The backend should cleanup any state // associated with the model instance. 
// TRITONSERVER_Error* TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance) { void* vstate; RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(instance, &vstate)); ModelInstanceState* instance_state = reinterpret_cast(vstate); delete instance_state; return nullptr; // success } } // extern "C" ///////////// extern "C" { // When Triton calls TRITONBACKEND_ModelInstanceExecute it is required // that a backend create a response for each request in the batch. A // response may be the output tensors required for that request or may // be an error that is returned in the response. // TRITONSERVER_Error* TRITONBACKEND_ModelInstanceExecute( TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests, const uint32_t request_count) { // Collect various timestamps during the execution of this batch or // requests. These values are reported below before returning from // the function. uint64_t exec_start_ns = 0; SET_TIMESTAMP(exec_start_ns); // Triton will not call this function simultaneously for the same // 'instance'. But since this backend could be used by multiple // instances from multiple models the implementation needs to handle // multiple calls to this function at the same time (with different // 'instance' objects). Best practice for a high-performance // implementation is to avoid introducing mutex/lock and instead use // only function-local and model-instance-specific state. ModelInstanceState* instance_state; RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState( instance, reinterpret_cast(&instance_state))); ModelState* model_state = instance_state->StateForModel(); // 'responses' is initialized as a parallel array to 'requests', // with one TRITONBACKEND_Response object for each // TRITONBACKEND_Request object. 
If something goes wrong while // creating these response objects, the backend simply returns an // error from TRITONBACKEND_ModelInstanceExecute, indicating to // Triton that this backend did not create or send any responses and // so it is up to Triton to create and send an appropriate error // response for each request. RETURN_IF_ERROR is one of several // useful macros for error handling that can be found in // backend_common.h. std::vector responses; responses.reserve(request_count); for (uint32_t r = 0; r < request_count; ++r) { TRITONBACKEND_Request* request = requests[r]; TRITONBACKEND_Response* response; RETURN_IF_ERROR(TRITONBACKEND_ResponseNew(&response, request)); responses.push_back(response); } // At this point, the backend takes ownership of 'requests', which // means that it is responsible for sending a response for every // request. From here, even if something goes wrong in processing, // the backend must return 'nullptr' from this function to indicate // success. Any errors and failures must be communicated via the // response objects. // // To simplify error handling, the backend utilities manage // 'responses' in a specific way and it is recommended that backends // follow this same pattern. When an error is detected in the // processing of a request, an appropriate error response is sent // and the corresponding TRITONBACKEND_Response object within // 'responses' is set to nullptr to indicate that the // request/response has already been handled and no further processing // should be performed for that request. Even if all responses fail, // the backend still allows execution to flow to the end of the // function so that statistics are correctly reported by the calls // to TRITONBACKEND_ModelInstanceReportStatistics and // TRITONBACKEND_ModelInstanceReportBatchStatistics. // RESPOND_AND_SET_NULL_IF_ERROR, and // RESPOND_ALL_AND_SET_NULL_IF_ERROR are macros from // backend_common.h that assist in this management of response // objects. 
// The backend could iterate over the 'requests' and process each // one separately. But for performance reasons it is usually // preferred to create batched input tensors that are processed // simultaneously. This is especially true for devices like GPUs // that are capable of exploiting the large amount parallelism // exposed by larger data sets. // // The backend utilities provide a "collector" to facilitate this // batching process. The 'collector's ProcessTensor function will // combine a tensor's value from each request in the batch into a // single contiguous buffer. The buffer can be provided by the // backend or 'collector' can create and manage it. In this backend, // there is not a specific buffer into which the batch should be // created, so use ProcessTensor arguments that cause collector to // manage it. ProcessTensor does NOT support TRITONSERVER_TYPE_BYTES // data type. BackendInputCollector collector( requests, request_count, &responses, model_state->TritonMemoryManager(), false /* pinned_enabled */, nullptr /* stream*/); // To instruct ProcessTensor to "gather" the entire batch of input // tensors into a single contiguous buffer in CPU memory, set the // "allowed input types" to be the CPU ones (see tritonserver.h in // the triton-inference-server/core repo for allowed memory types). std::vector> allowed_input_types = {{TRITONSERVER_MEMORY_CPU_PINNED, 0}, {TRITONSERVER_MEMORY_CPU, 0}}; const char* input_buffer; size_t input_buffer_byte_size; TRITONSERVER_MemoryType input_buffer_memory_type; int64_t input_buffer_memory_type_id; RESPOND_ALL_AND_SET_NULL_IF_ERROR( responses, request_count, collector.ProcessTensor( model_state->InputTensorName().c_str(), nullptr /* existing_buffer */, 0 /* existing_buffer_byte_size */, allowed_input_types, &input_buffer, &input_buffer_byte_size, &input_buffer_memory_type, &input_buffer_memory_type_id)); // Finalize the collector. 
If 'true' is returned, 'input_buffer' // will not be valid until the backend synchronizes the CUDA // stream or event that was used when creating the collector. For // this backend, GPU is not supported and so no CUDA sync should // be needed; so if 'true' is returned simply log an error. const bool need_cuda_input_sync = collector.Finalize(); if (need_cuda_input_sync) { LOG_MESSAGE( TRITONSERVER_LOG_ERROR, "'recommended' backend: unexpected CUDA sync required by collector"); } // 'input_buffer' contains the batched input tensor. The backend can // implement whatever logic is necessary to produce the output // tensor. This backend simply logs the input tensor value and then // returns the input tensor value in the output tensor so no actual // computation is needed. uint64_t compute_start_ns = 0; SET_TIMESTAMP(compute_start_ns); LOG_MESSAGE( TRITONSERVER_LOG_INFO, (std::string("model ") + model_state->Name() + ": requests in batch " + std::to_string(request_count)) .c_str()); std::string tstr; IGNORE_ERROR(BufferAsTypedString( tstr, input_buffer, input_buffer_byte_size, model_state->TensorDataType())); LOG_MESSAGE( TRITONSERVER_LOG_INFO, (std::string("batched " + model_state->InputTensorName() + " value: ") + tstr) .c_str()); const char* output_buffer = input_buffer; TRITONSERVER_MemoryType output_buffer_memory_type = input_buffer_memory_type; int64_t output_buffer_memory_type_id = input_buffer_memory_type_id; uint64_t compute_end_ns = 0; SET_TIMESTAMP(compute_end_ns); bool supports_first_dim_batching; RESPOND_ALL_AND_SET_NULL_IF_ERROR( responses, request_count, model_state->SupportsFirstDimBatching(&supports_first_dim_batching)); std::vector tensor_shape; RESPOND_ALL_AND_SET_NULL_IF_ERROR( responses, request_count, model_state->TensorShape(tensor_shape)); // Because the output tensor values are concatenated into a single // contiguous 'output_buffer', the backend must "scatter" them out // to the individual response output tensors. 
The backend utilities // provide a "responder" to facilitate this scattering process. // BackendOutputResponder does NOT support TRITONSERVER_TYPE_BYTES // data type. // The 'responders's ProcessTensor function will copy the portion of // 'output_buffer' corresponding to each request's output into the // response for that request. BackendOutputResponder responder( requests, request_count, &responses, model_state->TritonMemoryManager(), supports_first_dim_batching, false /* pinned_enabled */, nullptr /* stream*/); responder.ProcessTensor( model_state->OutputTensorName().c_str(), model_state->TensorDataType(), tensor_shape, output_buffer, output_buffer_memory_type, output_buffer_memory_type_id); // Finalize the responder. If 'true' is returned, the output // tensors' data will not be valid until the backend synchronizes // the CUDA stream or event that was used when creating the // responder. For this backend, GPU is not supported and so no CUDA // sync should be needed; so if 'true' is returned simply log an // error. const bool need_cuda_output_sync = responder.Finalize(); if (need_cuda_output_sync) { LOG_MESSAGE( TRITONSERVER_LOG_ERROR, "'recommended' backend: unexpected CUDA sync required by responder"); } // Send all the responses that haven't already been sent because of // an earlier error. for (auto& response : responses) { if (response != nullptr) { LOG_IF_ERROR( TRITONBACKEND_ResponseSend( response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, nullptr), "failed to send response"); } } uint64_t exec_end_ns = 0; SET_TIMESTAMP(exec_end_ns); #ifdef TRITON_ENABLE_STATS // For batch statistics need to know the total batch size of the // requests. This is not necessarily just the number of requests, // because if the model supports batching then any request can be a // batched request itself. 
size_t total_batch_size = 0; if (!supports_first_dim_batching) { total_batch_size = request_count; } else { for (uint32_t r = 0; r < request_count; ++r) { auto& request = requests[r]; TRITONBACKEND_Input* input = nullptr; LOG_IF_ERROR( TRITONBACKEND_RequestInputByIndex(request, 0 /* index */, &input), "failed getting request input"); if (input != nullptr) { const int64_t* shape = nullptr; LOG_IF_ERROR( TRITONBACKEND_InputProperties( input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr), "failed getting input properties"); if (shape != nullptr) { total_batch_size += shape[0]; } } } } #else (void)exec_start_ns; (void)exec_end_ns; (void)compute_start_ns; (void)compute_end_ns; #endif // TRITON_ENABLE_STATS // Report statistics for each request, and then release the request. for (uint32_t r = 0; r < request_count; ++r) { auto& request = requests[r]; #ifdef TRITON_ENABLE_STATS LOG_IF_ERROR( TRITONBACKEND_ModelInstanceReportStatistics( instance_state->TritonModelInstance(), request, (responses[r] != nullptr) /* success */, exec_start_ns, compute_start_ns, compute_end_ns, exec_end_ns), "failed reporting request statistics"); #endif // TRITON_ENABLE_STATS LOG_IF_ERROR( TRITONBACKEND_RequestRelease(request, TRITONSERVER_REQUEST_RELEASE_ALL), "failed releasing request"); } #ifdef TRITON_ENABLE_STATS // Report batch statistics. LOG_IF_ERROR( TRITONBACKEND_ModelInstanceReportBatchStatistics( instance_state->TritonModelInstance(), total_batch_size, exec_start_ns, compute_start_ns, compute_end_ns, exec_end_ns), "failed reporting batch request statistics"); #endif // TRITON_ENABLE_STATS return nullptr; // success } } // extern "C" }}} // namespace triton::backend::recommended ================================================ FILE: examples/batching_strategies/single_batching/CMakeLists.txt ================================================ # Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
cmake_minimum_required(VERSION 3.31.8)

project(singlebatching LANGUAGES C CXX)

# Locations/tags of the Triton repos this example builds against; all are
# cache variables so they can be overridden on the cmake command line.
set(TRITON_REPO_ORGANIZATION "https://github.com/triton-inference-server" CACHE STRING "Git repository to pull from")
set(TRITON_COMMON_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/common repo")
set(TRITON_CORE_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/core repo")
set(TRITON_BACKEND_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/backend repo")

#
# Setting C++ min standard
#
set(TRITON_MIN_CXX_STANDARD 17 CACHE STRING "The minimum C++ standard whose features are requested to build this target.")

if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release)
endif()

#
# Dependencies
#
# FetchContent requires us to include the transitive closure of all
# repos that we depend on so that we can override the tags.
#
include(FetchContent)

FetchContent_Declare(
  repo-common
  GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/common.git
  GIT_TAG ${TRITON_COMMON_REPO_TAG}
  GIT_SHALLOW ON
)
FetchContent_Declare(
  repo-core
  GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/core.git
  GIT_TAG ${TRITON_CORE_REPO_TAG}
  GIT_SHALLOW ON
)
FetchContent_Declare(
  repo-backend
  GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/backend.git
  GIT_TAG ${TRITON_BACKEND_REPO_TAG}
  GIT_SHALLOW ON
)
FetchContent_MakeAvailable(repo-common repo-core repo-backend)

#
# The backend must be built into a shared library. Use an ldscript to
# hide all symbols except for the TRITONBACKEND_ModelBatch API.
#
configure_file(src/libtriton_singlebatching.ldscript libtriton_singlebatching.ldscript COPYONLY)

add_library(
  triton-single-batching SHARED
  src/single_batching.cc
)

target_include_directories(
  triton-single-batching
  PRIVATE
    ${CMAKE_CURRENT_SOURCE_DIR}/src
)

target_compile_features(triton-single-batching PRIVATE cxx_std_${TRITON_MIN_CXX_STANDARD})
# NOTE(review): the compiler-ID generator expressions below were mangled by
# extraction (the "<...>" contents were stripped); restored to the standard
# Triton warning-flag selection for GCC/Clang vs MSVC.
target_compile_options(
  triton-single-batching PRIVATE
  $<$<OR:$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>,$<CXX_COMPILER_ID:GNU>>:
    -Wall -Wextra -Wno-unused-parameter -Wno-type-limits -Werror>
  $<$<CXX_COMPILER_ID:MSVC>:/Wall /D_WIN32_WINNT=0x0A00 /EHsc /Zc:preprocessor>
)

target_link_libraries(
  triton-single-batching
  PRIVATE
    triton-core-serverapi   # from repo-core
    triton-core-backendapi  # from repo-core
    triton-core-serverstub  # from repo-core
    triton-backend-utils    # from repo-backend
)

if(WIN32)
  set_target_properties(
    triton-single-batching
    PROPERTIES
      POSITION_INDEPENDENT_CODE ON
      OUTPUT_NAME triton_singlebatching
  )
else()
  # On non-Windows, use the ldscript so only TRITONBACKEND_ModelBatch*
  # symbols are exported from the shared library.
  set_target_properties(
    triton-single-batching
    PROPERTIES
      POSITION_INDEPENDENT_CODE ON
      OUTPUT_NAME triton_singlebatching
      LINK_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libtriton_singlebatching.ldscript
      LINK_FLAGS "-Wl,--version-script libtriton_singlebatching.ldscript"
  )
endif()

#
# Install
#
include(GNUInstallDirs)
set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/SingleBatching)

install(
  TARGETS
    triton-single-batching
  EXPORT
    triton-single-batching-targets
  LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/batching/single_batching
  RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/batching/single_batching
)

# NOTE(review): export NAMESPACE values conventionally end with "::"
# (e.g. triton-single-batching::) — confirm against upstream.
install(
  EXPORT
    triton-single-batching-targets
  FILE
    SingleBatchingTargets.cmake
  NAMESPACE
    triton-single-batching
  DESTINATION
    ${INSTALL_CONFIGDIR}
)

include(CMakePackageConfigHelpers)
configure_package_config_file(
  ${CMAKE_CURRENT_LIST_DIR}/cmake/triton-single-batching.cmake.in
  ${CMAKE_CURRENT_BINARY_DIR}/triton-single-batching.cmake
  INSTALL_DESTINATION ${INSTALL_CONFIGDIR}
)

install(
  FILES
    ${CMAKE_CURRENT_BINARY_DIR}/triton-single-batching.cmake
  DESTINATION ${INSTALL_CONFIGDIR}
)

#
# Export from build tree
# export( EXPORT triton-single-batching-targets FILE ${CMAKE_CURRENT_BINARY_DIR}/triton-single-batching.cmake NAMESPACE triton-single-batching ) export(PACKAGE triton-single-batching) ================================================ FILE: examples/batching_strategies/single_batching/cmake/triton-single-batching.cmake.in ================================================ # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
include(CMakeFindDependencyMacro) get_filename_component( SINGLEBATCHING_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH ) list(APPEND CMAKE_MODULE_PATH ${SINGLEBATCHING_CMAKE_DIR}) if(NOT TARGET triton-single-batching) include("${SINGLEBATCHING_CMAKE_DIR}/SingleBatchingTargets.cmake") endif() set(SINGLEBATCHING_LIBRARIES triton-single-batching) ================================================ FILE: examples/batching_strategies/single_batching/src/libtriton_singlebatching.ldscript ================================================ # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. { global: TRITONBACKEND_ModelBatch*; local: *; }; ================================================ FILE: examples/batching_strategies/single_batching/src/single_batching.cc ================================================ // Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include "triton/core/tritonbackend.h" namespace triton { namespace core { namespace single_batching { // // Minimal custom batching strategy that demonstrates the // TRITONBACKEND_ModelBatch API. This custom batching strategy dynamically // creates batches up to 1 request. // ///////////// extern "C" { /// Check whether a request should be added to the pending model batch. /// /// \param request The request to be added to the pending batch. /// \param userp The placeholder for backend to store and retrieve information /// about this pending batch. When the callback returns, this should reflect /// the latest batch information. /// \param should_include The pointer to be updated on whether the request /// should be included in the batch. /// \return a TRITONSERVER_Error indicating success or failure. TRITONSERVER_Error* TRITONBACKEND_ModelBatchIncludeRequest( TRITONBACKEND_Request* request, void* userp, bool* should_include) { // Check if the batch is empty. // If so, include this request. Otherwise, do not. bool* empty = static_cast(userp); if (*empty) { *should_include = true; *empty = false; } else { *should_include = false; } return nullptr; // success } /// Callback to be invoked when Triton has begun forming a batch. /// /// \param batcher The read-only placeholder for backend to retrieve // information about the batching strategy for this model. 
/// \param userp The placeholder for backend to store and retrieve information /// about this pending batch. /// \return a TRITONSERVER_Error indicating success or failure. TRITONSERVER_Error* TRITONBACKEND_ModelBatchInitialize( const TRITONBACKEND_Batcher* batcher, void** userp) { // Userp will point to a boolean indicating whether the batch is empty. *userp = new bool(true); return nullptr; // success } /// Callback to be invoked when Triton has finishing forming a batch. /// /// \param userp The placeholder for backend to store and retrieve information /// about this pending batch. /// \return a TRITONSERVER_Error indicating success or failure. TRITONSERVER_Error* TRITONBACKEND_ModelBatchFinalize(void* userp) { delete static_cast(userp); return nullptr; // success } /// Create a new batcher for use with custom batching. This is called during /// model loading. The batcher will point to a user-defined data structure that /// holds read-only data used for custom batching. /// /// \param batcher User-defined placeholder for backend to store and /// retrieve information about the batching strategy for this model. /// return a TRITONSERVER_Error indicating success or failure. /// \param model The backend model for which Triton is forming a batch. /// \return a TRITONSERVER_Error indicating success or failure. TRITONSERVER_Error* TRITONBACKEND_ModelBatcherInitialize( TRITONBACKEND_Batcher** batcher, TRITONBACKEND_Model* model) { return nullptr; // success } /// Free memory associated with batcher. This is called during model unloading. /// /// \param batcher User-defined placeholder for backend to store and /// retrieve information about the batching strategy for this model. /// \return a TRITONSERVER_Error indicating success or failure. 
TRITONSERVER_Error*
TRITONBACKEND_ModelBatcherFinalize(TRITONBACKEND_Batcher* batcher)
{
  // TRITONBACKEND_ModelBatcherInitialize allocates no batcher state for this
  // strategy, so there is nothing to free here.
  return nullptr;  // success
}

}  // extern "C"

}}}  // namespace triton::core::single_batching


================================================
FILE: examples/batching_strategies/volume_batching/CMakeLists.txt
================================================
# Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#   * Redistributions of source code must retain the above copyright
#     notice, this list of conditions and the following disclaimer.
#   * Redistributions in binary form must reproduce the above copyright
#     notice, this list of conditions and the following disclaimer in the
#     documentation and/or other materials provided with the distribution.
#   * Neither the name of NVIDIA CORPORATION nor the names of its
#     contributors may be used to endorse or promote products derived
#     from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmake_minimum_required(VERSION 3.31.8)

project(volumebatching LANGUAGES C CXX)

# Locations/tags of the Triton repos this example builds against; all are
# cache variables so they can be overridden on the cmake command line.
set(TRITON_REPO_ORGANIZATION "https://github.com/triton-inference-server" CACHE STRING "Git repository to pull from")
set(TRITON_COMMON_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/common repo")
set(TRITON_CORE_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/core repo")
set(TRITON_BACKEND_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/backend repo")

#
# Setting C++ min standard
#
set(TRITON_MIN_CXX_STANDARD 17 CACHE STRING "The minimum C++ standard whose features are requested to build this target.")

if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release)
endif()

#
# Dependencies
#
# FetchContent requires us to include the transitive closure of all
# repos that we depend on so that we can override the tags.
#
include(FetchContent)

FetchContent_Declare(
  repo-common
  GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/common.git
  GIT_TAG ${TRITON_COMMON_REPO_TAG}
  GIT_SHALLOW ON
)
FetchContent_Declare(
  repo-core
  GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/core.git
  GIT_TAG ${TRITON_CORE_REPO_TAG}
  GIT_SHALLOW ON
)
FetchContent_Declare(
  repo-backend
  GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/backend.git
  GIT_TAG ${TRITON_BACKEND_REPO_TAG}
  GIT_SHALLOW ON
)
FetchContent_MakeAvailable(repo-common repo-core repo-backend)

#
# The backend must be built into a shared library. Use an ldscript to
# hide all symbols except for the TRITONBACKEND_ModelBatch API.
#
configure_file(src/libtriton_volumebatching.ldscript libtriton_volumebatching.ldscript COPYONLY)

add_library(
  triton-volume-batching SHARED
  src/volume_batching.cc
)

target_include_directories(
  triton-volume-batching
  PRIVATE
    ${CMAKE_CURRENT_SOURCE_DIR}/src
)

target_compile_features(triton-volume-batching PRIVATE cxx_std_${TRITON_MIN_CXX_STANDARD})
# NOTE(review): the compiler-ID generator expressions below were mangled by
# extraction (the "<...>" contents were stripped); restored to the standard
# Triton warning-flag selection for GCC/Clang vs MSVC.
target_compile_options(
  triton-volume-batching PRIVATE
  $<$<OR:$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>,$<CXX_COMPILER_ID:GNU>>:
    -Wall -Wextra -Wno-unused-parameter -Wno-type-limits -Werror>
  $<$<CXX_COMPILER_ID:MSVC>:/Wall /D_WIN32_WINNT=0x0A00 /EHsc /Zc:preprocessor>
)

target_link_libraries(
  triton-volume-batching
  PRIVATE
    triton-core-serverapi   # from repo-core
    triton-core-backendapi  # from repo-core
    triton-core-serverstub  # from repo-core
    triton-backend-utils    # from repo-backend
)

if(WIN32)
  set_target_properties(
    triton-volume-batching
    PROPERTIES
      POSITION_INDEPENDENT_CODE ON
      OUTPUT_NAME triton_volumebatching
  )
else()
  # On non-Windows, use the ldscript so only TRITONBACKEND_ModelBatch*
  # symbols are exported from the shared library.
  set_target_properties(
    triton-volume-batching
    PROPERTIES
      POSITION_INDEPENDENT_CODE ON
      OUTPUT_NAME triton_volumebatching
      LINK_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libtriton_volumebatching.ldscript
      LINK_FLAGS "-Wl,--version-script libtriton_volumebatching.ldscript"
  )
endif()

#
# Install
#
include(GNUInstallDirs)
set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/volumeBatching)

install(
  TARGETS
    triton-volume-batching
  EXPORT
    triton-volume-batching-targets
  LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/batching/volume_batching
  RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/batching/volume_batching
)

# NOTE(review): export NAMESPACE values conventionally end with "::"
# (e.g. triton-volume-batching::) — confirm against upstream.
install(
  EXPORT
    triton-volume-batching-targets
  FILE
    VolumeBatchingTargets.cmake
  NAMESPACE
    triton-volume-batching
  DESTINATION
    ${INSTALL_CONFIGDIR}
)

include(CMakePackageConfigHelpers)
configure_package_config_file(
  ${CMAKE_CURRENT_LIST_DIR}/cmake/triton-volume-batching.cmake.in
  ${CMAKE_CURRENT_BINARY_DIR}/triton-volume-batching.cmake
  INSTALL_DESTINATION ${INSTALL_CONFIGDIR}
)

install(
  FILES
    ${CMAKE_CURRENT_BINARY_DIR}/triton-volume-batching.cmake
  DESTINATION ${INSTALL_CONFIGDIR}
)

#
# Export from build tree
# export( EXPORT triton-volume-batching-targets FILE ${CMAKE_CURRENT_BINARY_DIR}/triton-volume-batching.cmake NAMESPACE triton-volume-batching ) export(PACKAGE triton-volume-batching) ================================================ FILE: examples/batching_strategies/volume_batching/cmake/triton-volume-batching.cmake.in ================================================ # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
include(CMakeFindDependencyMacro) get_filename_component( VOLUMEBATCHING_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH ) list(APPEND CMAKE_MODULE_PATH ${VOLUMEBATCHING_CMAKE_DIR}) if(NOT TARGET triton-volume-batching) include("${VOLUMEBATCHING_CMAKE_DIR}/VolumeBatchingTargets.cmake") endif() set(VOLUMEBATCHING_LIBRARIES triton-volume-batching) ================================================ FILE: examples/batching_strategies/volume_batching/src/libtriton_volumebatching.ldscript ================================================ # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. { global: TRITONBACKEND_ModelBatch*; local: *; }; ================================================ FILE: examples/batching_strategies/volume_batching/src/volume_batching.cc ================================================ // Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include "triton/core/tritonbackend.h" #define TRITONJSON_STATUSTYPE TRITONSERVER_Error* #define TRITONJSON_STATUSRETURN(M) \ return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, (M).c_str()) #define TRITONJSON_STATUSSUCCESS nullptr #include "triton/common/triton_json.h" namespace triton { namespace core { namespace volume_batching { // // Minimal custom batching strategy that demonstrates the // TRITONBACKEND_ModelBatch API. This custom batching strategy dynamically // creates batches up to 1 request. // ///////////// extern "C" { /// Check whether a request should be added to the pending model batch. /// /// \param request The request to be added to the pending batch. /// \param userp The placeholder for backend to store and retrieve information /// about this pending batch. When the callback returns, this should reflect /// the latest batch information. /// \param should_include The pointer to be updated on whether the request /// should be included in the batch. /// \return a TRITONSERVER_Error indicating success or failure. TRITONSERVER_Error* TRITONBACKEND_ModelBatchIncludeRequest( TRITONBACKEND_Request* request, void* userp, bool* should_include) { // Default should_include to false in case function returns error. *should_include = false; // Get current remaining batch volume. unsigned int* remaining_volume = static_cast(userp); // Get request's volume in bytes. 
unsigned int pending_volume = 0; uint32_t input_count; auto err = TRITONBACKEND_RequestInputCount(request, &input_count); if (err) return err; TRITONBACKEND_Input* input; size_t data_byte_size; for (size_t count = 0; count < input_count; count++) { auto err = TRITONBACKEND_RequestInputByIndex(request, count /* index */, &input); if (err) return err; err = TRITONBACKEND_InputProperties( input, nullptr, nullptr, nullptr, nullptr, &data_byte_size, nullptr); if (err) return err; pending_volume += static_cast(data_byte_size); } // Print remaining volume for debugging purposes. std::cout << "Pending volume : " << pending_volume << std::endl; std::cout << "Remaining volume : " << *remaining_volume << std::endl; // Check if there is enough remaining volume for this request. // If so, include this request. Otherwise, do not. if (pending_volume <= *remaining_volume) { *should_include = true; *remaining_volume = *remaining_volume - pending_volume; } else { *should_include = false; } return nullptr; // success } /// Callback to be invoked when Triton has begun forming a batch. /// /// \param batcher The read-only placeholder for backend to retrieve // information about the batching strategy for this model. /// \param userp The placeholder for backend to store and retrieve information /// about this pending batch. /// \return a TRITONSERVER_Error indicating success or failure. TRITONSERVER_Error* TRITONBACKEND_ModelBatchInitialize( const TRITONBACKEND_Batcher* batcher, void** userp) { // Userp will point to an unsigned integer representing the remaining volume // in bytes for this batch. *userp = new unsigned int(*reinterpret_cast(batcher)); return nullptr; // success } /// Callback to be invoked when Triton has finishing forming a batch. /// /// \param userp The placeholder for backend to store and retrieve information /// about this pending batch. /// \return a TRITONSERVER_Error indicating success or failure. 
TRITONSERVER_Error* TRITONBACKEND_ModelBatchFinalize(void* userp) { delete static_cast(userp); return nullptr; // success } /// Create a new batcher for use with custom batching. This is called during /// model loading. The batcher will point to a user-defined data structure that /// holds read-only data used for custom batching. /// /// \param batcher User-defined placeholder for backend to store and /// retrieve information about the batching strategy for this model. /// return a TRITONSERVER_Error indicating success or failure. /// \param model The backend model for which Triton is forming a batch. /// \return a TRITONSERVER_Error indicating success or failure. TRITONSERVER_Error* TRITONBACKEND_ModelBatcherInitialize( TRITONBACKEND_Batcher** batcher, TRITONBACKEND_Model* model) { // Batcher will point to an unsigned integer representing the maximum // volume in bytes for each batch. // Read the user-specified bytes from the model config. TRITONSERVER_Message* config_message; TRITONBACKEND_ModelConfig(model, 1 /* config_version */, &config_message); const char* buffer; size_t byte_size; uint64_t max_volume_bytes = 0; std::string max_volume_bytes_str; auto err = TRITONSERVER_MessageSerializeToJson(config_message, &buffer, &byte_size); if (err) return err; triton::common::TritonJson::Value model_config, params, volume_param; err = model_config.Parse(buffer, byte_size); TRITONSERVER_MessageDelete(config_message); if (!model_config.Find("parameters", ¶ms)) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_NOT_FOUND, "Unable to find parameters in model config"); } std::vector param_keys; if (!params.Find("MAX_BATCH_VOLUME_BYTES", &volume_param)) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_NOT_FOUND, "Unable to find MAX_BATCH_VOLUME_BYTES parameter in model config"); } err = volume_param.MemberAsString("string_value", &max_volume_bytes_str); if (err) return err; try { max_volume_bytes = static_cast(std::stoul(max_volume_bytes_str)); } catch (const 
std::invalid_argument& ia) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INVALID_ARG, (std::string("failed to convert '") + max_volume_bytes_str + "' to unsigned int64") .c_str()); } *batcher = reinterpret_cast( new unsigned int(max_volume_bytes)); return nullptr; // success } /// Free memory associated with batcher. This is called during model unloading. /// /// \param batcher User-defined placeholder for backend to store and /// retrieve information about the batching strategy for this model. /// \return a TRITONSERVER_Error indicating success or failure. TRITONSERVER_Error* TRITONBACKEND_ModelBatcherFinalize(TRITONBACKEND_Batcher* batcher) { delete reinterpret_cast(batcher); return nullptr; // success } } // extern "C" }}} // namespace triton::core::volume_batching ================================================ FILE: examples/clients/bls_client ================================================ #!/usr/bin/python # Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import argparse import sys import numpy as np import tritonhttpclient as httpclient from tritonclientutils import np_to_triton_dtype if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( "-u", "--url", type=str, required=False, default="localhost:8000", help="Inference server URL. Default is localhost:8000.", ) FLAGS = parser.parse_args() model_name = "bls_fp32" shape = [16] with httpclient.InferenceServerClient(url=FLAGS.url) as client: input0_data = np.random.rand(*shape).astype(np.float32) input1_data = np.random.rand(*shape).astype(np.float32) inputs = [ httpclient.InferInput( "INPUT0", input0_data.shape, np_to_triton_dtype(input0_data.dtype), ), httpclient.InferInput( "INPUT1", input1_data.shape, np_to_triton_dtype(input1_data.dtype), ), ] inputs[0].set_data_from_numpy(input0_data) inputs[1].set_data_from_numpy(input1_data) outputs = [ httpclient.InferRequestedOutput("OUTPUT0"), httpclient.InferRequestedOutput("OUTPUT1"), ] response = client.infer(model_name, inputs, request_id=str(1), outputs=outputs) result = response.get_response() output0_data = response.as_numpy("OUTPUT0") output1_data = response.as_numpy("OUTPUT1") print( "INPUT0 ({}) + INPUT1 ({}) = OUTPUT0 ({})".format( input0_data, input1_data, output0_data ) ) print( "INPUT0 ({}) - INPUT1 ({}) = OUTPUT1 ({})".format( input0_data, input1_data, output1_data ) ) if not np.allclose(input0_data + input1_data, output0_data): print("error: incorrect sum") 
sys.exit(1) if not np.allclose(input0_data - input1_data, output1_data): print("error: incorrect difference") sys.exit(1) print("\nPASS") sys.exit(0) ================================================ FILE: examples/clients/minimal_client ================================================ #!/usr/bin/env python # Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import argparse import sys import numpy as np import tritonclient.http as httpclient if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( "-u", "--url", type=str, required=False, default="localhost:8000", help="Inference server URL. Default is localhost:8000.", ) FLAGS = parser.parse_args() # For the HTTP client, need to specify large enough concurrency to # issue all the inference requests to the server in parallel. For # this example we want to be able to send 2 requests concurrently. try: concurrent_request_count = 2 triton_client = httpclient.InferenceServerClient( url=FLAGS.url, concurrency=concurrent_request_count ) except Exception as e: print("channel creation failed: " + str(e)) sys.exit(1) # First send a single request to the nonbatching model. print("=========") input0_data = np.array([1, 2, 3, 4], dtype=np.int32) print("Sending request to nonbatching model: IN0 = {}".format(input0_data)) inputs = [httpclient.InferInput("IN0", [4], "INT32")] inputs[0].set_data_from_numpy(input0_data) result = triton_client.infer("nonbatching", inputs) print("Response: {}".format(result.get_response())) print("OUT0 = {}".format(result.as_numpy("OUT0"))) # Send 2 requests to the batching model. Because these are sent # asynchronously and Triton's dynamic batcher is configured to # delay up to 5 seconds when forming a batch for this model, we # expect these 2 requests to be batched within Triton and sent to # the minimal backend as a single batch. 
print("\n=========") async_requests = [] input0_data = np.array([[10, 11, 12, 13]], dtype=np.int32) print("Sending request to batching model: IN0 = {}".format(input0_data)) inputs = [httpclient.InferInput("IN0", [1, 4], "INT32")] inputs[0].set_data_from_numpy(input0_data) async_requests.append(triton_client.async_infer("batching", inputs)) input0_data = np.array([[20, 21, 22, 23]], dtype=np.int32) print("Sending request to batching model: IN0 = {}".format(input0_data)) inputs = [httpclient.InferInput("IN0", [1, 4], "INT32")] inputs[0].set_data_from_numpy(input0_data) async_requests.append(triton_client.async_infer("batching", inputs)) for async_request in async_requests: # Get the result from the initiated asynchronous inference # request. This call will block till the server responds. result = async_request.get_result() print("Response: {}".format(result.get_response())) print("OUT0 = {}".format(result.as_numpy("OUT0"))) ================================================ FILE: examples/clients/recommended_client ================================================ #!/usr/bin/env python # Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. 
# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import argparse import sys import numpy as np import tritonclient.http as httpclient if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( "-u", "--url", type=str, required=False, default="localhost:8000", help="Inference server URL. Default is localhost:8000.", ) FLAGS = parser.parse_args() # For the HTTP client, need to specify large enough concurrency to # issue all the inference requests to the server in parallel. For # this example we want to be able to send 2 requests concurrently. try: concurrent_request_count = 2 triton_client = httpclient.InferenceServerClient( url=FLAGS.url, concurrency=concurrent_request_count ) except Exception as e: print("channel creation failed: " + str(e)) sys.exit(1) # Send 2 requests to the batching model. Because these are sent # asynchronously and Triton's dynamic batcher is configured to # delay up to 5 seconds when forming a batch for this model, we # expect these 2 requests to be batched within Triton and sent to # the backend as a single batch. # # The recommended backend can handle any model with 1 input and 1 # output as long as the input and output datatype and shape are # the same. The batching model uses datatype FP32 and shape # [ 4, 4 ]. 
print("\n=========") async_requests = [] input0_data = np.array( [ [ [1.0, 1.1, 1.2, 1.3], [2.0, 2.1, 2.2, 2.3], [3.0, 3.1, 3.2, 3.3], [4.0, 4.1, 4.2, 4.3], ] ], dtype=np.float32, ) print("Sending request to batching model: input = {}".format(input0_data)) inputs = [httpclient.InferInput("INPUT", [1, 4, 4], "FP32")] inputs[0].set_data_from_numpy(input0_data) async_requests.append(triton_client.async_infer("batching", inputs)) input0_data = np.array( [ [ [10.0, 10.1, 10.2, 10.3], [20.0, 20.1, 20.2, 20.3], [30.0, 30.1, 30.2, 30.3], [40.0, 40.1, 40.2, 40.3], ] ], dtype=np.float32, ) print("Sending request to batching model: input = {}".format(input0_data)) inputs = [httpclient.InferInput("INPUT", [1, 4, 4], "FP32")] inputs[0].set_data_from_numpy(input0_data) async_requests.append(triton_client.async_infer("batching", inputs)) for async_request in async_requests: # Get the result from the initiated asynchronous inference # request. This call will block till the server responds. result = async_request.get_result() print("Response: {}".format(result.get_response())) print("OUTPUT = {}".format(result.as_numpy("OUTPUT"))) ================================================ FILE: examples/model_repos/bls_models/addsub_onnx/1/model.onnx ================================================  triton:  INPUT0_INPUT0"Identity  INPUT1_INPUT1"Identity  _INPUT0 _INPUT1CAST0"Add  _INPUT0 _INPUT1CAST1"Sub ! CAST0OUTPUT0"Cast* to ! 
CAST1OUTPUT1"Cast* to$onnx_nobatch_float32_float32_float32Z INPUT0  Z INPUT1  b OUTPUT0  b OUTPUT1  B ================================================ FILE: examples/model_repos/bls_models/addsub_onnx/config.pbtxt ================================================ name: "addsub_onnx" platform: "onnxruntime_onnx" max_batch_size: 0 input [ { name: "INPUT0" data_type: TYPE_FP32 dims: [ 16 ] }, { name: "INPUT1" data_type: TYPE_FP32 dims: [ 16 ] } ] output [ { name: "OUTPUT0" data_type: TYPE_FP32 dims: [ 16 ] }, { name: "OUTPUT1" data_type: TYPE_FP32 dims: [ 16 ] } ] ================================================ FILE: examples/model_repos/bls_models/addsub_python/1/model.py ================================================ # Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import json import triton_python_backend_utils as pb_utils # This model calculates the sum and difference of the INPUT0 and INPUT1 and put # the results in OUTPUT0 and OUTPUT1 respectively. For more information # regarding how this model.py was written, please refer to Python Backend. class TritonPythonModel: def initialize(self, args): self.model_config = model_config = json.loads(args["model_config"]) output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0") output1_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT1") self.output0_dtype = pb_utils.triton_string_to_numpy( output0_config["data_type"] ) self.output1_dtype = pb_utils.triton_string_to_numpy( output1_config["data_type"] ) def execute(self, requests): output0_dtype = self.output0_dtype output1_dtype = self.output1_dtype responses = [] for request in requests: in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1") out_0, out_1 = ( in_0.as_numpy() + in_1.as_numpy(), in_0.as_numpy() - in_1.as_numpy(), ) out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0.astype(output0_dtype)) out_tensor_1 = pb_utils.Tensor("OUTPUT1", out_1.astype(output1_dtype)) inference_response = pb_utils.InferenceResponse( output_tensors=[out_tensor_0, out_tensor_1] ) responses.append(inference_response) return responses def finalize(self): print("Cleaning up...") ================================================ FILE: 
examples/model_repos/bls_models/addsub_python/config.pbtxt ================================================ # Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
name: "addsub_python" backend: "python" max_batch_size: 0 input [ { name: "INPUT0" data_type: TYPE_FP32 dims: [ 16 ] } ] input [ { name: "INPUT1" data_type: TYPE_FP32 dims: [ 16 ] } ] output [ { name: "OUTPUT0" data_type: TYPE_FP32 dims: [ 16 ] } ] output [ { name: "OUTPUT1" data_type: TYPE_FP32 dims: [ 16 ] } ] ================================================ FILE: examples/model_repos/bls_models/bls_fp32/config.pbtxt ================================================ # Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
name: "bls_fp32" backend: "bls" max_batch_size: 0 input [ { name: "INPUT0" data_type: TYPE_FP32 dims: [ 16 ] } ] input [ { name: "INPUT1" data_type: TYPE_FP32 dims: [ 16 ] } ] output [ { name: "OUTPUT0" data_type: TYPE_FP32 dims: [ 16 ] } ] output [ { name: "OUTPUT1" data_type: TYPE_FP32 dims: [ 16 ] } ] instance_group [ { kind: KIND_CPU } ] ================================================ FILE: examples/model_repos/minimal_models/batching/1/.gitkeep ================================================ ================================================ FILE: examples/model_repos/minimal_models/batching/config.pbtxt ================================================ backend: "minimal" max_batch_size: 8 dynamic_batching { max_queue_delay_microseconds: 5000000 } input [ { name: "IN0" data_type: TYPE_INT32 dims: [ 4 ] } ] output [ { name: "OUT0" data_type: TYPE_INT32 dims: [ 4 ] } ] instance_group [ { kind: KIND_CPU } ] ================================================ FILE: examples/model_repos/minimal_models/nonbatching/1/.gitkeep ================================================ ================================================ FILE: examples/model_repos/minimal_models/nonbatching/config.pbtxt ================================================ backend: "minimal" max_batch_size: 0 input [ { name: "IN0" data_type: TYPE_INT32 dims: [ 4 ] } ] output [ { name: "OUT0" data_type: TYPE_INT32 dims: [ 4 ] } ] instance_group [ { kind: KIND_CPU } ] ================================================ FILE: examples/model_repos/recommended_models/batching/1/.gitkeep ================================================ ================================================ FILE: examples/model_repos/recommended_models/batching/config.pbtxt ================================================ backend: "recommended" max_batch_size: 8 dynamic_batching { max_queue_delay_microseconds: 5000000 } input [ { name: "INPUT" data_type: TYPE_FP32 dims: [ 4, 4 ] } ] output [ { name: "OUTPUT" data_type: TYPE_FP32 dims: [ 4, 
4 ] } ] instance_group [ { kind: KIND_CPU } ] ================================================ FILE: include/triton/backend/backend_common.h ================================================ // Copyright 2020-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#pragma once #include #include #include #include #include #include #include #include #include #include "triton/common/error.h" #include "triton/core/tritonbackend.h" #define TRITONJSON_STATUSTYPE TRITONSERVER_Error* #define TRITONJSON_STATUSRETURN(M) \ return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, (M).c_str()) #define TRITONJSON_STATUSSUCCESS nullptr #include "triton/common/triton_json.h" #ifdef TRITON_ENABLE_GPU #include #endif // TRITON_ENABLE_GPU namespace triton { namespace backend { #define IGNORE_ERROR(X) \ do { \ TRITONSERVER_Error* ie_err__ = (X); \ if (ie_err__ != nullptr) { \ TRITONSERVER_ErrorDelete(ie_err__); \ } \ } while (false) #define LOG_IF_ERROR(X, MSG) \ do { \ TRITONSERVER_Error* lie_err__ = (X); \ if (lie_err__ != nullptr) { \ IGNORE_ERROR(TRITONSERVER_LogMessage( \ TRITONSERVER_LOG_ERROR, __FILE__, __LINE__, \ (std::string(MSG) + ": " + TRITONSERVER_ErrorCodeString(lie_err__) + \ " - " + TRITONSERVER_ErrorMessage(lie_err__)) \ .c_str())); \ TRITONSERVER_ErrorDelete(lie_err__); \ } \ } while (false) #define LOG_MESSAGE(LEVEL, MSG) \ do { \ LOG_IF_ERROR( \ TRITONSERVER_LogMessage(LEVEL, __FILE__, __LINE__, MSG), \ ("failed to log message: ")); \ } while (false) #define RETURN_ERROR_IF_FALSE(P, C, MSG) \ do { \ if (!(P)) { \ return TRITONSERVER_ErrorNew(C, (MSG).c_str()); \ } \ } while (false) #define RETURN_ERROR_IF_TRUE(P, C, MSG) \ do { \ if ((P)) { \ return TRITONSERVER_ErrorNew(C, (MSG).c_str()); \ } \ } while (false) #define RETURN_IF_ERROR(X) \ do { \ TRITONSERVER_Error* rie_err__ = (X); \ if (rie_err__ != nullptr) { \ return rie_err__; \ } \ } while (false) #ifdef TRITON_ENABLE_GPU #define LOG_IF_CUDA_ERROR(X, MSG) \ do { \ cudaError_t lice_err__ = (X); \ if (lice_err__ != cudaSuccess) { \ IGNORE_ERROR(TRITONSERVER_LogMessage( \ TRITONSERVER_LOG_INFO, __FILE__, __LINE__, \ (std::string(MSG) + ": " + cudaGetErrorString(lice_err__)) \ .c_str())); \ } \ } while (false) #define RETURN_IF_CUDA_ERROR(X, C, MSG) \ do { \ cudaError_t 
rice_err__ = (X); \ if (rice_err__ != cudaSuccess) { \ return TRITONSERVER_ErrorNew( \ C, ((MSG) + ": " + cudaGetErrorString(rice_err__)).c_str()); \ } \ } while (false) #endif // TRITON_ENABLE_GPU #define RESPOND_AND_SET_NULL_IF_ERROR(RESPONSE_PTR, X) \ do { \ TRITONSERVER_Error* rarie_err__ = (X); \ if (rarie_err__ != nullptr) { \ if (*RESPONSE_PTR != nullptr) { \ LOG_IF_ERROR( \ TRITONBACKEND_ResponseSend( \ *RESPONSE_PTR, TRITONSERVER_RESPONSE_COMPLETE_FINAL, \ rarie_err__), \ "failed to send error response"); \ *RESPONSE_PTR = nullptr; \ } \ TRITONSERVER_ErrorDelete(rarie_err__); \ } \ } while (false) #define RESPOND_ALL_AND_SET_NULL_IF_ERROR(RESPONSES, RESPONSES_COUNT, X) \ do { \ TRITONSERVER_Error* raasnie_err__ = (X); \ if (raasnie_err__ != nullptr) { \ for (size_t ridx = 0; ridx < RESPONSES_COUNT; ++ridx) { \ if (RESPONSES[ridx] != nullptr) { \ LOG_IF_ERROR( \ TRITONBACKEND_ResponseSend( \ RESPONSES[ridx], TRITONSERVER_RESPONSE_COMPLETE_FINAL, \ raasnie_err__), \ "failed to send error response"); \ RESPONSES[ridx] = nullptr; \ } \ } \ TRITONSERVER_ErrorDelete(raasnie_err__); \ } \ } while (false) #define RESPOND_ALL_AND_SET_TRUE_IF_ERROR(RESPONSES, RESPONSES_COUNT, BOOL, X) \ do { \ TRITONSERVER_Error* raasnie_err__ = (X); \ if (raasnie_err__ != nullptr) { \ BOOL = true; \ for (size_t ridx = 0; ridx < RESPONSES_COUNT; ++ridx) { \ if (RESPONSES[ridx] != nullptr) { \ LOG_IF_ERROR( \ TRITONBACKEND_ResponseSend( \ RESPONSES[ridx], TRITONSERVER_RESPONSE_COMPLETE_FINAL, \ raasnie_err__), \ "failed to send error response"); \ RESPONSES[ridx] = nullptr; \ } \ } \ TRITONSERVER_ErrorDelete(raasnie_err__); \ } \ } while (false) #ifdef TRITON_ENABLE_STATS #define TIMESPEC_TO_NANOS(TS) ((TS).tv_sec * 1000000000 + (TS).tv_nsec) #define SET_TIMESTAMP(TS_NS) \ { \ TS_NS = std::chrono::duration_cast( \ std::chrono::steady_clock::now().time_since_epoch()) \ .count(); \ } #define DECL_TIMESTAMP(TS_NS) \ uint64_t TS_NS; \ SET_TIMESTAMP(TS_NS); #else #define 
DECL_TIMESTAMP(TS_NS) #define SET_TIMESTAMP(TS_NS) #endif // TRITON_ENABLE_STATS #ifndef TRITON_ENABLE_GPU using cudaStream_t = void*; #endif // !TRITON_ENABLE_GPU /// Convenience deleter for TRITONBACKEND_ResponseFactory. struct ResponseFactoryDeleter { void operator()(TRITONBACKEND_ResponseFactory* f) { LOG_IF_ERROR( TRITONBACKEND_ResponseFactoryDelete(f), "failed deleting response factory"); } }; // A representation of the BatchInput message in model config class BatchInput { public: enum class Kind { BATCH_ELEMENT_COUNT, BATCH_ACCUMULATED_ELEMENT_COUNT, BATCH_ACCUMULATED_ELEMENT_COUNT_WITH_ZERO, BATCH_MAX_ELEMENT_COUNT_AS_SHAPE, BATCH_ITEM_SHAPE, BATCH_ITEM_SHAPE_FLATTEN }; static TRITONSERVER_Error* ParseFromModelConfig( triton::common::TritonJson::Value& config, std::vector* batch_inputs); const std::vector& TargetNames() const { return target_names_; } TRITONSERVER_DataType DataType() const { return data_type_; } Kind BatchInputKind() const { return kind_; } std::string BatchInputKindString() const { return kind_str_; } const std::vector& SourceInputs() const { return source_inputs_; } private: TRITONSERVER_Error* Init(triton::common::TritonJson::Value& bi_config); Kind kind_; std::string kind_str_; std::vector target_names_; TRITONSERVER_DataType data_type_; std::vector source_inputs_; }; // A representation of the BatchOutput message in model config class BatchOutput { public: enum class Kind { BATCH_SCATTER_WITH_INPUT_SHAPE }; static TRITONSERVER_Error* ParseFromModelConfig( triton::common::TritonJson::Value& config, std::vector* batch_outputs); const std::vector& TargetNames() const { return target_names_; } TRITONSERVER_DataType DataType() const { return data_type_; } const std::vector& OutputShape() const { return shape_; } Kind BatchOutputKind() const { return kind_; } const std::vector& SourceInputs() const { return source_inputs_; } private: Kind kind_; std::vector target_names_; TRITONSERVER_DataType data_type_; std::vector shape_; std::vector 
source_inputs_; }; struct CopyParams { CopyParams(void* dst, const void* src, const size_t byte_size) : dst_(dst), src_(src), byte_size_(byte_size) { } void* dst_; const void* src_; const size_t byte_size_; }; /// The value for a dimension in a shape that indicates that that /// dimension can take on any size. constexpr int WILDCARD_DIM = -1; constexpr char kTensorRTExecutionAccelerator[] = "tensorrt"; constexpr char kOpenVINOExecutionAccelerator[] = "openvino"; constexpr char kCUDAExecutionAccelerator[] = "cuda"; constexpr char kGPUIOExecutionAccelerator[] = "gpu_io"; constexpr char kAutoMixedPrecisionExecutionAccelerator[] = "auto_mixed_precision"; TRITONSERVER_MemoryType GetUsePinnedMemoryType( TRITONSERVER_MemoryType ref_buffer_type); TRITONSERVER_Error* CommonErrorToTritonError(triton::common::Error error); TRITONSERVER_Error_Code StatusCodeToTritonCode( triton::common::Error::Code error_code); /// Parse an array in a JSON object into the corresponding shape. The /// array must be composed of integers. /// /// \param io The JSON object containing the member array. /// \param name The name of the array member in the JSON object. /// \param shape Returns the shape. /// \return a TRITONSERVER_Error indicating success or failure. TRITONSERVER_Error* ParseShape( common::TritonJson::Value& io, const std::string& name, std::vector* shape); /// Return the string representation of a shape. /// /// \param dims The shape dimensions. /// \param dims_count The number of dimensions. /// \return The string representation. std::string ShapeToString(const int64_t* dims, const size_t dims_count); /// Return the string representation of a shape. /// /// \param shape The shape as a vector of dimensions. /// \return The string representation. std::string ShapeToString(const std::vector& shape); /// Deprecated. Use TRITONSERVER_Error* GetElementCount instead. /// Return the number of elements of a shape. /// /// \param dims The shape dimensions. 
/// \param dims_count The number of dimensions. /// \return The number of elements, /// -1 if unable to determine the number, /// -2 if the shape contains an invalid dim, /// or -3 if the number is too large to represent as an int64_t. int64_t GetElementCount(const int64_t* dims, const size_t dims_count); /// Deprecated. Use TRITONSERVER_Error* GetElementCount instead. /// Return the number of elements of a shape. /// /// \param shape The shape as a vector of dimensions. /// \return The number of elements, /// -1 if unable to determine the number, /// -2 if the shape contains an invalid dim, /// or -3 if the number is too large to represent as an int64_t. int64_t GetElementCount(const std::vector& shape); /// Return the number of elements of a shape with error checking. /// /// \param dims The shape dimensions. /// \param dims_count The number of dimensions. /// \param cnt Returns the number of elements. /// \return a TRITONSERVER_Error indicating success or failure. TRITONSERVER_Error* GetElementCount( const int64_t* dims, const size_t dims_count, int64_t* cnt); /// Return the number of elements of a shape with error checking. /// /// \param shape The shape as a vector of dimensions. /// \param cnt Returns the number of elements. /// \return a TRITONSERVER_Error indicating success or failure. TRITONSERVER_Error* GetElementCount( const std::vector& shape, int64_t* cnt); /// Deprecated. Use TRITONSERVER_Error* GetByteSize instead. /// Get the size, in bytes, of a tensor based on datatype and /// shape. /// \param dtype The data-type. /// \param dims The shape. /// \return The size, in bytes, of the corresponding tensor, /// -1 if unable to determine the size, /// -2 if the shape contains an invalid dim, /// or -3 if the size is too large to represent as an int64_t. int64_t GetByteSize( const TRITONSERVER_DataType& dtype, const std::vector& dims); /// Get the size, in bytes, of a tensor based on datatype and /// shape with error checking. 
/// \param dtype The data-type. /// \param dims The shape. /// \param size Returns the size, in bytes, of the corresponding tensor. /// \return a TRITONSERVER_Error indicating success or failure. TRITONSERVER_Error* GetByteSize( const TRITONSERVER_DataType& dtype, const std::vector& dims, int64_t* size); /// Get an input tensor's contents into a buffer. This overload expects /// both 'buffer' and buffers of the input to be in CPU. /// /// \param request The inference request. /// \param input_name The name of the input buffer. /// \param buffer The buffer where the input tensor content is copied into. /// \param buffer_byte_size Acts as both input and output. On input /// gives the size of 'buffer', in bytes. The function will fail if /// the buffer is not large enough to hold the input tensor /// contents. Returns the size of the input tensor data returned in /// 'buffer'. /// \param host_policy_name The host policy name to look up the input buffer. /// Default input buffer will be used if nullptr is provided. /// \return a TRITONSERVER_Error indicating success or failure. TRITONSERVER_Error* ReadInputTensor( TRITONBACKEND_Request* request, const std::string& input_name, char* buffer, size_t* buffer_byte_size, const char* host_policy_name = nullptr); /// Get an input tensor's contents into a buffer. This overload of /// 'ReadInputTensor' supports input buffers that can be in any memory. /// /// \param request The inference request. /// \param input_name The name of the input buffer. /// \param buffer The buffer where the input tensor content is copied into. /// \param buffer_byte_size Acts as both input and output. On input /// gives the size of 'buffer', in bytes. The function will fail if /// the buffer is not large enough to hold the input tensor /// contents. Returns the size of the input tensor data returned in /// 'buffer'. /// \param host_policy_name The host policy name to look up the input buffer. 
/// Default input buffer will be used if nullptr is provided. /// \param memory_type The memory type of the buffer provided. /// \param memory_type_id The memory type id of the buffer provided. /// \param cuda_stream specifies the stream to be associated with, and 0 can be /// passed for default stream. /// \param cuda_used returns whether a CUDA memory copy is initiated. If true, /// the caller should synchronize on the given 'cuda_stream' to ensure data copy /// is completed. /// \param copy_on_stream whether the memory copies should be performed in cuda /// host functions on the 'cuda_stream'. /// \return a TRITONSERVER_Error indicating success or failure. TRITONSERVER_Error* ReadInputTensor( TRITONBACKEND_Request* request, const std::string& input_name, char* buffer, size_t* buffer_byte_size, TRITONSERVER_MemoryType memory_type, int64_t memory_type_id, cudaStream_t cuda_stream, bool* cuda_used, const char* host_policy_name = nullptr, const bool copy_on_stream = false); /// Validate that an input matches one of the allowed input names. /// \param io The model input. /// \param allowed The set of allowed input names. /// \return The error status. A non-OK status indicates the input /// is not valid. TRITONSERVER_Error* CheckAllowedModelInput( common::TritonJson::Value& io, const std::set& allowed); /// Validate that an output matches one of the allowed output names. /// \param io The model output. /// \param allowed The set of allowed output names. /// \return The error status. A non-OK status indicates the output /// is not valid. TRITONSERVER_Error* CheckAllowedModelOutput( common::TritonJson::Value& io, const std::set& allowed); /// Get the tensor name, false value, and true value for a boolean /// sequence batcher control kind. If 'required' is true then must /// find a tensor for the control. If 'required' is false, return /// 'tensor_name' as empty-string if the control is not mapped to any /// tensor. 
/// /// \param batcher The JSON object of the sequence batcher. /// \param model_name The name of the model. /// \param control_kind The kind of control tensor to look for. /// \param required Whether the tensor must be specified. /// \param tensor_name Returns the name of the tensor. /// \param tensor_datatype Returns the data type of the tensor. /// \param fp32_false_value Returns the float value for false if /// the tensor type is FP32. /// \param fp32_true_value Returns the float value for true if /// the tensor type is FP32. /// \param int32_false_value Returns the int value for false if /// the tensor type is INT32. /// \param int32_true_value Returns the int value for true if /// the tensor type is INT32. /// \param bool_false_value Returns the bool value for false if /// the tensor type is BOOL. /// \param bool_true_value Returns the bool value for true if /// the tensor type is BOOL. /// \return a TRITONSERVER_Error indicating success or failure. TRITONSERVER_Error* GetBooleanSequenceControlProperties( common::TritonJson::Value& batcher, const std::string& model_name, const std::string& control_kind, const bool required, std::string* tensor_name, std::string* tensor_datatype, float* fp32_false_value, float* fp32_true_value, int32_t* int32_false_value, int32_t* int32_true_value, bool* bool_false_value, bool* bool_true_value); /// Get the tensor name and datatype for a non-boolean sequence /// batcher control kind. If 'required' is true then must find a /// tensor for the control. If 'required' is false, return /// 'tensor_name' as empty-string if the control is not mapped to any /// tensor. 'tensor_datatype' returns the required datatype for the /// control. /// /// \param batcher The JSON object of the sequence batcher. /// \param model_name The name of the model. /// \param control_kind The kind of control tensor to look for. /// \param required Whether the tensor must be specified. /// \param tensor_name Returns the name of the tensor. 
/// \param tensor_datatype Returns the data type of the tensor. /// \return a TRITONSERVER_Error indicating success or failure. TRITONSERVER_Error* GetTypedSequenceControlProperties( common::TritonJson::Value& batcher, const std::string& model_name, const std::string& control_kind, const bool required, std::string* tensor_name, std::string* tensor_datatype); /// Create and send an error response for a set of requests. This /// function takes ownership of 'response_err' and so the caller must /// not access or delete it after this call returns. /// /// \param requests The requests. /// \param request_count The number of 'requests'. /// \param response_err The error to send to each request. /// \param release_request If true, the requests will be released after /// sending the error responses and the request pointers are set to /// nullptr. void RequestsRespondWithError( TRITONBACKEND_Request** requests, const uint32_t request_count, TRITONSERVER_Error* response_err, const bool release_request = true); /// Send an error response for a set of responses. This function takes /// ownership of 'response_err' and so the caller must not access or /// delete it after this call returns. /// /// \param responses The responses. /// \param response_count The number of 'responses'. /// \param response_err The error to send. void SendErrorForResponses( std::vector* responses, const uint32_t response_count, TRITONSERVER_Error* response_err); /// Copy buffer from 'src' to 'dst' for given 'byte_size'. The buffer location /// is identified by the memory type and id, and the corresponding copy will be /// initiated. /// \param msg The message to be prepended in error message. /// \param src_memory_type The memory type of the source buffer. /// \param src_memory_type_id The memory type id of the source buffer. /// \param dst_memory_type The memory type of the destination buffer. /// \param dst_memory_type_id The memory type id of the destination buffer. 
/// \param byte_size The byte size of the source buffer. /// \param src The pointer to the source buffer. /// \param dst The pointer to the destination buffer. /// \param cuda_stream specifies the stream to be associated with, and 0 can be /// passed for default stream. /// \param cuda_used returns whether a CUDA memory copy is initiated. If true, /// the caller should synchronize on the given 'cuda_stream' to ensure data copy /// is completed. /// \param copy_on_stream whether the memory copies should be performed in cuda /// host functions on the 'cuda_stream'. /// \return a TRITONSERVER_Error indicating success or failure. TRITONSERVER_Error* CopyBuffer( const std::string& msg, const TRITONSERVER_MemoryType src_memory_type, const int64_t src_memory_type_id, const TRITONSERVER_MemoryType dst_memory_type, const int64_t dst_memory_type_id, const size_t byte_size, const void* src, void* dst, cudaStream_t cuda_stream, bool* cuda_used, const bool copy_on_stream = false); /// Does a file or directory exist? /// \param path The path to check for existence. /// \param exists Returns true if file/dir exists /// \return a TRITONSERVER_Error indicating success or failure. TRITONSERVER_Error* FileExists(const std::string& path, bool* exists); /// Read a text file into a string. /// \param path The path of the file. /// \param contents Returns the contents of the file. /// \return a TRITONSERVER_Error indicating success or failure. TRITONSERVER_Error* ReadTextFile( const std::string& path, std::string* contents); /// Is a path a directory? /// \param path The path to check. /// \param is_dir Returns true if path represents a directory /// \return a TRITONSERVER_Error indicating success or failure. TRITONSERVER_Error* IsDirectory(const std::string& path, bool* is_dir); /// Join path segments into a longer path /// \param segments The path segments. /// \return the path formed by joining the segments. 
std::string JoinPath(std::initializer_list segments); /// Returns the content in the model version path and the path to the content as /// key-value pair. /// \param model_repository_path The path to the model repository. /// \param version The version of the model. /// \param ignore_directories Whether the directories will be ignored. /// \param ignore_files Whether the files will be ignored. /// \param model_paths Returns the content in the model version path and /// the path to the content. /// \return a TRITONSERVER_Error indicating success or failure. TRITONSERVER_Error* ModelPaths( const std::string& model_repository_path, uint64_t version, const bool ignore_directories, const bool ignore_files, std::unordered_map* model_paths); /// Create a CUDA stream appropriate for GPU<->CPU data transfer /// operations for a given GPU device. The caller takes ownership of /// the stream. 'stream' returns nullptr if GPU support is disabled. /// /// \param device_id The ID of the GPU. /// \param priority The stream priority. Use 0 for normal priority. /// \param stream Returns the created stream. /// \return a TRITONSERVER_Error indicating success or failure. TRITONSERVER_Error* CreateCudaStream( const int device_id, const int cuda_stream_priority, cudaStream_t* stream); /// Parse the string as long long integer. /// /// \param value The string. /// \param parse_value The long long integral value of the string. /// \return a TRITONSERVER_Error indicating success or failure. TRITONSERVER_Error* ParseLongLongValue( const std::string& value, int64_t* parsed_value); /// Parse the string as unsigned long long integer. /// /// \param value The string. /// \param parse_value The unsigned long long integral value of the string. /// \return a TRITONSERVER_Error indicating success or failure. TRITONSERVER_Error* ParseUnsignedLongLongValue( const std::string& value, uint64_t* parsed_value); /// Parse the string as boolean. /// /// \param value The string. 
/// \param parse_value The boolean value of the string. /// \return a TRITONSERVER_Error indicating success or failure. TRITONSERVER_Error* ParseBoolValue( const std::string& value, bool* parsed_value); /// Parse the string as integer. /// /// \param value The string. /// \param parse_value The integral value of the string. /// \return a TRITONSERVER_Error indicating success or failure. TRITONSERVER_Error* ParseIntValue(const std::string& value, int* parsed_value); /// Parse the string as double. /// /// \param value The string. /// \param parse_value The double value of the string. /// \return a TRITONSERVER_Error indicating success or failure. TRITONSERVER_Error* ParseDoubleValue( const std::string& value, double* parsed_value); /// Return the value of the specified key in a JSON object. /// /// \param params The JSON object containing the key-value mapping. /// \param key The key to look up the value in the JSON object. /// \param value Returns the value. /// \return a TRITONSERVER_Error indicating success or failure. TRITONSERVER_Error* GetParameterValue( triton::common::TritonJson::Value& params, const std::string& key, std::string* value); /// Return the Triton server data type of the data type string specified /// in model config JSON. /// /// \param data_type_str The string representation of the data type. /// \return the Triton server data type. TRITONSERVER_DataType ModelConfigDataTypeToTritonServerDataType( const std::string& data_type_str); /// Try to parse the requested parameter. /// /// \param params The param in model config /// \param mkey Key in the model config. /// \param value The parsed string value. /// \param default_value Default value to use when key is not found. /// \return a TRITONSERVER_Error indicating success or failure. TRITONSERVER_Error* TryParseModelStringParameter( triton::common::TritonJson::Value& params, const std::string& mkey, std::string* value, const std::string& default_value); /// Try to parse the requested parameter. 
/// /// \param params The param in model config /// \param mkey Key in the model config. /// \param value The parsed int value. /// \param default_value Default value to use when key is not found. /// \return a TRITONSERVER_Error indicating success or failure. TRITONSERVER_Error* TryParseModelStringParameter( triton::common::TritonJson::Value& params, const std::string& mkey, int* value, const int& default_value); /// Try to parse the requested parameter. /// /// \param params The param in model config /// \param mkey Key in the model config. /// \param value The parsed bool value. /// \param default_value Default value to use when key is not found. /// \return a TRITONSERVER_Error indicating success or failure. TRITONSERVER_Error* TryParseModelStringParameter( triton::common::TritonJson::Value& params, const std::string& mkey, bool* value, const bool& default_value); /// Try to parse the requested parameter. /// /// \param params The param in model config /// \param mkey Key in the model config. /// \param value The parsed uint64 value. /// \param default_value Default value to use when key is not found. /// \return a TRITONSERVER_Error indicating success or failure. TRITONSERVER_Error* TryParseModelStringParameter( triton::common::TritonJson::Value& params, const std::string& mkey, uint64_t* value, const uint64_t& default_value); /// Get a string representation of a tensor buffer. /// /// \param str Returns the string. /// \param buffer The base pointer to the tensor buffer. /// \param buffer_byte_size The size of the buffer in bytes. /// \param datatype The type of the tensor /// \return a TRITONSERVER_Error indicating success or failure. TRITONSERVER_Error* BufferAsTypedString( std::string& str, const char* buffer, size_t buffer_byte_size, TRITONSERVER_DataType datatype); /// Get the ID of the request as a string formatted for logging. /// /// \param request Request of which to get the ID. /// \return a formatted string for logging the request ID. 
std::string GetRequestId(TRITONBACKEND_Request* request); /// Validate the contiguous string buffer with correct format /// ... and parse string /// elements into list of pairs of memory address and length. /// Note the returned list of pairs points to valid memory as long /// as memory pointed by buffer remains allocated. /// /// \param buffer The pointer to the contiguous string buffer. /// \param buffer_byte_size The size of the buffer in bytes. /// \param expected_element_cnt The number of expected string elements. /// \param input_name The name of the input buffer. /// \param str_list Returns pairs of address and length of parsed strings. /// \return a TRITONSERVER_Error indicating success or failure. TRITONSERVER_Error* ValidateStringBuffer( const char* buffer, size_t buffer_byte_size, const size_t expected_element_cnt, const char* input_name, std::vector>* str_list); /// Converts incoming utf-8 path to an OS valid path /// /// On Linux there is not much to do. /// On Windows we need to take care of the long paths and handle them correctly /// to avoid legacy issues with MAX_PATH /// /// More details: /// https://learn.microsoft.com/en-us/windows/win32/fileio/maximum-file-path-limitation?tabs=registry /// /// \param path The path to validate /// \param ret_path The updated valid path as per the OS requirements /// \return a TRITONSERVER_Error indicating success or failure. TRITONSERVER_Error* GetOSValidPath( const std::string& path, std::string& ret_path); }} // namespace triton::backend ================================================ FILE: include/triton/backend/backend_input_collector.h ================================================ // Copyright 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#pragma once #include #include #include #include #include "triton/backend/backend_common.h" #include "triton/backend/backend_memory.h" #include "triton/common/async_work_queue.h" #include "triton/common/sync_queue.h" #include "triton/core/tritonbackend.h" #ifdef TRITON_ENABLE_GPU #include #endif // TRITON_ENABLE_GPU namespace triton { namespace backend { #ifndef TRITON_ENABLE_GPU using cudaStream_t = void*; using cudaEvent_t = void*; #endif // !TRITON_ENABLE_GPU // // BackendInputCollector // class BackendInputCollector { public: // The caller can optionally provide 'event' for internal synchronization // instead of using 'stream'. If 'host_policy_name' is provided, it must be // valid for the lifetime of the collector explicit BackendInputCollector( TRITONBACKEND_Request** requests, const uint32_t request_count, std::vector* responses, TRITONBACKEND_MemoryManager* memory_manager, const bool pinned_enabled, cudaStream_t stream, cudaEvent_t event = nullptr, cudaEvent_t buffer_ready_event = nullptr, const size_t kernel_buffer_threshold = 0, const char* host_policy_name = nullptr, const bool copy_on_stream = false, const bool coalesce_request_input = false) : need_sync_(false), requests_(requests), request_count_(request_count), responses_(responses), memory_manager_(memory_manager), pinned_enabled_(pinned_enabled), use_async_cpu_copy_(triton::common::AsyncWorkQueue::WorkerCount() > 1), stream_(stream), #ifdef TRITON_ENABLE_GPU event_(event), buffer_ready_event_(buffer_ready_event), #endif // TRITON_ENABLE_GPU kernel_buffer_threshold_(kernel_buffer_threshold), pending_pinned_byte_size_(0), pending_pinned_offset_(0), pending_copy_kernel_buffer_byte_size_(0), pending_copy_kernel_buffer_offset_(0), pending_copy_kernel_input_buffer_counts_(0), async_task_count_(0), host_policy_cstr_(host_policy_name), copy_on_stream_(copy_on_stream), coalesce_request_input_(coalesce_request_input) { } ~BackendInputCollector() = default; // Process all requests for a named input tensor and 
return the // concatenated values of those requests in a single contiguous // buffer. This overload of the function can avoid data copy if the // tensor values are already contiguous and the caller doesn't // provide a destination 'buffer'. // // 'buffer' is used to determine whether the input should be placed at the // 'buffer' provided by the caller. If 'buffer' == nullptr, the returned // buffer will be managed by the BackendInputCollector object and // has the same lifecycle as the BackendInputCollector object. // 'buffer_byte_size' is the byte size of 'buffer' if it is not nullptr. // 'allowed_input_types' is the ordered list of the memory type and id pairs // that the returned buffer can be. It must only contain the memory type // and id of 'buffer' if 'buffer' is not nullptr. // 'dst_buffer' returns the contiguous buffer of the input tensor. // 'dst_buffer_byte_size' the byte size of 'dst_buffer'. // 'dst_memory_type' returns the memory type of 'dst_buffer'. // 'dst_memory_type_id' returns the memory type id of 'dst_buffer'. TRITONSERVER_Error* ProcessTensor( const char* input_name, char* buffer, const size_t buffer_byte_size, const std::vector>& allowed_input_types, const char** dst_buffer, size_t* dst_buffer_byte_size, TRITONSERVER_MemoryType* dst_memory_type, int64_t* dst_memory_type_id); // Process all requests for a named input tensor and return the // concatenated values of those requests in a single contiguous // 'buffer'. // // 'buffer' The buffer to hold the concatenates tensor value. Must // be large enough to hold all tensor value. // 'buffer_byte_size' is the byte size of 'buffer'. // 'dst_memory_type' The memory type of 'buffer'. // 'dst_memory_type_id' The memory type id of 'buffer'. void ProcessTensor( const char* input_name, char* buffer, const size_t buffer_byte_size, const TRITONSERVER_MemoryType memory_type, const int64_t memory_type_id); // Process the batch input and return its shape. 
Returning error indicates // that the batch input can't be formed properly and the caller should abort // the whole batch. TRITONSERVER_Error* BatchInputShape( const BatchInput& batch_input, std::vector* shape); // Process the batch input and derive its value into 'buffer'. Returning // error indicates that the batch input can't be formed properly and // the caller should abort the whole batch. // 'buffer' is used to determine whether the input should be placed at the // 'buffer' provided by the caller. If 'buffer' == nullptr, the returned // buffer will be managed by the BackendInputCollector object and // has the same lifecycle as the BackendInputCollector object. // 'buffer_byte_size' is the byte size of 'buffer' if it is not nullptr. // 'allowed_input_types' is the ordered list of the memory type and id pairs // that the returned buffer can be. It must only contain the memory type // and id of 'buffer' if it is not nullptr. // 'dst_buffer' returns the contiguous buffer of the input tensor. // 'dst_memory_type' returns the memory type of 'dst_buffer'. // 'dst_memory_type_id' returns the memory type id of 'dst_buffer'. TRITONSERVER_Error* ProcessBatchInput( const BatchInput& batch_input, char* buffer, const size_t buffer_byte_size, const std::vector>& allowed_input_types, const char** dst_buffer, size_t* dst_buffer_byte_size, TRITONSERVER_MemoryType* dst_memory_type, int64_t* dst_memory_type_id); // Finalize processing of all requests for all input tensors. Return // true if cudaMemcpyAsync is called, and the caller should call // cudaStreamSynchronize (or cudaEventSynchronize on 'event') before // using the data. 
bool Finalize(); private: struct ContiguousBuffer { ContiguousBuffer() : start_request_idx_(0), end_request_idx_(0) {} MemoryDesc memory_desc_; size_t start_request_idx_; size_t end_request_idx_; }; class InputIterator { public: InputIterator( TRITONBACKEND_Request** requests, const uint32_t request_count, std::vector* responses, const char* input_name, const char* host_policy_name, const bool coalesce_request_input); // Return false if iterator reaches the end of inputs, 'input' is not set. bool GetNextContiguousInput(ContiguousBuffer* input); private: TRITONBACKEND_Request** requests_; const uint32_t request_count_; std::vector* responses_; const char* input_name_; const char* host_policy_; const bool coalesce_request_input_; TRITONBACKEND_Input* curr_input_; size_t curr_request_idx_; size_t curr_buffer_idx_; uint32_t curr_buffer_cnt_; bool reach_end_; }; // Return whether the entire input is in a contiguous buffer. If returns true, // the properties of the contiguous input buffer will also be returned. // Otherwise, only 'buffer_byte_size' will be set and return the total byte // size of the input. 
bool GetInputBufferIfContiguous( const char* input_name, const char** buffer, size_t* buffer_byte_size, TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id); bool FlushPendingPinned( char* tensor_buffer, const size_t tensor_buffer_byte_size, const TRITONSERVER_MemoryType tensor_memory_type, const int64_t tensor_memory_type_id); bool FlushPendingCopyKernel( char* tensor_buffer, const size_t tensor_buffer_byte_size, const TRITONSERVER_MemoryType tensor_memory_type, const int64_t tensor_memory_type_id); TRITONSERVER_Error* LaunchCopyKernel( char* tensor_buffer, const size_t tensor_buffer_byte_size, const TRITONSERVER_MemoryType tensor_memory_type, const int64_t tensor_memory_type_id); bool SetInputTensor( const char* input_name, const ContiguousBuffer& input, char* tensor_buffer, const size_t tensor_buffer_byte_size, const TRITONSERVER_MemoryType tensor_memory_type, const int64_t tensor_memory_type_id, const size_t tensor_buffer_offset, const TRITONSERVER_MemoryType use_pinned_memory_type, const bool use_kernel, const bool wait_buffer); template TRITONSERVER_Error* SetElementCount( const std::string& source_input, char* buffer, const size_t buffer_byte_size); template TRITONSERVER_Error* SetAccumulatedElementCount( const std::string& source_input, char* buffer, const size_t buffer_byte_size); template TRITONSERVER_Error* SetBatchItemShape( const std::string& source_input, char* buffer, const size_t buffer_byte_size); bool need_sync_; TRITONBACKEND_Request** requests_; const uint32_t request_count_; std::vector* responses_; TRITONBACKEND_MemoryManager* memory_manager_; const bool pinned_enabled_; const bool use_async_cpu_copy_; cudaStream_t stream_; #ifdef TRITON_ENABLE_GPU cudaEvent_t event_; cudaEvent_t buffer_ready_event_; #endif // TRITON_ENABLE_GPU const size_t kernel_buffer_threshold_; size_t pending_pinned_byte_size_; size_t pending_pinned_offset_; std::list pending_pinned_input_buffers_; // managed memories that need to live over the lifetime of this 
// BackendInputCollector object. std::list> in_use_memories_; size_t pending_copy_kernel_buffer_byte_size_; size_t pending_copy_kernel_buffer_offset_; size_t pending_copy_kernel_input_buffer_counts_; std::list pending_copy_kernel_input_buffers_; std::vector>> input_ptr_buffer_host_; std::vector>> byte_size_buffer_host_; std::vector>> byte_size_offset_buffer_host_; // Pinned memory buffers and the corresponding request_inputs where // the final copy to the tensor is deferred until Finalize() after // waiting for all in-flight copies. struct DeferredPinned { DeferredPinned( char* pinned_memory, const size_t pinned_memory_size, char* tensor_buffer, const size_t tensor_buffer_offset, const TRITONSERVER_MemoryType tensor_memory_type, const int64_t tensor_memory_id, std::list&& request_buffers, std::vector* responses) : finalized_(false), pinned_memory_(pinned_memory), pinned_memory_size_(pinned_memory_size), tensor_buffer_(tensor_buffer), tensor_buffer_offset_(tensor_buffer_offset), tensor_memory_type_(tensor_memory_type), tensor_memory_id_(tensor_memory_id), requests_(std::move(request_buffers)), responses_(responses) { } bool Finalize(cudaStream_t stream); bool finalized_; // Holding reference to the pinned memory buffer, which is managed // by BackendInputCollector as 'pinned_memory' char* pinned_memory_; const size_t pinned_memory_size_; char* tensor_buffer_; const size_t tensor_buffer_offset_; const TRITONSERVER_MemoryType tensor_memory_type_; const int64_t tensor_memory_id_; std::list requests_; std::vector* responses_; }; std::list deferred_pinned_; // FIXME use future to maintain an issue-order queue to drop task count triton::common::SyncQueue completion_queue_; size_t async_task_count_; const char* host_policy_cstr_; const bool copy_on_stream_; const bool coalesce_request_input_; }; }} // namespace triton::backend ================================================ FILE: include/triton/backend/backend_memory.h ================================================ // 
Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#pragma once #include #include #include "triton/core/tritonbackend.h" #include "triton/core/tritonserver.h" namespace triton { namespace backend { // Collection of common properties that describes a buffer in Triton struct MemoryDesc { MemoryDesc() : buffer_(nullptr), byte_size_(0), memory_type_(TRITONSERVER_MEMORY_CPU), memory_type_id_(0) { } MemoryDesc( const char* buffer, size_t byte_size, TRITONSERVER_MemoryType memory_type, int64_t memory_type_id) : buffer_(buffer), byte_size_(byte_size), memory_type_(memory_type), memory_type_id_(memory_type_id) { } const char* buffer_; size_t byte_size_; TRITONSERVER_MemoryType memory_type_; int64_t memory_type_id_; }; // // BackendMemory // // Utility class for allocating and deallocating memory using both // TRITONBACKEND_MemoryManager and direct GPU and CPU malloc/free. // class BackendMemory { public: enum class AllocationType { CPU, CPU_PINNED, GPU, CPU_PINNED_POOL, GPU_POOL }; // Allocate a contiguous block of 'alloc_type' memory. 'mem' // returns the pointer to the allocated memory. // // CPU, CPU_PINNED_POOL and GPU_POOL are allocated using // TRITONBACKEND_MemoryManagerAllocate. Note that CPU_PINNED and GPU // allocations can be much slower than the POOL variants. // // Two error codes have specific interpretations for this function: // // TRITONSERVER_ERROR_UNSUPPORTED: Indicates that function is // incapable of allocating the requested memory type and memory // type ID. Requests for the memory type and ID will always fail // no matter 'byte_size' of the request. // // TRITONSERVER_ERROR_UNAVAILABLE: Indicates that function can // allocate the memory type and ID but that currently it cannot // allocate a contiguous block of memory of the requested // 'byte_size'. 
static TRITONSERVER_Error* Create( TRITONBACKEND_MemoryManager* manager, const AllocationType alloc_type, const int64_t memory_type_id, const size_t byte_size, BackendMemory** mem); // Allocate a contiguous block of memory by attempting the // allocation using 'alloc_types' in order until one is successful. // See BackendMemory::Create() above for details. static TRITONSERVER_Error* Create( TRITONBACKEND_MemoryManager* manager, const std::vector& alloc_types, const int64_t memory_type_id, const size_t byte_size, BackendMemory** mem); // Creates a BackendMemory object from a pre-allocated buffer. The buffer // is not owned by the object created with this function. Hence, for // proper operation, the lifetime of the buffer should at least extend till // the corresponding BackendMemory. static TRITONSERVER_Error* Create( TRITONBACKEND_MemoryManager* manager, const AllocationType alloc_type, const int64_t memory_type_id, void* buffer, const size_t byte_size, BackendMemory** mem); ~BackendMemory(); AllocationType AllocType() const { return alloctype_; } int64_t MemoryTypeId() const { return memtype_id_; } char* MemoryPtr() { return buffer_; } size_t ByteSize() const { return byte_size_; } TRITONSERVER_MemoryType MemoryType() const { return AllocTypeToMemoryType(alloctype_); } static TRITONSERVER_MemoryType AllocTypeToMemoryType(const AllocationType a); static const char* AllocTypeString(const AllocationType a); private: BackendMemory( TRITONBACKEND_MemoryManager* manager, const AllocationType alloctype, const int64_t memtype_id, char* buffer, const size_t byte_size, const bool owns_buffer = true) : manager_(manager), alloctype_(alloctype), memtype_id_(memtype_id), buffer_(buffer), byte_size_(byte_size), owns_buffer_(owns_buffer) { } TRITONBACKEND_MemoryManager* manager_; AllocationType alloctype_; int64_t memtype_id_; char* buffer_; size_t byte_size_; bool owns_buffer_; }; }} // namespace triton::backend ================================================ FILE: 
include/triton/backend/backend_model.h ================================================ // Copyright 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma once #include #include #include #include "triton/backend/backend_common.h" #include "triton/core/tritonbackend.h" #include "triton/core/tritonserver.h" namespace triton { namespace backend { // // BackendModel // // Common functionality for a backend model. This class is provided as // a convenience; backends are not required to use this class. 
// class BackendModel { public: BackendModel( TRITONBACKEND_Model* triton_model, const bool allow_optional = false); virtual ~BackendModel() = default; // Get the handle to the TRITONBACKEND server hosting this model. TRITONSERVER_Server* TritonServer() { return triton_server_; } // Get the handle to the memory manager for this model. TRITONBACKEND_MemoryManager* TritonMemoryManager() { return triton_memory_manager_; } // Get the handle to the TRITONBACKEND model. TRITONBACKEND_Model* TritonModel() { return triton_model_; } // Get the name and version of the model. const std::string& Name() const { return name_; } uint64_t Version() const { return version_; } const std::string& RepositoryPath() const { return repository_path_; } // The model configuration. common::TritonJson::Value& ModelConfig() { return model_config_; } // Sets the updated model configuration to the core. TRITONSERVER_Error* SetModelConfig(); // Parses information out of the model configuration. TRITONSERVER_Error* ParseModelConfig(); // Maximum batch size supported by the model. A value of 0 // indicates that the model does not support batching. int MaxBatchSize() const { return max_batch_size_; } // Set the max batch size for the model. When a backend // auto-completes a configuration it may set or change the maximum // batch size. void SetMaxBatchSize(const int b) { max_batch_size_ = b; } // Does this model support batching in the first dimension? TRITONSERVER_Error* SupportsFirstDimBatching(bool* supports); // Use indirect pinned memory buffer when copying an input or output // tensor to/from the model. 
bool EnablePinnedInput() const { return enable_pinned_input_; } bool EnablePinnedOutput() const { return enable_pinned_output_; } const std::vector& BatchInputs() const { return batch_inputs_; } const std::vector& BatchOutputs() const { return batch_outputs_; } const BatchOutput* FindBatchOutput(const std::string& output_name) const; bool IsInputRagged(const std::string& input_name) const { return (ragged_inputs_.find(input_name) != ragged_inputs_.end()); } bool IsInputOptional(const std::string& input_name) const { return (optional_inputs_.find(input_name) != optional_inputs_.end()); } protected: TRITONSERVER_Server* triton_server_; TRITONBACKEND_MemoryManager* triton_memory_manager_; TRITONBACKEND_Model* triton_model_; std::string name_; uint64_t version_; std::string repository_path_; bool allow_optional_; common::TritonJson::Value model_config_; int max_batch_size_; bool enable_pinned_input_; bool enable_pinned_output_; std::vector batch_inputs_; std::vector batch_outputs_; std::map batch_output_map_; std::set ragged_inputs_; std::set optional_inputs_; }; // // BackendModelException // // Exception thrown if error occurs while constructing an // BackendModel. // struct BackendModelException { BackendModelException(TRITONSERVER_Error* err) : err_(err) {} TRITONSERVER_Error* err_; }; #define THROW_IF_BACKEND_MODEL_ERROR(X) \ do { \ TRITONSERVER_Error* tie_err__ = (X); \ if (tie_err__ != nullptr) { \ throw triton::backend::BackendModelException(tie_err__); \ } \ } while (false) }} // namespace triton::backend ================================================ FILE: include/triton/backend/backend_model_instance.h ================================================ // Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma once #include #include "triton/core/tritonbackend.h" #ifdef TRITON_ENABLE_GPU #include #endif // TRITON_ENABLE_GPU namespace triton { namespace backend { #ifndef TRITON_ENABLE_GPU using cudaStream_t = void*; #endif // !TRITON_ENABLE_GPU class BackendModel; // // BackendModelInstance // // Common functionality for a backend model instance. This class is // provided as a convenience; backends are not required to use this // class. 
// class BackendModelInstance { public: BackendModelInstance( BackendModel* backend_model, TRITONBACKEND_ModelInstance* triton_model_instance); virtual ~BackendModelInstance(); // Get the name, kind and device ID of the instance. const std::string& Name() const { return name_; } TRITONSERVER_InstanceGroupKind Kind() const { return kind_; } int32_t DeviceId() const { return device_id_; } // Get the handle to the TRITONBACKEND model instance. TRITONBACKEND_ModelInstance* TritonModelInstance() { return triton_model_instance_; } // Get the BackendModel representing the model that corresponds to // this instance. BackendModel* Model() const { return backend_model_; } // The model configuration 'default_model_filename' value, or the // value in model configuration 'cc_model_filenames' for the GPU // targeted by this instance. If neither are specified in the model // configuration, the return empty string. const std::string& ArtifactFilename() const { return artifact_filename_; } // Returns the stream associated with this instance that can be used // for GPU<->CPU memory transfers. Returns nullptr if GPU support is // disabled or if this instance is not executing on a GPU. cudaStream_t CudaStream() { return stream_; } const std::string& HostPolicyName() const { return host_policy_name_; } protected: BackendModel* backend_model_; TRITONBACKEND_ModelInstance* triton_model_instance_; std::string name_; TRITONSERVER_InstanceGroupKind kind_; int32_t device_id_; std::string artifact_filename_; cudaStream_t stream_; std::string host_policy_name_; }; // // BackendModelInstanceException // // Exception thrown if error occurs while constructing an // BackendModelInstance. 
// struct BackendModelInstanceException { BackendModelInstanceException(TRITONSERVER_Error* err) : err_(err) {} TRITONSERVER_Error* err_; }; #define THROW_IF_BACKEND_INSTANCE_ERROR(X) \ do { \ TRITONSERVER_Error* tie_err__ = (X); \ if (tie_err__ != nullptr) { \ throw triton::backend::BackendModelInstanceException(tie_err__); \ } \ } while (false) }} // namespace triton::backend ================================================ FILE: include/triton/backend/backend_output_responder.h ================================================ // Copyright 2019-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma once #include #include #include #include "triton/backend/backend_common.h" #include "triton/common/async_work_queue.h" #include "triton/core/tritonbackend.h" #ifdef TRITON_ENABLE_GPU #include #endif // TRITON_ENABLE_GPU namespace triton { namespace backend { #ifndef TRITON_ENABLE_GPU using cudaStream_t = void*; using cudaEvent_t = void*; #endif // !TRITON_ENABLE_GPU // // BackendOutputResponder // class BackendOutputResponder { public: // The caller can optionally provide 'event' for internal synchronization // instead of using 'stream'. explicit BackendOutputResponder( TRITONBACKEND_Request** requests, const uint32_t request_count, std::vector* responses, TRITONBACKEND_MemoryManager* memory_manager, const bool first_dim_batching, const bool pinned_enabled, cudaStream_t stream, cudaEvent_t event = nullptr, bool copy_on_stream = false) : need_sync_(false), requests_(requests), request_count_(request_count), responses_(responses), memory_manager_(memory_manager), first_dim_batching_(first_dim_batching), pinned_enabled_(pinned_enabled), use_async_cpu_copy_(triton::common::AsyncWorkQueue::WorkerCount() > 1), stream_(stream), event_(event), pending_pinned_byte_size_(0), copy_on_stream_(copy_on_stream) { } // Legacy constructor for backwards compatibility. The above // constructor should be used for all new cases. The responder needs // to know if the model is batching along the first dimension. 
With // this constructor we derive that information from the // max_batch_size value instead of having it provided directly as in // the above constructor. explicit BackendOutputResponder( TRITONBACKEND_Request** requests, const uint32_t request_count, std::vector* responses, const int max_batch_size, TRITONBACKEND_MemoryManager* memory_manager, const bool pinned_enabled, cudaStream_t stream, cudaEvent_t event = nullptr, bool copy_on_stream = false) : need_sync_(false), requests_(requests), request_count_(request_count), responses_(responses), memory_manager_(memory_manager), first_dim_batching_(max_batch_size >= 1), pinned_enabled_(pinned_enabled), use_async_cpu_copy_(triton::common::AsyncWorkQueue::WorkerCount() > 1), stream_(stream), event_(event), pending_pinned_byte_size_(0), copy_on_stream_(copy_on_stream) { } ~BackendOutputResponder(); // Process all responses for a named output tensor. // 'batchn_shape' may be modified by the call. void ProcessTensor( const std::string& name, const TRITONSERVER_DataType datatype, std::vector& batchn_shape, const char* buffer, const TRITONSERVER_MemoryType memory_type, const int64_t memory_type_id); // Process all responses for a named state tensor. Returns a vector of // TRITONBACKEND_State objects that the backend can use to update the state. // If TRITONBACKEND_StateUpdate is not called on the vector elements, the // state will not be updated. // 'batchn_shape' may be modified by the call. std::vector ProcessStateTensor( const std::string& name, const TRITONSERVER_DataType datatype, std::vector& batchn_shape, const char* buffer, const TRITONSERVER_MemoryType memory_type, const int64_t memory_type_id); // Process all responses for a batch output and derive its value from // 'buffer'. void ProcessBatchOutput( const std::string& name, const BatchOutput& batch_output, const char* buffer, const TRITONSERVER_MemoryType memory_type, const int64_t memory_type_id); // Finalize processing of all responses for all output // tensors. 
Return true if cudaMemcpyAsync is called, and the caller // should call cudaStreamSynchronize (or cudaEventSynchronize on 'event') // before using the data. bool Finalize(); private: bool FlushPendingPinned( const char* tensor_buffer, const TRITONSERVER_MemoryType tensor_memory_type, const int64_t tensor_memory_type_id); bool SetFixedSizeBuffer( TRITONBACKEND_Response** response, void* response_state_or_output, const std::string& output_name, const size_t tensor_byte_size, const size_t tensor_offset, const char* tensor_buffer, const TRITONSERVER_MemoryType tensor_memory_type, const int64_t tensor_memory_type_id, const TRITONSERVER_MemoryType use_pinned_memory_type, bool state); struct OutputData { OutputData( const std::string& name, void* buffer, const size_t buffer_byte_size, const TRITONSERVER_MemoryType memory_type, const int64_t memory_type_id) : name_(name), buffer_(buffer), buffer_byte_size_(buffer_byte_size), memory_type_(memory_type), memory_type_id_(memory_type_id) { } const std::string name_; void* buffer_; const size_t buffer_byte_size_; const TRITONSERVER_MemoryType memory_type_; const int64_t memory_type_id_; }; bool need_sync_; TRITONBACKEND_Request** requests_; const uint32_t request_count_; std::vector* responses_; TRITONBACKEND_MemoryManager* memory_manager_; const bool first_dim_batching_; const bool pinned_enabled_; const bool use_async_cpu_copy_; cudaStream_t stream_; cudaEvent_t event_; using ResponsesList = std::list>; size_t pending_pinned_byte_size_; size_t pending_pinned_offset_; ResponsesList pending_pinned_outputs_; const bool copy_on_stream_; // Pinned memories that need to live over the lifetime of this // BackendOutputResponder object. std::list pinned_memories_; // Pinned memory buffers and the corresponding response outputs // where the final copy to the response is deferred until Finalize() // after waiting for all in-flight copies. 
struct DeferredPinned { DeferredPinned( char* pinned_memory, const size_t pinned_memory_size, ResponsesList&& responses) : pinned_memory_(pinned_memory), pinned_memory_size_(pinned_memory_size), responses_(std::move(responses)) { } char* pinned_memory_; const size_t pinned_memory_size_; ResponsesList responses_; }; std::list deferred_pinned_; }; }} // namespace triton::backend ================================================ FILE: include/triton/backend/device_memory_tracker.h ================================================ // Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma once #include #include #include #include #include #include #include #include "triton/backend/backend_common.h" #include "triton/common/logging.h" #include "triton/core/tritonserver.h" #if defined(TRITON_ENABLE_GPU) && defined(TRITON_ENABLE_MEMORY_TRACKER) #include #endif static_assert( sizeof(uint64_t) >= sizeof(uintptr_t), "The implementation is storing address pointer as uint64_t, " "must ensure the space for pointer is <= sizeof(uint64_t)."); namespace triton { namespace backend { /// DeviceMemoryTracker is a backend utility provided to track the memory /// allocated for a particular model and associated model instances. /// This utility is often used for backend to report memory usage through /// TRITONBACKEND_ModelReportMemoryUsage and /// TRITONBACKEND_ModelInstanceReportMemoryUsage, which provides /// additional information to Triton for making decision on model scaling and /// deployment. /// /// Caveat: The memory tracker is implemented with CUPTI library which currently /// only supports single client/subscriber. This is an known limitation and as a /// result, the memory tracker can cause unexpected application failure if other /// component of the Triton process also uses CUPTI with a different /// configuration, for example, the framework used by the backend may have /// implemented similar profiler with CUPTI. 
Therefore, before enabling this /// memory tracker utilities, you should make sure that there is no other CUPTI /// client in the process. This tracker is implemented with the assumption that /// all other CUPTI clients are using the same implementation so that /// as long as all backends are compiled with this memory tracker, they may /// interact with an externally-initialized CUPTI to the backend without issues. /// /// Typical usage: /// /// On TRITONBACKEND_Initialize /// - Call DeviceMemoryTracker::Init /// /// If DeviceMemoryTracker::Init returns true, /// DeviceMemoryTracker::TrackThreadMemoryUsage and /// DeviceMemoryTracker::UntrackThreadMemoryUsage can be called accordingly to /// track memory allocation in the scope between the two calls. The memory usage /// will be recorded in MemoryUsage object and may be reported through /// TRITONBACKEND_ModelReportMemoryUsage or /// TRITONBACKEND_ModelInstanceReportMemoryUsage based on the entity of the /// memory usage. /// /// On reporting memory usage /// - Call MemoryUsage::SerializeToBufferAttributes to prepare the usage /// in the desired format. The BufferAttributes will be owned by MemoryUsage. extern "C" { typedef struct TRITONBACKEND_CuptiTracker_t { // C struct require extra implementation for dynamic array, for simplicity, // the following assumptions are made to pre-allocate the array with max // possible length: // - system / pinned memory allocation should only be on deviceId 0 // - CUDA allocation will only be on visible CUDA devices int64_t* system_memory_usage_byte_; int64_t* pinned_memory_usage_byte_; int64_t* cuda_memory_usage_byte_; uint32_t system_array_len_; uint32_t pinned_array_len_; uint32_t cuda_array_len_; // only set to false if somehow the CUPTI activity occurs on index out of // range. In that case, user should invalidate the whole tracker. 
bool valid_; } TRITONBACKEND_CuptiTracker; } class DeviceMemoryTracker { public: struct MemoryUsage { MemoryUsage() { cuda_memory_usage_byte_.resize(CudaDeviceCount(), 0); cupti_tracker_.system_memory_usage_byte_ = system_memory_usage_byte_.data(); cupti_tracker_.pinned_memory_usage_byte_ = pinned_memory_usage_byte_.data(); cupti_tracker_.cuda_memory_usage_byte_ = cuda_memory_usage_byte_.data(); cupti_tracker_.system_array_len_ = system_memory_usage_byte_.size(); cupti_tracker_.pinned_array_len_ = pinned_memory_usage_byte_.size(); cupti_tracker_.cuda_array_len_ = cuda_memory_usage_byte_.size(); cupti_tracker_.valid_ = true; } ~MemoryUsage() { // Make sure all C struct reference are dropped before clearing. if (tracked_) { UntrackThreadMemoryUsage(this); } for (auto& ba : buffer_attributes_) { if (ba) { LOG_IF_ERROR( TRITONSERVER_BufferAttributesDelete(ba), "Releasing buffer attributes in MemoryUsage object"); } } } // Disable copy and assign to better manage C struct lifecycle MemoryUsage(const MemoryUsage&) = delete; void operator=(const MemoryUsage&) = delete; // merge record from another MemoryUsage object MemoryUsage& operator+=(const MemoryUsage& rhs) { std::transform( rhs.system_memory_usage_byte_.begin(), rhs.system_memory_usage_byte_.end(), system_memory_usage_byte_.begin(), system_memory_usage_byte_.begin(), std::plus()); std::transform( rhs.pinned_memory_usage_byte_.begin(), rhs.pinned_memory_usage_byte_.end(), pinned_memory_usage_byte_.begin(), pinned_memory_usage_byte_.begin(), std::plus()); std::transform( rhs.cuda_memory_usage_byte_.begin(), rhs.cuda_memory_usage_byte_.end(), cuda_memory_usage_byte_.begin(), cuda_memory_usage_byte_.begin(), std::plus()); return *this; } // Serialize the MemoryUsage into an array of TRITONSERVER_BufferAttributes, // the buffer attributes object are owned by the MemoryUsage object. // Empty usage will be returned if the MemoryUsage object is invalid. 
TRITONSERVER_Error* SerializeToBufferAttributes( TRITONSERVER_BufferAttributes*** usage, uint32_t* usage_size) { if (!cupti_tracker_.valid_) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INTERNAL, "MemoryUsage record is invalid."); } uint32_t usage_idx = 0; // Define lambda to convert an vector of memory usage of the same type of // device into buffer attributes and set in 'usage' auto set_attributes_for_device_fn = [&](const std::vector& devices, const TRITONSERVER_MemoryType mem_type) -> TRITONSERVER_Error* { for (size_t idx = 0; idx < devices.size(); ++idx) { // skip if no allocation if (devices[idx] == 0) { continue; } // there is space in usage array if (usage_idx >= buffer_attributes_.size()) { buffer_attributes_.emplace_back(nullptr); RETURN_IF_ERROR( TRITONSERVER_BufferAttributesNew(&buffer_attributes_.back())); } auto entry = buffer_attributes_[usage_idx]; RETURN_IF_ERROR( TRITONSERVER_BufferAttributesSetMemoryType(entry, mem_type)); RETURN_IF_ERROR( TRITONSERVER_BufferAttributesSetMemoryTypeId(entry, idx)); RETURN_IF_ERROR( TRITONSERVER_BufferAttributesSetByteSize(entry, devices[idx])); ++usage_idx; } return nullptr; // success }; RETURN_IF_ERROR(set_attributes_for_device_fn( system_memory_usage_byte_, TRITONSERVER_MEMORY_CPU)); RETURN_IF_ERROR(set_attributes_for_device_fn( pinned_memory_usage_byte_, TRITONSERVER_MEMORY_CPU_PINNED)); RETURN_IF_ERROR(set_attributes_for_device_fn( cuda_memory_usage_byte_, TRITONSERVER_MEMORY_GPU)); *usage_size = usage_idx; *usage = buffer_attributes_.data(); return nullptr; } // Byte size of allocated memory tracked, // 'system_memory_usage_byte_' is likely to be empty as system memory // allocation is not controlled by CUDA driver. But keeping it for // completeness. 
// Per-device byte counters, indexed by device ordinal (index doubles as the
// memory type id when serialized).
// NOTE(extraction): template/map arguments were stripped by the text
// extraction (e.g. 'std::vector' is missing its element type); tokens kept
// as-is.
std::vector system_memory_usage_byte_{0};
std::vector pinned_memory_usage_byte_{0};
std::vector cuda_memory_usage_byte_{0};
// True while this record is registered with the tracker (set/cleared by
// Track/UntrackThreadMemoryUsage — presumably; TODO confirm from the .cc).
bool tracked_{false};
// Lazily-created attribute objects reused across serializations.
std::vector buffer_attributes_;
TRITONBACKEND_CuptiTracker cupti_tracker_;
};

// Simple scope guard to make sure memory usage is untracked without coupling
// with MemoryUsage lifecycle
struct ScopeGuard {
  ScopeGuard(MemoryUsage* usage) : usage_(usage) {}
  // Untrack on scope exit, but only if the record is currently tracked.
  ~ScopeGuard()
  {
    if (usage_ && usage_->tracked_) {
      UntrackThreadMemoryUsage(usage_);
    }
  }
  MemoryUsage* usage_{nullptr};
};

#if defined(TRITON_ENABLE_GPU) && defined(TRITON_ENABLE_MEMORY_TRACKER)
static bool Init();
static void Fini();
static int CudaDeviceCount();

// The memory usage will be tracked and modified until it's untracked, 'usage'
// must be valid and not to be modified externally until untrack is called.
// Currently can distinguish activity by correlation id which is thread
// specific, which implies that there will be missing records if tracking
// region switching threads to handle other activities.
// This function takes no effect if 'usage' is nullptr.
static void TrackThreadMemoryUsage(MemoryUsage* usage);

// Note that CUPTI always pop from the top of the thread-wise stack, must be
// careful on the untrack order if there is need to use multiple MemoryUsage
// objects.
// This function takes no effect if 'usage' is nullptr.
static void UntrackThreadMemoryUsage(MemoryUsage* usage);

// Read the 'triton-backend-memory-tracker' flag out of the backend config
// 'cmdline' section. Returns false when the flag is absent or cannot be
// parsed (parse errors are logged, not propagated).
static bool EnableFromBackendConfig(
    triton::common::TritonJson::Value& backend_config)
{
  triton::common::TritonJson::Value cmdline;
  if (backend_config.Find("cmdline", &cmdline)) {
    triton::common::TritonJson::Value value;
    std::string value_str;
    if (cmdline.Find("triton-backend-memory-tracker", &value)) {
      bool lvalue = false;
      auto err = value.AsString(&value_str);
      if (err != nullptr) {
        LOG_IF_ERROR(err, "Error parsing backend config: ");
        return false;
      }
      err = ParseBoolValue(value_str, &lvalue);
      if (err != nullptr) {
        LOG_IF_ERROR(err, "Error parsing backend config: ");
        return false;
      }
      return lvalue;
    }
  }
  return false;
}

~DeviceMemoryTracker();

// CUPTI activity callback entry point; forwards to the singleton instance.
static void TrackActivity(CUpti_Activity* record)
{
  if (tracker_) {
    tracker_->TrackActivityInternal(record);
  }
}

private:
DeviceMemoryTracker();

void TrackActivityInternal(CUpti_Activity* record);
bool UpdateMemoryTypeUsage(
    CUpti_ActivityMemory3* memory_record, const bool is_allocation,
    int64_t* memory_usage, uint32_t usage_len);

// Guards 'activity_to_memory_usage_' (accessed from CUPTI callbacks).
std::mutex mtx_;
// Maps CUPTI correlation ids to the MemoryUsage records being updated —
// presumably; TODO confirm key/value types (stripped by extraction).
std::unordered_map activity_to_memory_usage_;
CUpti_SubscriberHandle subscriber_{nullptr};
int device_cnt_{0};

// Process-wide singleton used by the static API above.
static std::unique_ptr tracker_;
#else
// no-ops when GPU / memory-tracker support is compiled out
static bool Init() { return false; }
static void Fini() {}
static int CudaDeviceCount() { return 0; }
static void TrackThreadMemoryUsage(MemoryUsage* usage) {}
static void UntrackThreadMemoryUsage(MemoryUsage* usage) {}
static bool EnableFromBackendConfig(
    const triton::common::TritonJson::Value& backend_config)
{
  return false;
}
#endif  // TRITON_ENABLE_GPU && TRITON_ENABLE_MEMORY_TRACKER
};

}}  // namespace triton::backend

================================================ FILE: pyproject.toml ================================================
# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
[tool.codespell] # note: pre-commit passes explicit lists of files here, which this skip file list doesn't override - # this is only to allow you to run codespell interactively skip = "./.git,./.github" # ignore short words, and typename parameters like OffsetT ignore-regex = "\\b(.{1,4}|[A-Z]\\w*T)\\b" # use the 'clear' dictionary for unambiguous spelling mistakes builtin = "clear" # disable warnings about binary files and wrong encoding quiet-level = 3 [tool.isort] profile = "black" use_parentheses = true multi_line_output = 3 include_trailing_comma = true force_grid_wrap = 0 ensure_newline_before_comments = true line_length = 88 balanced_wrapping = true indent = " " skip = ["build"] ================================================ FILE: src/backend_common.cc ================================================ // Copyright 2020-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. 
// IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "triton/backend/backend_common.h"

// NOTE(extraction): the targets of the angle-bracket '#include' directives
// below, and all template arguments in this file, were stripped by the text
// extraction. Code tokens are preserved as-is.
#ifdef _WIN32
// suppress the min and max definitions in Windef.h.
#define NOMINMAX
#include
// _CRT_INTERNAL_NONSTDC_NAMES 1 before including Microsoft provided C Runtime
// library to expose declarations without "_" prefix to match POSIX style.
#define _CRT_INTERNAL_NONSTDC_NAMES 1
#include
#include
#else
#include
#include
#endif

#include
#include
#include
#include
#include
#include

#ifdef _WIN32
// in Windows doesn't define S_ISDIR macro
#if !defined(S_ISDIR) && defined(S_IFMT) && defined(S_IFDIR)
#define S_ISDIR(m) (((m)&S_IFMT) == S_IFDIR)
#endif
#define F_OK 0
#endif

namespace triton { namespace backend {

#ifdef TRITON_ENABLE_GPU
// Host-function callback launched onto a CUDA stream by CopyBuffer(): performs
// a plain memcpy on the host and frees the heap-allocated parameter bundle.
// NOTE(extraction): the cast's target type (presumably CopyParams*) was
// stripped by the extraction.
void CUDART_CB
MemcpyHost(void* args)
{
  auto* copy_params = reinterpret_cast(args);
  memcpy(copy_params->dst_, copy_params->src_, copy_params->byte_size_);
  delete copy_params;
}
#endif  // TRITON_ENABLE_GPU

// Given the memory type of one endpoint of a copy, return the memory type of
// the "other side" for which an intermediate pinned buffer would help.
TRITONSERVER_MemoryType
GetUsePinnedMemoryType(TRITONSERVER_MemoryType ref_buffer_type)
{
  // The following matrix is used for both input and output.
  // src \ dest | non-pinned    | pinned     | device
  // non-pinned | memcpy        | memcpy     | buffer needed
  // pinned     | memcpy        | memcpy     | cudaMemcpy
  // device     | buffer needed | cudaMemcpy | cudaMemcpy
  if (ref_buffer_type == TRITONSERVER_MEMORY_CPU_PINNED) {
    return TRITONSERVER_MEMORY_CPU_PINNED;
  }
  return (ref_buffer_type == TRITONSERVER_MEMORY_CPU)
             ? TRITONSERVER_MEMORY_GPU
             : TRITONSERVER_MEMORY_CPU;
}

// Map a triton::common::Error code onto the corresponding
// TRITONSERVER_Error_Code; unknown codes map to TRITONSERVER_ERROR_UNKNOWN.
TRITONSERVER_Error_Code
StatusCodeToTritonCode(triton::common::Error::Code error_code)
{
  switch (error_code) {
    case triton::common::Error::Code::UNKNOWN:
      return TRITONSERVER_ERROR_UNKNOWN;
    case triton::common::Error::Code::INTERNAL:
      return TRITONSERVER_ERROR_INTERNAL;
    case triton::common::Error::Code::NOT_FOUND:
      return TRITONSERVER_ERROR_NOT_FOUND;
    case triton::common::Error::Code::INVALID_ARG:
      return TRITONSERVER_ERROR_INVALID_ARG;
    case triton::common::Error::Code::UNAVAILABLE:
      return TRITONSERVER_ERROR_UNAVAILABLE;
    case triton::common::Error::Code::UNSUPPORTED:
      return TRITONSERVER_ERROR_UNSUPPORTED;
    case triton::common::Error::Code::ALREADY_EXISTS:
      return TRITONSERVER_ERROR_ALREADY_EXISTS;
    default:
      break;
  }
  return TRITONSERVER_ERROR_UNKNOWN;
}

// Convert a triton::common::Error into a newly-allocated TRITONSERVER_Error.
TRITONSERVER_Error*
CommonErrorToTritonError(triton::common::Error error)
{
  return TRITONSERVER_ErrorNew(
      StatusCodeToTritonCode(error.ErrorCode()), error.Message().c_str());
}

// Append the integer dims of JSON array member 'name' in 'io' to 'shape'.
TRITONSERVER_Error*
ParseShape(
    common::TritonJson::Value& io, const std::string& name,
    std::vector* shape)
{
  common::TritonJson::Value shape_array;
  RETURN_IF_ERROR(io.MemberAsArray(name.c_str(), &shape_array));
  for (size_t i = 0; i < shape_array.ArraySize(); ++i) {
    int64_t d = 0;
    RETURN_IF_ERROR(shape_array.IndexAsInt(i, &d));
    shape->push_back(d);
  }
  return nullptr;  // success
}

// Render dims as "[d0,d1,...]".
std::string
ShapeToString(const int64_t* dims, const size_t dims_count)
{
  bool first = true;
  std::string str("[");
  for (size_t i = 0; i < dims_count; ++i) {
    const int64_t dim = dims[i];
    if (!first) {
      str += ",";
    }
    str += std::to_string(dim);
    first = false;
  }
  str += "]";
  return str;
}

std::string
ShapeToString(const std::vector& shape)
{
  return ShapeToString(shape.data(), shape.size());
}

// Return the element count of a shape, with sentinels:
//   -1 : shape contains a wildcard (WILDCARD_DIM)
//   -2 : shape contains an invalid (negative, non-wildcard) dim
//   -3 : multiplication would overflow int64
//    0 : shape contains a zero dim
int64_t
GetElementCount(const int64_t* dims, const size_t dims_count)
{
  bool first = true;
  int64_t cnt = 0;
  for (size_t i = 0; i < dims_count; i++) {
    if (dims[i] == WILDCARD_DIM) {
      return -1;
    } else if (dims[i] < 0) {
      // invalid dim
      return -2;
    } else if (dims[i] == 0) {
      return 0;
    }
    if (first) {
      cnt = dims[i];
      first = false;
    } else {
      // Check for overflow before multiplication
      if (cnt > INT64_MAX / dims[i]) {
        return -3;
      }
      cnt *= dims[i];
    }
  }
  return cnt;
}

int64_t
GetElementCount(const std::vector& shape)
{
  return GetElementCount(shape.data(), shape.size());
}

// Error-returning variant: translates sentinel values -2 / -3 into
// TRITONSERVER_Errors. NOTE(review): the wildcard sentinel -1 is passed
// through in '*cnt' as success — presumably intentional; confirm at callers.
TRITONSERVER_Error*
GetElementCount(const int64_t* dims, const size_t dims_count, int64_t* cnt)
{
  *cnt = GetElementCount(dims, dims_count);
  if (*cnt == -2) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INVALID_ARG,
        (std::string("shape") + ShapeToString(dims, dims_count) +
         " contains an invalid dim.")
            .c_str());
  } else if (*cnt == -3) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INVALID_ARG,
        "unexpected integer overflow while calculating element count.");
  }
  return nullptr;  // success
}

TRITONSERVER_Error*
GetElementCount(const std::vector& shape, int64_t* cnt)
{
  *cnt = GetElementCount(shape.data(), shape.size());
  if (*cnt == -2) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INVALID_ARG,
        (std::string("shape") + ShapeToString(shape) +
         " contains an invalid dim.")
            .c_str());
  } else if (*cnt == -3) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INVALID_ARG,
        "unexpected integer overflow while calculating element count.");
  }
  return nullptr;  // success
}

// Byte size of a tensor: element count * datatype size. Sentinels:
//   -1 : datatype has no fixed size (TRITONSERVER_DataTypeByteSize == 0)
//   -3 : multiplication would overflow int64
// and any non-positive element-count sentinel is passed through.
int64_t
GetByteSize(
    const TRITONSERVER_DataType& dtype, const std::vector& dims)
{
  size_t dt_size = TRITONSERVER_DataTypeByteSize(dtype);
  if (dt_size == 0) {
    return -1;
  }
  int64_t cnt = GetElementCount(dims);
  if (cnt <= 0) {
    return cnt;
  }
  if ((cnt > INT64_MAX / dt_size)) {
    return -3;
  }
  return cnt * dt_size;
}

// Error-returning variant of GetByteSize. NOTE(review): -1 (unsized dtype)
// is returned as success with '*size' == -1 — presumably handled by callers;
// confirm.
TRITONSERVER_Error*
GetByteSize(
    const TRITONSERVER_DataType& dtype, const std::vector& dims,
    int64_t* size)
{
  *size = GetByteSize(dtype, dims);
  if (*size == -2) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INVALID_ARG,
        (std::string("shape") + ShapeToString(dims) +
         " contains an invalid dim.")
            .c_str());
  } else if (*size == -3) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INVALID_ARG,
        "unexpected integer overflow while calculating byte size.");
  }
  return nullptr;  // success
}

// Gather all buffers of request input 'input_name' into the contiguous
// 'buffer' (capacity '*buffer_byte_size', which is updated to the actual
// input byte size on return). Fails if the buffer is too small.
// NOTE(review): '*cuda_used' is overwritten by each CopyBuffer call, so on
// return it reflects only the last buffer's copy — confirm callers expect
// this.
TRITONSERVER_Error*
ReadInputTensor(
    TRITONBACKEND_Request* request, const std::string& input_name,
    char* buffer, size_t* buffer_byte_size, TRITONSERVER_MemoryType memory_type,
    int64_t memory_type_id, cudaStream_t cuda_stream, bool* cuda_used,
    const char* host_policy_name, const bool copy_on_stream)
{
  TRITONBACKEND_Input* input;
  RETURN_IF_ERROR(
      TRITONBACKEND_RequestInput(request, input_name.c_str(), &input));
  uint64_t input_byte_size;
  uint32_t input_buffer_count;
  RETURN_IF_ERROR(TRITONBACKEND_InputPropertiesForHostPolicy(
      input, host_policy_name, nullptr, nullptr, nullptr, nullptr,
      &input_byte_size, &input_buffer_count));
  RETURN_ERROR_IF_FALSE(
      input_byte_size <= *buffer_byte_size, TRITONSERVER_ERROR_INVALID_ARG,
      std::string(
          GetRequestId(request) + "buffer too small for input tensor '" +
          input_name + "', " + std::to_string(*buffer_byte_size) + " < " +
          std::to_string(input_byte_size)));
  size_t output_buffer_offset = 0;
  for (uint32_t b = 0; b < input_buffer_count; ++b) {
    const void* input_buffer = nullptr;
    uint64_t input_buffer_byte_size = 0;
    TRITONSERVER_MemoryType input_memory_type = TRITONSERVER_MEMORY_CPU;
    int64_t input_memory_type_id = 0;
    RETURN_IF_ERROR(TRITONBACKEND_InputBufferForHostPolicy(
        input, host_policy_name, b, &input_buffer, &input_buffer_byte_size,
        &input_memory_type, &input_memory_type_id));
    RETURN_IF_ERROR(CopyBuffer(
        "Failed to copy buffer", input_memory_type, input_memory_type_id,
        memory_type, memory_type_id, input_buffer_byte_size, input_buffer,
        buffer + output_buffer_offset, cuda_stream, cuda_used,
        copy_on_stream));
    output_buffer_offset += input_buffer_byte_size;
  }
  *buffer_byte_size = input_byte_size;
  return nullptr;  // success
}

// Convenience overload: read into CPU memory with default stream semantics.
TRITONSERVER_Error*
ReadInputTensor(
    TRITONBACKEND_Request* request, const std::string& input_name,
    char* buffer, size_t* buffer_byte_size, const char* host_policy_name)
{
  bool cuda_used;
  return ReadInputTensor(
      request, input_name, buffer, buffer_byte_size,
      TRITONSERVER_MEMORY_CPU /* memory_type */, 0 /* memory_type_id */,
      0 /* cuda_stream */, &cuda_used);
}

// Verify the model-config input 'io' names one of the 'allowed' inputs;
// otherwise return an INVALID_ARG error listing the allowed names.
TRITONSERVER_Error*
CheckAllowedModelInput(
    common::TritonJson::Value& io, const std::set& allowed)
{
  std::string io_name;
  RETURN_IF_ERROR(io.MemberAsString("name", &io_name));
  if (allowed.find(io_name) == allowed.end()) {
    std::string astr;
    for (const auto& a : allowed) {
      if (!astr.empty()) {
        astr.append(", ");
      }
      astr.append(a);
    }
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INVALID_ARG,
        std::string(
            "unexpected inference input '" + io_name +
            "', allowed inputs are: " + astr)
            .c_str());
  }
  return nullptr;  // success
}

// Same as CheckAllowedModelInput but for outputs.
TRITONSERVER_Error*
CheckAllowedModelOutput(
    common::TritonJson::Value& io, const std::set& allowed)
{
  std::string io_name;
  RETURN_IF_ERROR(io.MemberAsString("name", &io_name));
  if (allowed.find(io_name) == allowed.end()) {
    std::string astr;
    for (const auto& a : allowed) {
      if (!astr.empty()) {
        astr.append(", ");
      }
      astr.append(a);
    }
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INVALID_ARG,
        std::string(
            "unexpected inference output '" + io_name +
            "', allowed outputs are: " + astr)
            .c_str());
  }
  return nullptr;  // success
}

// Locate the boolean sequence-batching control of kind 'control_kind' in the
// 'sequence_batching' config 'batcher' and return its tensor name, datatype
// string, and false/true values via the out parameters (each may be nullptr
// when the caller does not need it). Exactly one of 'int32_false_true',
// 'fp32_false_true' or 'bool_false_true' must be given, with exactly 2
// entries. When the control is absent: error if 'required', otherwise
// 'tensor_name' is cleared.
TRITONSERVER_Error*
GetBooleanSequenceControlProperties(
    common::TritonJson::Value& batcher, const std::string& model_name,
    const std::string& control_kind, const bool required,
    std::string* tensor_name, std::string* tensor_datatype,
    float* fp32_false_value, float* fp32_true_value,
    int32_t* int32_false_value, int32_t* int32_true_value,
    bool* bool_false_value, bool* bool_true_value)
{
  // Make sure same tensor is not configured for multiple controls
  std::set seen_tensors;
  // Make sure the control kind is not mentioned multiple times.
  bool seen_control = false;
  common::TritonJson::Value control_inputs;
  if (batcher.Find("control_input", &control_inputs)) {
    for (size_t ci_idx = 0; ci_idx < control_inputs.ArraySize(); ci_idx++) {
      common::TritonJson::Value control_input;
      RETURN_IF_ERROR(control_inputs.IndexAsObject(ci_idx, &control_input));
      std::string input_name;
      RETURN_IF_ERROR(control_input.MemberAsString("name", &input_name));
      if (input_name.empty()) {
        return TRITONSERVER_ErrorNew(
            TRITONSERVER_ERROR_INVALID_ARG,
            (std::string(
                 "sequence batching control tensor must have a name for ") +
             model_name)
                .c_str());
      }
      if (seen_tensors.find(input_name) != seen_tensors.end()) {
        return TRITONSERVER_ErrorNew(
            TRITONSERVER_ERROR_INVALID_ARG,
            (std::string("sequence batching control tensor '") + input_name +
             "' is specified for multiple control kinds for " + model_name)
                .c_str());
      }
      seen_tensors.insert(input_name);
      common::TritonJson::Value controls;
      if (control_input.Find("control", &controls)) {
        for (size_t c_idx = 0; c_idx < controls.ArraySize(); c_idx++) {
          common::TritonJson::Value c;
          RETURN_IF_ERROR(controls.IndexAsObject(c_idx, &c));
          std::string kind_str;
          RETURN_IF_ERROR(c.MemberAsString("kind", &kind_str));
          if (kind_str == control_kind) {
            if (seen_control) {
              return TRITONSERVER_ErrorNew(
                  TRITONSERVER_ERROR_INVALID_ARG,
                  (std::string(
                       "sequence batching specifies multiple " + control_kind +
                       " tensors for " + model_name)
                       .c_str()));
            }
            *tensor_name = input_name;
            seen_control = true;
            common::TritonJson::Value int32_false_true, fp32_false_true,
                bool_false_true;
            bool found_int32 =
                (c.Find("int32_false_true", &int32_false_true) &&
                 (int32_false_true.ArraySize() > 0));
            bool found_fp32 =
                (c.Find("fp32_false_true", &fp32_false_true) &&
                 (fp32_false_true.ArraySize() > 0));
            bool found_bool =
                (c.Find("bool_false_true", &bool_false_true) &&
                 (bool_false_true.ArraySize() > 0));
            // Make sure only one of int, float, or bool type is specified.
            if (!(found_int32 || found_fp32 || found_bool)) {
              return TRITONSERVER_ErrorNew(
                  TRITONSERVER_ERROR_INVALID_ARG,
                  (std::string(
                       "sequence batching must specify either "
                       "'int32_false_true', 'fp32_false_true' or "
                       "'bool_false_true' for " +
                       control_kind + " for " + model_name))
                      .c_str());
            } else if (
                (found_fp32 && found_int32) || (found_fp32 && found_bool) ||
                (found_int32 && found_bool)) {
              return TRITONSERVER_ErrorNew(
                  TRITONSERVER_ERROR_INVALID_ARG,
                  (std::string(
                       "sequence batching specifies more than one from "
                       "'int32_false_true', 'fp32_false_true' and "
                       "'bool_false_true' for " +
                       control_kind + " for " + model_name))
                      .c_str());
            }
            if (found_int32) {
              if (int32_false_true.ArraySize() != 2) {
                return TRITONSERVER_ErrorNew(
                    TRITONSERVER_ERROR_INVALID_ARG,
                    (std::string(
                         "sequence batching control 'int32_false_true' must "
                         "have "
                         "exactly 2 entries for " +
                         control_kind + " for " + model_name))
                        .c_str());
              }
              if (tensor_datatype != nullptr) {
                *tensor_datatype = "TYPE_INT32";
              }
              if (int32_false_value != nullptr) {
                int64_t value;
                RETURN_IF_ERROR(int32_false_true.IndexAsInt(0, &value));
                *int32_false_value = value;
              }
              if (int32_true_value != nullptr) {
                int64_t value;
                RETURN_IF_ERROR(int32_false_true.IndexAsInt(1, &value));
                *int32_true_value = value;
              }
            } else if (found_fp32) {
              if (fp32_false_true.ArraySize() != 2) {
                return TRITONSERVER_ErrorNew(
                    TRITONSERVER_ERROR_INVALID_ARG,
                    (std::string(
                         "sequence batching control 'fp32_false_true' must "
                         "have exactly "
                         "2 entries for " +
                         control_kind + " for " + model_name))
                        .c_str());
              }
              if (tensor_datatype != nullptr) {
                *tensor_datatype = "TYPE_FP32";
              }
              if (fp32_false_value != nullptr) {
                double value = 0.0;
                RETURN_IF_ERROR(fp32_false_true.IndexAsDouble(0, &value));
                *fp32_false_value = value;
              }
              if (fp32_true_value != nullptr) {
                double value = 0.0;
                RETURN_IF_ERROR(fp32_false_true.IndexAsDouble(1, &value));
                *fp32_true_value = value;
              }
            } else {
              if (bool_false_true.ArraySize() != 2) {
                return TRITONSERVER_ErrorNew(
                    TRITONSERVER_ERROR_INVALID_ARG,
                    (std::string(
                         "sequence batching control 'bool_false_true' must "
                         "have exactly "
                         "2 entries for " +
                         control_kind + " for " + model_name))
                        .c_str());
              }
              if (tensor_datatype != nullptr) {
                *tensor_datatype = "TYPE_BOOL";
              }
              if (bool_false_value != nullptr) {
                bool value;
                RETURN_IF_ERROR(bool_false_true.IndexAsBool(0, &value));
                *bool_false_value = value;
              }
              if (bool_true_value != nullptr) {
                bool value;
                RETURN_IF_ERROR(bool_false_true.IndexAsBool(1, &value));
                *bool_true_value = value;
              }
            }
          }
        }
      }
    }
  }
  if (!seen_control) {
    if (required) {
      return TRITONSERVER_ErrorNew(
          TRITONSERVER_ERROR_INVALID_ARG,
          (std::string(
               "sequence batching control tensor must specify a " +
               control_kind + " value for " + model_name))
              .c_str());
    }
    tensor_name->clear();
  }
  return nullptr;  // success
}

// Locate the typed (non-boolean, e.g. CORRID) sequence-batching control of
// kind 'control_kind'. Typed controls must NOT carry false/true value lists.
// When the control is absent: error if 'required', otherwise 'tensor_name'
// is cleared.
TRITONSERVER_Error*
GetTypedSequenceControlProperties(
    common::TritonJson::Value& batcher, const std::string& model_name,
    const std::string& control_kind, const bool required,
    std::string* tensor_name, std::string* tensor_datatype)
{
  // Make sure same tensor is not configured for multiple controls
  std::set seen_tensors;
  // Make sure the control kind is not mentioned multiple times.
  bool seen_control = false;
  common::TritonJson::Value control_inputs;
  if (batcher.Find("control_input", &control_inputs)) {
    for (size_t ci_idx = 0; ci_idx < control_inputs.ArraySize(); ci_idx++) {
      common::TritonJson::Value control_input;
      RETURN_IF_ERROR(control_inputs.IndexAsObject(ci_idx, &control_input));
      std::string input_name;
      RETURN_IF_ERROR(control_input.MemberAsString("name", &input_name));
      if (input_name.empty()) {
        return TRITONSERVER_ErrorNew(
            TRITONSERVER_ERROR_INVALID_ARG,
            (std::string(
                 "sequence batching control tensor must have a name for ") +
             model_name)
                .c_str());
      }
      if (seen_tensors.find(input_name) != seen_tensors.end()) {
        return TRITONSERVER_ErrorNew(
            TRITONSERVER_ERROR_INVALID_ARG,
            (std::string("sequence batching control tensor '") + input_name +
             "' is specified for multiple control kinds for " + model_name)
                .c_str());
      }
      seen_tensors.insert(input_name);
      common::TritonJson::Value controls;
      if (control_input.Find("control", &controls)) {
        for (size_t c_idx = 0; c_idx < controls.ArraySize(); c_idx++) {
          common::TritonJson::Value c;
          RETURN_IF_ERROR(controls.IndexAsObject(c_idx, &c));
          std::string kind_str;
          RETURN_IF_ERROR(c.MemberAsString("kind", &kind_str));
          if (kind_str == control_kind) {
            if (seen_control) {
              return TRITONSERVER_ErrorNew(
                  TRITONSERVER_ERROR_INVALID_ARG,
                  (std::string(
                       "sequence batching specifies multiple " + control_kind +
                       " tensors for " + model_name)
                       .c_str()));
            }
            *tensor_name = input_name;
            if (tensor_datatype != nullptr) {
              RETURN_IF_ERROR(c.MemberAsString("data_type", tensor_datatype));
            }
            seen_control = true;
            common::TritonJson::Value int32_false_true, fp32_false_true,
                bool_false_true;
            bool found_int32 =
                (c.Find("int32_false_true", &int32_false_true) &&
                 (int32_false_true.ArraySize() > 0));
            bool found_fp32 =
                (c.Find("fp32_false_true", &fp32_false_true) &&
                 (fp32_false_true.ArraySize() > 0));
            bool found_bool =
                (c.Find("bool_false_true", &bool_false_true) &&
                 (bool_false_true.ArraySize() > 0));
            // Typed controls carry their value in the tensor itself, so any
            // false/true list is a configuration error.
            if (found_fp32 || found_int32 || found_bool) {
              return TRITONSERVER_ErrorNew(
                  TRITONSERVER_ERROR_INVALID_ARG,
                  (std::string(
                       "sequence batching must not specify either "
                       "'int32_false_true', 'fp32_false_true' or "
                       "'bool_false_true' for " +
                       control_kind + " for " + model_name))
                      .c_str());
            }
          }
        }
      }
    }
  }
  if (!seen_control) {
    if (required) {
      return TRITONSERVER_ErrorNew(
          TRITONSERVER_ERROR_INVALID_ARG,
          (std::string(
               "sequence batching control tensor must specify a " +
               control_kind + " value for " + model_name))
              .c_str());
    }
    tensor_name->clear();
  }
  return nullptr;  // success
}

// Send 'response_err' as the (final) error response for every request, and
// optionally release the requests. Takes ownership of 'response_err'.
void
RequestsRespondWithError(
    TRITONBACKEND_Request** requests, const uint32_t request_count,
    TRITONSERVER_Error* response_err, const bool release_request)
{
  for (size_t i = 0; i < request_count; i++) {
    TRITONBACKEND_Response* response;
    auto err = TRITONBACKEND_ResponseNew(&response, requests[i]);
    if (err != nullptr) {
      LOG_MESSAGE(
          TRITONSERVER_LOG_ERROR,
          (GetRequestId(requests[i]) + "fail to create response").c_str());
      TRITONSERVER_ErrorDelete(err);
    } else {
      LOG_IF_ERROR(
          TRITONBACKEND_ResponseSend(
              response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, response_err),
          (GetRequestId(requests[i]) + "fail to send error response")
              .c_str());
    }
    if (release_request) {
      LOG_IF_ERROR(
          TRITONBACKEND_RequestRelease(
              requests[i], TRITONSERVER_REQUEST_RELEASE_ALL),
          "fail to release request");
      requests[i] = nullptr;
    }
  }
  TRITONSERVER_ErrorDelete(response_err);
}

// Send 'response_err' on every non-null response in 'responses', nulling the
// slots that were sent. Takes ownership of 'response_err'.
void
SendErrorForResponses(
    std::vector* responses, const uint32_t response_count,
    TRITONSERVER_Error* response_err)
{
  for (size_t i = 0; i < response_count; i++) {
    TRITONBACKEND_Response* response = (*responses)[i];
    if (response != nullptr) {
      LOG_IF_ERROR(
          TRITONBACKEND_ResponseSend(
              response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, response_err),
          "fail to send error response");
      (*responses)[i] = nullptr;
    }
  }
  TRITONSERVER_ErrorDelete(response_err);
}

// Copy 'byte_size' bytes between arbitrary (CPU/pinned/GPU) memory locations.
// '*cuda_used' is set true when the copy was issued on 'cuda_stream' and the
// caller must synchronize before using 'dst'. Host-to-host copies use plain
// memcpy unless 'copy_on_stream' requests stream ordering.
TRITONSERVER_Error*
CopyBuffer(
    const std::string& msg, const TRITONSERVER_MemoryType src_memory_type,
    const int64_t src_memory_type_id,
    const TRITONSERVER_MemoryType dst_memory_type,
    const int64_t dst_memory_type_id, const size_t byte_size, const void* src,
    void* dst, cudaStream_t cuda_stream, bool* cuda_used,
    const bool copy_on_stream)
{
  *cuda_used = false;
  if (byte_size > 0) {
    if (src == nullptr) {
      return TRITONSERVER_ErrorNew(
          TRITONSERVER_ERROR_INTERNAL,
          std::string(
              msg + ": attempted a copy of " + std::to_string(byte_size) +
              " Bytes from an uninitialized memory")
              .c_str());
    }
    if (dst == nullptr) {
      return TRITONSERVER_ErrorNew(
          TRITONSERVER_ERROR_INTERNAL,
          std::string(
              msg + ": attempted a copy of " + std::to_string(byte_size) +
              " Bytes to an uninitialized memory")
              .c_str());
    }
  }
  // For CUDA memcpy, if copy_on_stream is false, all host to host copy will
  // be blocked in respect to the host, so use memcpy() directly. In this
  // case, need to be careful on whether the src buffer is valid.
  if ((src_memory_type != TRITONSERVER_MEMORY_GPU) &&
      (dst_memory_type != TRITONSERVER_MEMORY_GPU)) {
#ifdef TRITON_ENABLE_GPU
    if (copy_on_stream) {
      // Enqueue a host-side memcpy on the stream; MemcpyHost frees 'params'.
      // NOTE(extraction): the reinterpret_cast target type was stripped.
      auto params = new CopyParams(dst, src, byte_size);
      cudaLaunchHostFunc(
          cuda_stream, MemcpyHost, reinterpret_cast(params));
      *cuda_used = true;
    } else {
      memcpy(dst, src, byte_size);
    }
#else
    memcpy(dst, src, byte_size);
#endif  // TRITON_ENABLE_GPU
  } else {
#ifdef TRITON_ENABLE_GPU
    // [TODO] use cudaMemcpyDefault if UVM is supported for the device
    auto copy_kind = cudaMemcpyDeviceToDevice;
    if (src_memory_type != TRITONSERVER_MEMORY_GPU) {
      copy_kind = cudaMemcpyHostToDevice;
    } else if (dst_memory_type != TRITONSERVER_MEMORY_GPU) {
      copy_kind = cudaMemcpyDeviceToHost;
    }
    // Device-to-device copies across different devices require the peer API.
    if ((src_memory_type_id != dst_memory_type_id) &&
        (copy_kind == cudaMemcpyDeviceToDevice)) {
      RETURN_IF_CUDA_ERROR(
          cudaMemcpyPeerAsync(
              dst, dst_memory_type_id, src, src_memory_type_id, byte_size,
              cuda_stream),
          TRITONSERVER_ERROR_INTERNAL, msg + ": failed to perform CUDA copy");
    } else {
      RETURN_IF_CUDA_ERROR(
          cudaMemcpyAsync(dst, src, byte_size, copy_kind, cuda_stream),
          TRITONSERVER_ERROR_INTERNAL, msg + ": failed to perform CUDA copy");
    }
    *cuda_used = true;
#else
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INTERNAL,
        std::string(msg + ": try to use CUDA copy while GPU is not supported")
            .c_str());
#endif  // TRITON_ENABLE_GPU
  }
  return nullptr;  // success
}

// List the entries of directory 'path' into 'contents', excluding "." and
// "..".
// NOTE(review): in the _WIN32 branch, 'entry.cFileName' is a character array,
// so 'entry.cFileName != "."' compares POINTERS (always true) instead of
// contents — "." and ".." will be inserted on Windows; should use strcmp.
// Also FindFirstFile (TCHAR) is mixed with FindNextFileA (ANSI) — confirm
// the build is ANSI. TODO: fix in a dedicated change.
TRITONSERVER_Error*
GetDirectoryContents(const std::string& path, std::set* contents)
{
#ifdef _WIN32
  WIN32_FIND_DATA entry;
  HANDLE dir = FindFirstFile(path.c_str(), &entry);
  if (dir == INVALID_HANDLE_VALUE) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INTERNAL,
        (std::string("failed to open directory: ") + path).c_str());
  }
  if ((entry.cFileName != ".") && (entry.cFileName != "..")) {
    contents->insert(entry.cFileName);
  }
  while (FindNextFileA(dir, &entry)) {
    if ((entry.cFileName != ".") && (entry.cFileName != "..")) {
      contents->insert(entry.cFileName);
    }
  }
  FindClose(dir);
#else
  DIR* dir = opendir(path.c_str());
  if (dir == nullptr) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INTERNAL,
        (std::string("failed to open directory: ") + path).c_str());
  }
  struct dirent* entry;
  while ((entry = readdir(dir)) != nullptr) {
    std::string entryname = entry->d_name;
    if ((entryname != ".") && (entryname != "..")) {
      contents->insert(entryname);
    }
  }
  closedir(dir);
#endif
  return nullptr;  // success
}

// Set '*exists' to whether 'path' (OS-normalized) is accessible.
TRITONSERVER_Error*
FileExists(const std::string& path, bool* exists)
{
  std::string valid_path;
  GetOSValidPath(path, valid_path);
  *exists = (access(valid_path.c_str(), F_OK) == 0);
  return nullptr;  // success
}

// Read the entire file at 'path' (binary mode) into 'contents'.
TRITONSERVER_Error*
ReadTextFile(const std::string& path, std::string* contents)
{
  std::string valid_path;
  GetOSValidPath(path, valid_path);
  std::ifstream in(valid_path, std::ios::in | std::ios::binary);
  if (!in) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INTERNAL,
        ("failed to open/read file '" + valid_path + "': " + strerror(errno))
            .c_str());
  }
  // Size the string from the end-of-stream position, then read it all.
  in.seekg(0, std::ios::end);
  contents->resize(in.tellg());
  in.seekg(0, std::ios::beg);
  in.read(&(*contents)[0], contents->size());
  in.close();
  return nullptr;  // success
}
// Set '*is_dir' to whether 'path' (OS-normalized) is a directory; error if
// stat() fails.
TRITONSERVER_Error*
IsDirectory(const std::string& path, bool* is_dir)
{
  *is_dir = false;
  std::string valid_path;
  GetOSValidPath(path, valid_path);
  struct stat st;
  if (stat(valid_path.c_str(), &st) != 0) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INTERNAL,
        (std::string("failed to stat file ") + valid_path).c_str());
  }
  *is_dir = S_ISDIR(st.st_mode);
  return nullptr;  // success
}

// Join path segments with '/' separators. An absolute segment ('/'-prefixed)
// is appended without doubling the separator; it does NOT reset the result
// to that segment.
// NOTE(extraction): the initializer_list element type was stripped
// (presumably std::string).
std::string
JoinPath(std::initializer_list segments)
{
  std::string joined;
  for (const auto& seg : segments) {
    if (joined.empty()) {
      joined = seg;
    } else if (!seg.empty() && (seg[0] == '/')) {  // IsAbsolutePath(seg)
      if (joined[joined.size() - 1] == '/') {
        joined.append(seg.substr(1));
      } else {
        joined.append(seg);
      }
    } else {  // !IsAbsolutePath(seg)
      if (joined[joined.size() - 1] != '/') {
        joined.append("/");
      }
      joined.append(seg);
    }
  }
  return joined;
}

// Collect filename -> full-path entries for model version directory
// '<model_repository_path>/<version>', optionally filtering out directories
// or non-directory files.
TRITONSERVER_Error*
ModelPaths(
    const std::string& model_repository_path, uint64_t version,
    const bool ignore_directories, const bool ignore_files,
    std::unordered_map* model_paths)
{
  std::set model_files;
  // Read all the files in 'path' and filter by type for different
  // requirements
  auto path = JoinPath({model_repository_path, std::to_string(version)});
  RETURN_IF_ERROR(GetDirectoryContents(path, &model_files));
  if (ignore_directories) {
    // Erase directory entries...
    for (auto iter = model_files.begin(); iter != model_files.end();) {
      bool is_dir;
      RETURN_IF_ERROR(IsDirectory(JoinPath({path, *iter}), &is_dir));
      if (is_dir) {
        iter = model_files.erase(iter);
      } else {
        ++iter;
      }
    }
  }
  if (ignore_files) {
    // Erase non-directory entries...
    for (auto iter = model_files.begin(); iter != model_files.end();) {
      bool is_dir;
      RETURN_IF_ERROR(IsDirectory(JoinPath({path, *iter}), &is_dir));
      if (!is_dir) {
        iter = model_files.erase(iter);
      } else {
        ++iter;
      }
    }
  }
  for (const auto& filename : model_files) {
    const auto model_path = JoinPath({path, filename});
    model_paths->emplace(
        std::piecewise_construct, std::make_tuple(filename),
        std::make_tuple(model_path));
  }
  return nullptr;  // success
}

// Create a CUDA stream with the given priority on 'device_id', restoring the
// caller's current device afterwards. '*stream' is nullptr on failure (and
// always when GPU support is compiled out).
TRITONSERVER_Error*
CreateCudaStream(
    const int device_id, const int cuda_stream_priority, cudaStream_t* stream)
{
  *stream = nullptr;
#ifdef TRITON_ENABLE_GPU
  // Make sure that correct device is set before creating stream and
  // then restore the device to what was set by the caller.
  // NOTE(extraction): '¤t_device' below is extraction mojibake for
  // '&current_device'.
  int current_device;
  auto cuerr = cudaGetDevice(¤t_device);
  bool overridden = false;
  if (cuerr == cudaSuccess) {
    overridden = (current_device != device_id);
    if (overridden) {
      cuerr = cudaSetDevice(device_id);
    }
  }
  if (cuerr == cudaSuccess) {
    cuerr = cudaStreamCreateWithPriority(
        stream, cudaStreamDefault, cuda_stream_priority);
  }
  if (overridden) {
    cudaSetDevice(current_device);
  }
  if (cuerr != cudaSuccess) {
    *stream = nullptr;
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INTERNAL,
        (std::string("unable to create stream: ") + cudaGetErrorString(cuerr))
            .c_str());
  }
#endif  // TRITON_ENABLE_GPU
  return nullptr;  // success
}

// Parse a signed 64-bit integer from 'value'.
// NOTE(review): only std::invalid_argument is caught; std::out_of_range from
// std::stoll would propagate — confirm intended (same applies to the other
// Parse*Value helpers below).
TRITONSERVER_Error*
ParseLongLongValue(const std::string& value, int64_t* parsed_value)
{
  try {
    *parsed_value = std::stoll(value);
  }
  catch (const std::invalid_argument& ia) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INVALID_ARG,
        (std::string("failed to convert '") + value +
         "' to long long integral number")
            .c_str());
  }
  return nullptr;  // success
}

// Parse an unsigned 64-bit integer from 'value'.
TRITONSERVER_Error*
ParseUnsignedLongLongValue(const std::string& value, uint64_t* parsed_value)
{
  try {
    *parsed_value = std::stoull(value);
  }
  catch (const std::invalid_argument& ia) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INVALID_ARG,
        (std::string("failed to convert '") + value +
         "' to unsigned long long integral number")
            .c_str());
  }
  return nullptr;  // success
}

// Parse a boolean from 'value'; accepts (case-insensitively) "true"/"on"/"1"
// and "false"/"off"/"0".
TRITONSERVER_Error*
ParseBoolValue(const std::string& value, bool* parsed_value)
{
  std::string lvalue = value;
  std::transform(
      lvalue.begin(), lvalue.end(), lvalue.begin(),
      [](unsigned char c) { return std::tolower(c); });
  if ((lvalue == "true") || (lvalue == "on") || (lvalue == "1")) {
    *parsed_value = true;
    return nullptr;  // success
  }
  if ((lvalue == "false") || (lvalue == "off") || (lvalue == "0")) {
    *parsed_value = false;
    return nullptr;  // success
  }
  return TRITONSERVER_ErrorNew(
      TRITONSERVER_ERROR_INVALID_ARG,
      (std::string("failed to convert '") + value + "' to boolean").c_str());
}

// Parse an int from 'value'.
TRITONSERVER_Error*
ParseIntValue(const std::string& value, int* parsed_value)
{
  try {
    *parsed_value = std::stoi(value);
  }
  catch (const std::invalid_argument& ia) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INVALID_ARG,
        (std::string("failed to convert '") + value + "' to integral number")
            .c_str());
  }
  return nullptr;  // success
}

// Parse a double from 'value'.
TRITONSERVER_Error*
ParseDoubleValue(const std::string& value, double* parsed_value)
{
  try {
    *parsed_value = std::stod(value);
  }
  catch (const std::invalid_argument& ia) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INVALID_ARG,
        (std::string("failed to convert '") + value + "' to double number")
            .c_str());
  }
  return nullptr;  // success
}

// Look up model-config parameter 'key' and return its 'string_value' member;
// NOT_FOUND error when the parameter is absent.
TRITONSERVER_Error*
GetParameterValue(
    triton::common::TritonJson::Value& params, const std::string& key,
    std::string* value)
{
  triton::common::TritonJson::Value json_value;
  RETURN_ERROR_IF_FALSE(
      params.Find(key.c_str(), &json_value), TRITONSERVER_ERROR_NOT_FOUND,
      std::string("model configuration is missing the parameter ") + key);
  RETURN_IF_ERROR(json_value.MemberAsString("string_value", value));
  return nullptr;  // success
}

// Parse every 'batch_input' entry of the model config into 'batch_inputs'.
TRITONSERVER_Error*
BatchInput::ParseFromModelConfig(
    triton::common::TritonJson::Value& config,
    std::vector* batch_inputs)
{
  batch_inputs->clear();
  triton::common::TritonJson::Value bis;
  RETURN_IF_ERROR(config.MemberAsArray("batch_input", &bis));
  for (size_t i = 0; i < bis.ArraySize(); ++i) {
    triton::common::TritonJson::Value bi;
    RETURN_IF_ERROR(bis.IndexAsObject(i, &bi));
    batch_inputs->emplace_back();
    RETURN_IF_ERROR(batch_inputs->back().Init(bi));
  }
  return nullptr;  // success
}

// Initialize one BatchInput from its JSON config: target names, kind,
// datatype and source inputs. Errors on unknown kind or datatype strings.
TRITONSERVER_Error*
BatchInput::Init(triton::common::TritonJson::Value& bi_config)
{
  {
    triton::common::TritonJson::Value bi_target_names;
    RETURN_IF_ERROR(bi_config.MemberAsArray("target_name", &bi_target_names));
    for (size_t i = 0; i < bi_target_names.ArraySize(); ++i) {
      std::string tn;
      RETURN_IF_ERROR(bi_target_names.IndexAsString(i, &tn));
      target_names_.emplace_back(std::move(tn));
    }
  }
  {
    RETURN_IF_ERROR(bi_config.MemberAsString("kind", &kind_str_));
    if (kind_str_ == "BATCH_ELEMENT_COUNT") {
      kind_ = Kind::BATCH_ELEMENT_COUNT;
    } else if (kind_str_ == "BATCH_ACCUMULATED_ELEMENT_COUNT") {
      kind_ = Kind::BATCH_ACCUMULATED_ELEMENT_COUNT;
    } else if (kind_str_ == "BATCH_ACCUMULATED_ELEMENT_COUNT_WITH_ZERO") {
      kind_ = Kind::BATCH_ACCUMULATED_ELEMENT_COUNT_WITH_ZERO;
    } else if (kind_str_ == "BATCH_MAX_ELEMENT_COUNT_AS_SHAPE") {
      kind_ = Kind::BATCH_MAX_ELEMENT_COUNT_AS_SHAPE;
    } else if (kind_str_ == "BATCH_ITEM_SHAPE") {
      kind_ = Kind::BATCH_ITEM_SHAPE;
    } else if (kind_str_ == "BATCH_ITEM_SHAPE_FLATTEN") {
      kind_ = Kind::BATCH_ITEM_SHAPE_FLATTEN;
    } else {
      RETURN_ERROR_IF_FALSE(
          false, TRITONSERVER_ERROR_INVALID_ARG,
          std::string("unexpected batch input kind '" + kind_str_ + "'"));
    }
  }
  {
    std::string bi_dtype;
    RETURN_IF_ERROR(bi_config.MemberAsString("data_type", &bi_dtype));
    data_type_ = ModelConfigDataTypeToTritonServerDataType(bi_dtype);
    RETURN_ERROR_IF_TRUE(
        data_type_ == TRITONSERVER_TYPE_INVALID, TRITONSERVER_ERROR_INVALID_ARG,
        std::string("unexpected batch input data type '" + bi_dtype + "'"));
  }
  {
    triton::common::TritonJson::Value bi_source_inputs;
    RETURN_IF_ERROR(
        bi_config.MemberAsArray("source_input", &bi_source_inputs));
    for (size_t i = 0; i < bi_source_inputs.ArraySize(); ++i) {
      std::string si;
      RETURN_IF_ERROR(bi_source_inputs.IndexAsString(i, &si));
      source_inputs_.emplace_back(std::move(si));
    }
  }
  return nullptr;  // success
}

// Map a model-config datatype string (e.g. "TYPE_FP32") to the
// TRITONSERVER_DataType enum; TRITONSERVER_TYPE_INVALID when unrecognized.
// Note "TYPE_STRING" maps to TRITONSERVER_TYPE_BYTES.
TRITONSERVER_DataType
ModelConfigDataTypeToTritonServerDataType(const std::string& data_type_str)
{
  // Must start with "TYPE_".
  if (data_type_str.rfind("TYPE_", 0) != 0) {
    return TRITONSERVER_TYPE_INVALID;
  }
  const std::string dtype = data_type_str.substr(strlen("TYPE_"));
  if (dtype == "BOOL") {
    return TRITONSERVER_TYPE_BOOL;
  } else if (dtype == "UINT8") {
    return TRITONSERVER_TYPE_UINT8;
  } else if (dtype == "UINT16") {
    return TRITONSERVER_TYPE_UINT16;
  } else if (dtype == "UINT32") {
    return TRITONSERVER_TYPE_UINT32;
  } else if (dtype == "UINT64") {
    return TRITONSERVER_TYPE_UINT64;
  } else if (dtype == "INT8") {
    return TRITONSERVER_TYPE_INT8;
  } else if (dtype == "INT16") {
    return TRITONSERVER_TYPE_INT16;
  } else if (dtype == "INT32") {
    return TRITONSERVER_TYPE_INT32;
  } else if (dtype == "INT64") {
    return TRITONSERVER_TYPE_INT64;
  } else if (dtype == "FP16") {
    return TRITONSERVER_TYPE_FP16;
  } else if (dtype == "FP32") {
    return TRITONSERVER_TYPE_FP32;
  } else if (dtype == "FP64") {
    return TRITONSERVER_TYPE_FP64;
  } else if (dtype == "STRING") {
    return TRITONSERVER_TYPE_BYTES;
  } else if (dtype == "BF16") {
    return TRITONSERVER_TYPE_BF16;
  }
  return TRITONSERVER_TYPE_INVALID;
}

// Parse every 'batch_output' entry of the model config into 'batch_outputs'.
// NOTE(extraction): this function is TRUNCATED here — the source chunk ends
// mid-body; the remainder is outside the visible range.
TRITONSERVER_Error*
BatchOutput::ParseFromModelConfig(
    triton::common::TritonJson::Value& config,
    std::vector* batch_outputs)
{
  batch_outputs->clear();
  triton::common::TritonJson::Value bos;
  RETURN_IF_ERROR(config.MemberAsArray("batch_output", &bos));
  for (size_t i = 0; i < bos.ArraySize(); ++i) {
    batch_outputs->emplace_back();
    auto& batch_output = batch_outputs->back();
    triton::common::TritonJson::Value bo;
    RETURN_IF_ERROR(bos.IndexAsObject(i, &bo));
    {
      triton::common::TritonJson::Value bo_target_names;
      RETURN_IF_ERROR(bo.MemberAsArray("target_name", &bo_target_names));
      for (size_t i = 0; i < bo_target_names.ArraySize(); ++i) {
        std::string tn;
RETURN_IF_ERROR(bo_target_names.IndexAsString(i, &tn)); batch_output.target_names_.emplace_back(std::move(tn)); } } { std::string bo_kind; RETURN_IF_ERROR(bo.MemberAsString("kind", &bo_kind)); if (bo_kind == "BATCH_SCATTER_WITH_INPUT_SHAPE") { batch_output.kind_ = Kind::BATCH_SCATTER_WITH_INPUT_SHAPE; // Keep track of the output info for later cross reference with input int64_t mbs = 0; RETURN_IF_ERROR(config.MemberAsInt("max_batch_size", &mbs)); if (mbs != 0) { batch_output.shape_.push_back(-1); } triton::common::TritonJson::Value ios; RETURN_IF_ERROR(config.MemberAsArray("output", &ios)); for (size_t i = 0; i < ios.ArraySize(); i++) { triton::common::TritonJson::Value io; RETURN_IF_ERROR(ios.IndexAsObject(i, &io)); std::string io_name; RETURN_IF_ERROR(io.MemberAsString("name", &io_name)); if (io_name == batch_output.target_names_[0]) { std::string io_dtype; RETURN_IF_ERROR(io.MemberAsString("data_type", &io_dtype)); batch_output.data_type_ = ModelConfigDataTypeToTritonServerDataType(io_dtype); // If a reshape is provided for the input then use that when // validating that the model matches what is expected. 
triton::common::TritonJson::Value reshape; if (io.Find("reshape", &reshape)) { RETURN_IF_ERROR( ParseShape(reshape, "shape", &batch_output.shape_)); } else { RETURN_IF_ERROR(ParseShape(io, "dims", &batch_output.shape_)); } break; } } } else { RETURN_ERROR_IF_FALSE( false, TRITONSERVER_ERROR_INVALID_ARG, std::string("unexpected batch output kind '" + bo_kind + "'")); } } { triton::common::TritonJson::Value bo_source_inputs; RETURN_IF_ERROR(bo.MemberAsArray("source_input", &bo_source_inputs)); for (size_t i = 0; i < bo_source_inputs.ArraySize(); ++i) { std::string si; RETURN_IF_ERROR(bo_source_inputs.IndexAsString(i, &si)); batch_output.source_inputs_.emplace_back(std::move(si)); } } } return nullptr; // success } TRITONSERVER_Error* TryParseModelStringParameter( triton::common::TritonJson::Value& params, const std::string& mkey, std::string* value, const std::string& default_value) { triton::common::TritonJson::Value json_value; if (params.Find(mkey.c_str(), &json_value)) { RETURN_IF_ERROR(json_value.MemberAsString("string_value", value)); } else { *value = default_value; } return nullptr; // success } TRITONSERVER_Error* TryParseModelStringParameter( triton::common::TritonJson::Value& params, const std::string& mkey, int* value, const int& default_value) { triton::common::TritonJson::Value json_value; if (params.Find(mkey.c_str(), &json_value)) { std::string string_value; RETURN_IF_ERROR(json_value.MemberAsString("string_value", &string_value)); return ParseIntValue(string_value, value); } else { *value = default_value; return nullptr; // success } } TRITONSERVER_Error* TryParseModelStringParameter( triton::common::TritonJson::Value& params, const std::string& mkey, bool* value, const bool& default_value) { triton::common::TritonJson::Value json_value; if (params.Find(mkey.c_str(), &json_value)) { std::string string_value; RETURN_IF_ERROR(json_value.MemberAsString("string_value", &string_value)); return ParseBoolValue(string_value, value); } else { *value = 
default_value; return nullptr; // success } } TRITONSERVER_Error* TryParseModelStringParameter( triton::common::TritonJson::Value& params, const std::string& mkey, uint64_t* value, const uint64_t& default_value) { triton::common::TritonJson::Value json_value; if (params.Find(mkey.c_str(), &json_value)) { std::string string_value; RETURN_IF_ERROR(json_value.MemberAsString("string_value", &string_value)); return ParseUnsignedLongLongValue(string_value, value); } else { *value = default_value; return nullptr; // success } } namespace { template TRITONSERVER_Error* BufferAsTypedString( std::string& str, const char* buffer, const size_t element_cnt) { const T* vals = reinterpret_cast(buffer); str += "[ "; for (size_t i = 0; i < element_cnt; ++i) { const T& v = vals[i]; if (i != 0) { str += ", "; } str += std::to_string(v); } str += " ]"; return nullptr; // success } } // namespace TRITONSERVER_Error* BufferAsTypedString( std::string& str, const char* buffer, size_t buffer_byte_size, TRITONSERVER_DataType datatype) { const size_t element_cnt = buffer_byte_size / TRITONSERVER_DataTypeByteSize(datatype); switch (datatype) { case TRITONSERVER_TYPE_UINT8: return BufferAsTypedString(str, buffer, element_cnt); case TRITONSERVER_TYPE_UINT16: return BufferAsTypedString(str, buffer, element_cnt); case TRITONSERVER_TYPE_UINT32: return BufferAsTypedString(str, buffer, element_cnt); case TRITONSERVER_TYPE_UINT64: return BufferAsTypedString(str, buffer, element_cnt); case TRITONSERVER_TYPE_INT8: return BufferAsTypedString(str, buffer, element_cnt); case TRITONSERVER_TYPE_INT16: return BufferAsTypedString(str, buffer, element_cnt); case TRITONSERVER_TYPE_INT32: return BufferAsTypedString(str, buffer, element_cnt); case TRITONSERVER_TYPE_INT64: return BufferAsTypedString(str, buffer, element_cnt); case TRITONSERVER_TYPE_FP32: return BufferAsTypedString(str, buffer, element_cnt); case TRITONSERVER_TYPE_FP64: return BufferAsTypedString(str, buffer, element_cnt); default: return 
TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INVALID_ARG, std::string( std::string("class result not available for output due to " "unsupported type '") + std::string(TRITONSERVER_DataTypeString(datatype)) + "'") .c_str()); } return nullptr; // success } std::string GetRequestId(TRITONBACKEND_Request* request) { const char* request_id = nullptr; LOG_IF_ERROR( TRITONBACKEND_RequestId(request, &request_id), "unable to retrieve request ID string"); if ((request_id == nullptr) || (request_id[0] == '\0')) { request_id = ""; } return std::string("[request id: ") + request_id + "] "; } TRITONSERVER_Error* ValidateStringBuffer( const char* buffer, size_t buffer_byte_size, const size_t expected_element_cnt, const char* input_name, std::vector>* str_list) { size_t element_idx = 0; size_t remaining_bytes = buffer_byte_size; // Each string in 'buffer' is a 4-byte length followed by the string itself // with no null-terminator. while (remaining_bytes >= sizeof(uint32_t)) { // Do not modify this line. str_list->size() must not exceed // expected_element_cnt. 
if (element_idx >= expected_element_cnt) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INVALID_ARG, std::string( "unexpected number of string elements " + std::to_string(element_idx + 1) + " for inference input '" + input_name + "', expecting " + std::to_string(expected_element_cnt)) .c_str()); } const uint32_t len = *(reinterpret_cast(buffer)); remaining_bytes -= sizeof(uint32_t); buffer += sizeof(uint32_t); if (remaining_bytes < len) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INVALID_ARG, std::string( "incomplete string data for inference input '" + std::string(input_name) + "', expecting string of length " + std::to_string(len) + " but only " + std::to_string(remaining_bytes) + " bytes available") .c_str()); } if (str_list) { str_list->push_back({buffer, len}); } buffer += len; remaining_bytes -= len; element_idx++; } if (element_idx != expected_element_cnt) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INTERNAL, std::string( "expected " + std::to_string(expected_element_cnt) + " strings for inference input '" + input_name + "', got " + std::to_string(element_idx)) .c_str()); } return nullptr; } TRITONSERVER_Error* GetOSValidPath(const std::string& path, std::string& ret_path) { std::string l_path(path); #ifdef _WIN32 constexpr const const char* kWindowsLongPathPrefix = "\\\\?\\"; // On Windows long paths must be marked correctly otherwise, due to backwards // compatibility, all paths are limited to MAX_PATH length if (l_path.size() >= MAX_PATH) { // Must be prefixed with "\\?\" to be considered long path if (l_path.substr(0, 4) != (kWindowsLongPathPrefix)) { // Long path but not "tagged" correctly l_path = (kWindowsLongPathPrefix) + l_path; } } std::replace(l_path.begin(), l_path.end(), '/', '\\'); #endif ret_path = l_path; return nullptr; } }} // namespace triton::backend ================================================ FILE: src/backend_input_collector.cc ================================================ // Copyright 2019-2025, NVIDIA 
CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "triton/backend/backend_input_collector.h"

#include <atomic>

#include "triton/backend/backend_common.h"

#ifdef TRITON_ENABLE_GPU
#include "kernel.h"
#endif  // TRITON_ENABLE_GPU

namespace triton { namespace backend {

//
// BackendInputCollector::InputIterator
//
BackendInputCollector::InputIterator::InputIterator(
    TRITONBACKEND_Request** requests, const uint32_t request_count,
    std::vector<TRITONBACKEND_Response*>* responses, const char* input_name,
    const char* host_policy_name, const bool coalesce_request_input)
    : requests_(requests), request_count_(request_count),
      responses_(responses), input_name_(input_name),
      host_policy_(host_policy_name),
      coalesce_request_input_(coalesce_request_input), curr_request_idx_(0),
      curr_buffer_idx_(0), reach_end_(false)
{
  // Cache the input handle and buffer count for the first request so that
  // GetNextContiguousInput() can start iterating immediately.
  auto& response = (*responses_)[curr_request_idx_];
  RESPOND_AND_SET_NULL_IF_ERROR(
      &response, TRITONBACKEND_RequestInput(
                     requests_[curr_request_idx_], input_name_, &curr_input_));
  RESPOND_AND_SET_NULL_IF_ERROR(
      &response, TRITONBACKEND_InputPropertiesForHostPolicy(
                     curr_input_, host_policy_, nullptr, nullptr, nullptr,
                     nullptr, nullptr, &curr_buffer_cnt_));
}

// Produce the next maximal run of input buffers that are contiguous in
// memory (possibly spanning requests when coalescing is enabled). Returns
// false when all requests have been consumed.
bool
BackendInputCollector::InputIterator::GetNextContiguousInput(
    ContiguousBuffer* input)
{
  if (reach_end_ || (curr_buffer_idx_ >= curr_buffer_cnt_)) {
    return false;
  }

  // Advance to the next request, refreshing the cached input handle and
  // buffer count; sets 'reach_end_' once all requests are consumed. This
  // factoring replaces three identical inline copies of the same logic.
  const auto advance_request = [this]() {
    ++curr_request_idx_;
    if (curr_request_idx_ < request_count_) {
      auto& response = (*responses_)[curr_request_idx_];
      RESPOND_AND_SET_NULL_IF_ERROR(
          &response,
          TRITONBACKEND_RequestInput(
              requests_[curr_request_idx_], input_name_, &curr_input_));
      RESPOND_AND_SET_NULL_IF_ERROR(
          &response, TRITONBACKEND_InputPropertiesForHostPolicy(
                         curr_input_, host_policy_, nullptr, nullptr, nullptr,
                         nullptr, nullptr, &curr_buffer_cnt_));
      // reset buffer idx
      curr_buffer_idx_ = 0;
    } else {
      reach_end_ = true;
    }
  };

  // Get the first buffer
  TRITONBACKEND_InputBufferForHostPolicy(
      curr_input_, host_policy_, curr_buffer_idx_,
      reinterpret_cast<const void**>(&input->memory_desc_.buffer_),
      reinterpret_cast<uint64_t*>(&input->memory_desc_.byte_size_),
      &input->memory_desc_.memory_type_, &input->memory_desc_.memory_type_id_);
  ++curr_buffer_idx_;
  input->start_request_idx_ = curr_request_idx_;
  input->end_request_idx_ = curr_request_idx_;

  if (!coalesce_request_input_) {
    // Without coalescing each buffer is reported individually; just step
    // to the next request when the current one is exhausted.
    if (curr_buffer_idx_ >= curr_buffer_cnt_) {
      advance_request();
    }
    return true;
  }

  // With coalescing, keep absorbing buffers (within and across requests)
  // for as long as they directly continue the current memory region.
  do {
    for (; curr_buffer_idx_ < curr_buffer_cnt_; ++curr_buffer_idx_) {
      const void* next_buffer;
      size_t next_buffer_byte_size;
      TRITONSERVER_MemoryType next_memory_type;
      int64_t next_memory_type_id;
      TRITONBACKEND_InputBufferForHostPolicy(
          curr_input_, host_policy_, curr_buffer_idx_, &next_buffer,
          reinterpret_cast<uint64_t*>(&next_buffer_byte_size),
          &next_memory_type, &next_memory_type_id);
      if (((input->memory_desc_.buffer_ + input->memory_desc_.byte_size_) !=
           next_buffer) ||
          (input->memory_desc_.memory_type_ != next_memory_type) ||
          (input->memory_desc_.memory_type_id_ != next_memory_type_id)) {
        return true;
      }
      input->memory_desc_.byte_size_ += next_buffer_byte_size;
      input->end_request_idx_ = curr_request_idx_;
    }
    // Iterated all buffers for current request, check next
    advance_request();
  } while (!reach_end_);

  return true;
}

//
// BackendInputCollector
//

// Return true if all buffers for 'input_name' across all requests form one
// contiguous region. '*buffer' / '*memory_type' / '*memory_type_id'
// describe the first buffer, and '*buffer_byte_size' accumulates the total
// size even when the region is not contiguous.
bool
BackendInputCollector::GetInputBufferIfContiguous(
    const char* input_name, const char** buffer, size_t* buffer_byte_size,
    TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id)
{
  *buffer = nullptr;
  *buffer_byte_size = 0;
  const char* expected_next_buffer = nullptr;
  bool contiguous = true;
  // Use distinct index names for the two loops; the original shadowed
  // 'idx' in the inner loop.
  for (size_t req_idx = 0; req_idx < request_count_; req_idx++) {
    auto& request = requests_[req_idx];
    auto& response = (*responses_)[req_idx];
    TRITONBACKEND_Input* input;
    RESPOND_AND_SET_NULL_IF_ERROR(
        &response, TRITONBACKEND_RequestInput(request, input_name, &input));
    uint64_t byte_size;
    uint32_t buffer_count;
    RESPOND_AND_SET_NULL_IF_ERROR(
        &response, TRITONBACKEND_InputPropertiesForHostPolicy(
                       input, host_policy_cstr_, nullptr, nullptr, nullptr,
                       nullptr, &byte_size, &buffer_count));
    for (size_t buf_idx = 0; buf_idx < buffer_count; ++buf_idx) {
      const void* src_buffer;
      size_t src_byte_size;
      TRITONSERVER_MemoryType src_memory_type;
      int64_t src_memory_type_id;
      RESPOND_AND_SET_NULL_IF_ERROR(
          &response, TRITONBACKEND_InputBufferForHostPolicy(
                         input, host_policy_cstr_, buf_idx, &src_buffer,
                         reinterpret_cast<uint64_t*>(&src_byte_size),
                         &src_memory_type, &src_memory_type_id));
      if (*buffer != nullptr) {
        // A first buffer has been seen: this buffer keeps the region
        // contiguous only if coalescing is enabled and it starts exactly
        // where the previous buffer ended, in the same memory.
        if (coalesce_request_input_ && (expected_next_buffer == src_buffer) &&
            (*memory_type == src_memory_type) &&
            (*memory_type_id == src_memory_type_id)) {
          expected_next_buffer += src_byte_size;
        } else {
          contiguous = false;
        }
        // Want to know total buffer byte size even if it is not contiguous
        *buffer_byte_size += src_byte_size;
      } else {
        *buffer = reinterpret_cast<const char*>(src_buffer);
        *memory_type = src_memory_type;
        *memory_type_id = src_memory_type_id;
        *buffer_byte_size = src_byte_size;
        expected_next_buffer = *buffer + src_byte_size;
      }
    }
  }
  return contiguous;
}

// Gather all request buffers for 'input_name' into 'buffer' (already
// allocated, 'buffer_byte_size' bytes in the given memory). Copies may be
// staged through pinned memory or a CUDA gather kernel and finished later
// in Finalize().
void
BackendInputCollector::ProcessTensor(
    const char* input_name, char* buffer, const size_t buffer_byte_size,
    const TRITONSERVER_MemoryType memory_type, const int64_t memory_type_id)
{
  // A value of CPU_PINNED indicates that pinned memory buffer is not
  // needed for this tensor. Any other value indicates that a pinned
  // memory buffer is needed when the target memory type matches
  // 'use_pinned_memory_type'.
  TRITONSERVER_MemoryType use_pinned_memory_type =
      TRITONSERVER_MEMORY_CPU_PINNED;
  if (pinned_enabled_) {
    use_pinned_memory_type = GetUsePinnedMemoryType(memory_type);
  }
  const bool use_kernel = (kernel_buffer_threshold_ != 0);

  size_t buffer_offset = 0;

  InputIterator ii(
      requests_, request_count_, responses_, input_name, host_policy_cstr_,
      coalesce_request_input_);
  ContiguousBuffer input;
  while (ii.GetNextContiguousInput(&input)) {
    // If there are pending copies from tensor buffer that is not
    // contiguous with 'response's part of that buffer, then need to
    // go ahead and perform the pending copies so that can start a new
    // contiguous region if necessary.
    if ((pending_pinned_byte_size_ > 0) &&
        (buffer_offset !=
         (pending_pinned_byte_size_ + pending_pinned_offset_))) {
      need_sync_ |= FlushPendingPinned(
          buffer, buffer_byte_size, memory_type, memory_type_id);
    }
    if ((pending_copy_kernel_buffer_byte_size_ > 0) &&
        (buffer_offset != (pending_copy_kernel_buffer_byte_size_ +
                           pending_copy_kernel_buffer_offset_))) {
      need_sync_ |= FlushPendingCopyKernel(
          buffer, buffer_byte_size, memory_type, memory_type_id);
    }

    need_sync_ |= SetInputTensor(
        input_name, input, buffer, buffer_byte_size, memory_type,
        memory_type_id, buffer_offset, use_pinned_memory_type, use_kernel,
        true);

    buffer_offset += input.memory_desc_.byte_size_;
  }

  // Done with the tensor, flush any pending pinned copies.
  need_sync_ |= FlushPendingPinned(
      buffer, buffer_byte_size, memory_type, memory_type_id);
  need_sync_ |= FlushPendingCopyKernel(
      buffer, buffer_byte_size, memory_type, memory_type_id);

#ifdef TRITON_ENABLE_GPU
  if (need_sync_ && (event_ != nullptr)) {
    cudaEventRecord(event_, stream_);
  }
#endif  // TRITON_ENABLE_GPU
}

// Variant of ProcessTensor that can allocate the destination itself: when
// 'buffer' is nullptr the input is either returned in place (if already
// contiguous in an allowed memory) or gathered into a newly allocated
// buffer of one of the 'allowed_input_types'. The result is described by
// the 'dst_*' out parameters.
TRITONSERVER_Error*
BackendInputCollector::ProcessTensor(
    const char* input_name, char* buffer, const size_t buffer_byte_size,
    const std::vector<std::pair<TRITONSERVER_MemoryType, int64_t>>&
        allowed_input_types,
    const char** dst_buffer, size_t* dst_buffer_byte_size,
    TRITONSERVER_MemoryType* dst_memory_type, int64_t* dst_memory_type_id)
{
  if (buffer == nullptr) {
    if (allowed_input_types.size() == 0) {
      return TRITONSERVER_ErrorNew(
          TRITONSERVER_ERROR_INTERNAL,
          "'allowed_input_types' must contain at least one pair of memory "
          "type and id");
    }
    if (GetInputBufferIfContiguous(
            input_name, dst_buffer, dst_buffer_byte_size, dst_memory_type,
            dst_memory_type_id)) {
      // zero size buffer will be treated as contiguous as well,
      // but we want to invoke backend memory to have a valid address.
      if (*dst_buffer_byte_size != 0) {
        // If the buffer is contiguous, check if the caller expects its type
        for (const auto& allowed_type : allowed_input_types) {
          if ((*dst_memory_type == allowed_type.first) &&
              ((*dst_memory_type_id == allowed_type.second))) {
            return nullptr;  // success
          }
        }
      }
    }
    // A separate buffer is needed
    BackendMemory* backend_memory = nullptr;
    for (const auto& allowed_type : allowed_input_types) {
      std::vector<BackendMemory::AllocationType> alloc_types;
      const int64_t memory_type_id = allowed_type.second;
      switch (allowed_type.first) {
        case TRITONSERVER_MEMORY_GPU:
          alloc_types = {
              BackendMemory::AllocationType::GPU_POOL,
              BackendMemory::AllocationType::GPU};
          break;
        case TRITONSERVER_MEMORY_CPU_PINNED:
          alloc_types = {
              BackendMemory::AllocationType::CPU_PINNED_POOL,
              BackendMemory::AllocationType::CPU_PINNED};
          break;
        case TRITONSERVER_MEMORY_CPU:
          alloc_types = {BackendMemory::AllocationType::CPU};
          break;
      }
      auto err = BackendMemory::Create(
          memory_manager_, alloc_types, memory_type_id, *dst_buffer_byte_size,
          &backend_memory);
      if (err != nullptr) {
        // Allocation failure for this type is not fatal; try the next
        // allowed type.
        LOG_MESSAGE(
            TRITONSERVER_LOG_VERBOSE,
            (std::string("unable to create backend memory for type: ") +
             TRITONSERVER_MemoryTypeString(allowed_type.first) +
             " id: " + std::to_string(memory_type_id) + ": " +
             TRITONSERVER_ErrorMessage(err))
                .c_str());
        TRITONSERVER_ErrorDelete(err);
      } else {
        in_use_memories_.emplace_back(backend_memory);
        break;
      }
    }
    if (backend_memory == nullptr) {
      return TRITONSERVER_ErrorNew(
          TRITONSERVER_ERROR_INTERNAL,
          (std::string("failed to allocate contiguous buffer for input '") +
           input_name + "'")
              .c_str());
    }
    buffer = backend_memory->MemoryPtr();
    *dst_buffer = backend_memory->MemoryPtr();
    *dst_buffer_byte_size = backend_memory->ByteSize();
    *dst_memory_type = backend_memory->MemoryType();
    *dst_memory_type_id = backend_memory->MemoryTypeId();
  } else {
    if (allowed_input_types.size() != 1) {
      return TRITONSERVER_ErrorNew(
          TRITONSERVER_ERROR_INTERNAL,
          "'allowed_input_types' must only contain the memory type and id of "
          "'buffer'");
    }
    *dst_buffer = buffer;
    *dst_buffer_byte_size = buffer_byte_size;
    *dst_memory_type = allowed_input_types[0].first;
    *dst_memory_type_id = allowed_input_types[0].second;
  }

  if (*dst_buffer_byte_size != 0) {
    ProcessTensor(
        input_name, buffer, *dst_buffer_byte_size, *dst_memory_type,
        *dst_memory_type_id);
  }
  return nullptr;  // success
}

// Complete all outstanding (deferred/async) copies started by
// ProcessTensor. Returns true if a CUDA copy occurred so the caller knows
// a stream synchronization is still required before using the tensors.
bool
BackendInputCollector::Finalize()
{
#ifdef TRITON_ENABLE_GPU
  if ((!deferred_pinned_.empty()) && need_sync_) {
    if (event_ != nullptr) {
      cudaEventSynchronize(event_);
    } else {
      cudaStreamSynchronize(stream_);
    }
    need_sync_ = false;
  }
#endif  // TRITON_ENABLE_GPU

  // After the above sync all the GPU->pinned copies are complete. Any
  // deferred copies of pinned->CPU can now be done.
#ifdef TRITON_ENABLE_GPU
  if (buffer_ready_event_ != nullptr) {
    cudaEventSynchronize(buffer_ready_event_);
    buffer_ready_event_ = nullptr;
  }
#endif  // TRITON_ENABLE_GPU
  for (auto& def : deferred_pinned_) {
    if (!def.finalized_) {
      need_sync_ |= def.Finalize(stream_);
    }
  }
  for (size_t i = 0; i < async_task_count_; i++) {
    need_sync_ |= completion_queue_.Get();
  }

#ifdef TRITON_ENABLE_GPU
  // Record the new event location if deferred copies occur
  if ((!deferred_pinned_.empty()) && need_sync_ && (event_ != nullptr)) {
    cudaEventRecord(event_, stream_);
  }
#endif  // TRITON_ENABLE_GPU

  return need_sync_;
}

// Perform the deferred pinned->tensor copy for this staging buffer. On
// copy failure every pending response covered by this buffer is sent as
// an error response. Returns true if a CUDA copy was issued.
bool
BackendInputCollector::DeferredPinned::Finalize(cudaStream_t stream)
{
  bool cuda_used = false;
  auto err = CopyBuffer(
      "pinned buffer", TRITONSERVER_MEMORY_CPU_PINNED, 0, tensor_memory_type_,
      tensor_memory_id_, pinned_memory_size_, pinned_memory_,
      tensor_buffer_ + tensor_buffer_offset_, stream, &cuda_used);

  // If something goes wrong with the copy all the pending
  // responses fail...
  if (err != nullptr) {
    for (auto& pr : requests_) {
      for (size_t idx = pr.start_request_idx_; idx <= pr.end_request_idx_;
           ++idx) {
        if ((*responses_)[idx] != nullptr) {
          LOG_IF_ERROR(
              TRITONBACKEND_ResponseSend(
                  (*responses_)[idx], TRITONSERVER_RESPONSE_COMPLETE_FINAL,
                  err),
              "failed to send error response");
          (*responses_)[idx] = nullptr;
        }
      }
    }
    TRITONSERVER_ErrorDelete(err);
  }
  return cuda_used;
}

// Copy one contiguous input region into the destination tensor buffer at
// 'tensor_buffer_offset'. Depending on source/destination memory types the
// copy is queued for pinned-memory staging, queued for the CUDA gather
// kernel, or performed directly. Returns true if a CUDA copy was issued.
bool
BackendInputCollector::SetInputTensor(
    const char* input_name, const ContiguousBuffer& input, char* tensor_buffer,
    const size_t tensor_buffer_byte_size,
    const TRITONSERVER_MemoryType tensor_memory_type,
    const int64_t tensor_memory_type_id, const size_t tensor_buffer_offset,
    const TRITONSERVER_MemoryType use_pinned_memory_type,
    const bool use_kernel, const bool wait_buffer)
{
  bool cuda_copy = false;

  if ((tensor_buffer_offset + input.memory_desc_.byte_size_) >
      tensor_buffer_byte_size) {
    for (size_t i = input.start_request_idx_; i <= input.end_request_idx_;
         ++i) {
      RESPOND_AND_SET_NULL_IF_ERROR(
          &(*responses_)[i],
          TRITONSERVER_ErrorNew(
              TRITONSERVER_ERROR_INVALID_ARG,
              std::string(
                  "unexpected total byte size " +
                  std::to_string(
                      tensor_buffer_offset + input.memory_desc_.byte_size_) +
                  " for input '" + input_name + "', expecting " +
                  std::to_string(tensor_buffer_byte_size))
                  .c_str()));
    }
    return cuda_copy;
  }

  // If the request buffer matches the memory type that should use an
  // intermediate pinned memory buffer for the transfer, then just
  // record the input as pending and increase the size required for
  // the intermediate pinned buffer. We only do this check for the
  // first buffer of an input and apply the same policy for all
  // buffers. So if an inputs data is split over different memory
  // types this may not be ideal but that should be a very rare
  // situation.
  if ((use_pinned_memory_type != TRITONSERVER_MEMORY_CPU_PINNED) &&
      (input.memory_desc_.memory_type_ == use_pinned_memory_type)) {
    if (pending_pinned_byte_size_ == 0) {
      pending_pinned_offset_ = tensor_buffer_offset;
    }
    pending_pinned_byte_size_ += input.memory_desc_.byte_size_;
    pending_pinned_input_buffers_.push_back(input);
    return cuda_copy;
  }

  // [FIXME] support other direction if prove to be faster, all kernel
  // handling code in this class assumes the destination buffer is on device
  // If the request buffer and the destination buffer are accessible by all
  // GPUs (i.e. pinned, device), initiate the copy via copy CUDA kernel.
  // We only do this check for the
  // first buffer of an input and apply the same policy for all
  // buffers. So if an inputs data is split over different memory
  // types this may not be ideal but that should be a very rare
  // situation.
  // Currently checked direction:
  // pinned -> device
  // same device -> device
  // different device -> device
  if (use_kernel &&
      (input.memory_desc_.memory_type_ != TRITONSERVER_MEMORY_CPU) &&
      (tensor_memory_type == TRITONSERVER_MEMORY_GPU)) {
    // [FIXME] Currently not allowing copy between devices as it requires
    // peer-to-peer access to be enabled. Peer-to-peer is enabled by default,
    // but server can still run even if it fails to enable peer-to-peer.
    // Should provide a utility to check whether a device pair allows direct
    // access and use gather kernel accordingly
    if ((input.memory_desc_.memory_type_ != TRITONSERVER_MEMORY_GPU) ||
        (input.memory_desc_.memory_type_id_ == tensor_memory_type_id)) {
      if (pending_copy_kernel_buffer_byte_size_ == 0) {
        pending_copy_kernel_buffer_offset_ = tensor_buffer_offset;
      }
      pending_copy_kernel_buffer_byte_size_ += input.memory_desc_.byte_size_;
      ++pending_copy_kernel_input_buffer_counts_;
      pending_copy_kernel_input_buffers_.push_back(input);
      return cuda_copy;
    }
  }

#ifdef TRITON_ENABLE_GPU
  if (wait_buffer && (buffer_ready_event_ != nullptr)) {
    cudaEventSynchronize(buffer_ready_event_);
    buffer_ready_event_ = nullptr;
  }
#endif  // TRITON_ENABLE_GPU

  // Direct copy without intermediate pinned memory.
  bool cuda_used = false;
  auto err = CopyBuffer(
      input_name, input.memory_desc_.memory_type_,
      input.memory_desc_.memory_type_id_, tensor_memory_type,
      tensor_memory_type_id, input.memory_desc_.byte_size_,
      input.memory_desc_.buffer_, tensor_buffer + tensor_buffer_offset,
      stream_, &cuda_used, copy_on_stream_);
  if (err != nullptr) {
    for (size_t i = input.start_request_idx_; i <= input.end_request_idx_;
         ++i) {
      RESPOND_AND_SET_NULL_IF_ERROR(
          &(*responses_)[i],
          TRITONSERVER_ErrorNew(
              TRITONSERVER_ErrorCode(err), TRITONSERVER_ErrorMessage(err)));
    }
    TRITONSERVER_ErrorDelete(err);
  }
  cuda_copy |= cuda_used;

  return cuda_copy;
}

// Flush all inputs queued for pinned-memory staging: allocate a pinned
// buffer, copy the pending inputs into it, then copy (or defer copying)
// the staged data into the destination tensor buffer.
bool
BackendInputCollector::FlushPendingPinned(
    char* tensor_buffer, const size_t tensor_buffer_byte_size,
    const TRITONSERVER_MemoryType tensor_memory_type,
    const int64_t tensor_memory_type_id)
{
  bool cuda_copy = false;

  // Will be copying from CPU->pinned->GPU or GPU->pinned->CPU

  // Attempt to allocate a pinned buffer to use for staging the
  // copy... if we fail to allocate the pinned buffer then we just
  // directly go CPU->GPU or GPU->CPU.
char* pinned_memory = nullptr; int64_t pinned_memory_type_id = 0; TRITONSERVER_MemoryType pinned_memory_type; BackendMemory* backend_memory; if (pending_pinned_byte_size_ > 0) { TRITONSERVER_Error* err = BackendMemory::Create( memory_manager_, {BackendMemory::AllocationType::CPU_PINNED_POOL, BackendMemory::AllocationType::CPU_PINNED}, 0 /* memory_type_id */, pending_pinned_byte_size_, &backend_memory); if (err != nullptr) { TRITONSERVER_ErrorDelete(err); } else { pinned_memory = backend_memory->MemoryPtr(); pinned_memory_type = backend_memory->MemoryType(); pinned_memory_type_id = backend_memory->MemoryTypeId(); } } // If the pinned buffer wasn't actually allocated then just perform // a direct copy. if (pinned_memory == nullptr) { size_t offset = 0; for (auto& pr : pending_pinned_input_buffers_) { cuda_copy |= SetInputTensor( "pinned fallback", pr, tensor_buffer, tensor_buffer_byte_size, tensor_memory_type, tensor_memory_type_id, pending_pinned_offset_ + offset, TRITONSERVER_MEMORY_CPU_PINNED, false, true); offset += pr.memory_desc_.byte_size_; } } // We have a pinned buffer so copy the pending input buffer(s) into // the pinned memory. else { // pinned_memory_type == TRITONSERVER_MEMORY_CPU_PINNED bool cuda_used = false; size_t offset = 0; if (!use_async_cpu_copy_) { for (auto& pr : pending_pinned_input_buffers_) { cuda_used |= SetInputTensor( "pinned H2H", pr, pinned_memory, pending_pinned_byte_size_, TRITONSERVER_MEMORY_CPU_PINNED, 0 /* memory_type_id */, offset, TRITONSERVER_MEMORY_CPU_PINNED, false, true); offset += pr.memory_desc_.byte_size_; } cuda_copy |= cuda_used; // If the copy was not async (i.e. if request input was in CPU so // a CPU->CPU-PINNED copy was performed above), then the pinned // buffer now holds the tensor contents and we can immediately // issue the copies from the pinned buffer to the tensor. 
// // Otherwise the GPU->CPU-PINNED async copies are in flight and we // simply remember the pinned buffer and the corresponding // request inputs so that we can do the pinned->CPU copies in // finalize after we have waited for all async copies to complete. if (!cuda_used) { #ifdef TRITON_ENABLE_GPU if (buffer_ready_event_ != nullptr) { cudaEventSynchronize(buffer_ready_event_); buffer_ready_event_ = nullptr; } #endif // TRITON_ENABLE_GPU auto err = CopyBuffer( "pinned input buffer H2D", TRITONSERVER_MEMORY_CPU_PINNED, 0 /* memory_type_id */, tensor_memory_type, tensor_memory_type_id, pending_pinned_byte_size_, pinned_memory, tensor_buffer + pending_pinned_offset_, stream_, &cuda_used, copy_on_stream_); cuda_copy |= cuda_used; // If something goes wrong with the copy all the pending // responses fail... if (err != nullptr) { for (auto& pr : pending_pinned_input_buffers_) { for (size_t idx = pr.start_request_idx_; idx <= pr.end_request_idx_; ++idx) { if ((*responses_)[idx] != nullptr) { LOG_IF_ERROR( TRITONBACKEND_ResponseSend( (*responses_)[idx], TRITONSERVER_RESPONSE_COMPLETE_FINAL, err), "failed to send error response"); (*responses_)[idx] = nullptr; } } } TRITONSERVER_ErrorDelete(err); } } else { // cuda_used deferred_pinned_.emplace_back( pinned_memory, pending_pinned_byte_size_, tensor_buffer, pending_pinned_offset_, tensor_memory_type, tensor_memory_type_id, std::move(pending_pinned_input_buffers_), responses_); } } else { async_task_count_++; deferred_pinned_.emplace_back( pinned_memory, pending_pinned_byte_size_, tensor_buffer, pending_pinned_offset_, tensor_memory_type, tensor_memory_type_id, std::move(pending_pinned_input_buffers_), responses_); auto& deferred_pinned = deferred_pinned_.back(); // Mark finalized to avoid duplicated call to DeferredPinned::Finalized() // in BackendInputCollector::Finalize() deferred_pinned_.back().finalized_ = true; auto incomplete_count = new std::atomic(std::min( deferred_pinned_.back().requests_.size(), 
        triton::common::AsyncWorkQueue::WorkerCount()));
    auto pending_pinned_byte_size = pending_pinned_byte_size_;
    // Number of pending inputs handled by each worker task (ceil-div over
    // the worker count).
    size_t stride = (deferred_pinned_.back().requests_.size() +
                     triton::common::AsyncWorkQueue::WorkerCount() - 1) /
                    triton::common::AsyncWorkQueue::WorkerCount();
    auto pending_it = deferred_pinned_.back().requests_.begin();
    while (pending_it != deferred_pinned_.back().requests_.end()) {
      // Advance end_it by 'stride' entries, accumulating the byte offset
      // where the next segment starts.
      auto end_it = pending_it;
      auto next_offset = offset;
      for (size_t idx = 0; idx < stride; idx++) {
        next_offset += end_it->memory_desc_.byte_size_;
        end_it++;
        if (end_it == deferred_pinned_.back().requests_.end()) {
          break;
        }
      }
      auto err =
          CommonErrorToTritonError(triton::common::AsyncWorkQueue::AddTask(
              [this, offset, pinned_memory, pinned_memory_type,
               pending_pinned_byte_size, pinned_memory_type_id, pending_it,
               end_it, incomplete_count, &deferred_pinned]() mutable {
                // Gather this segment of inputs into the pinned buffer.
                for (; pending_it != end_it; pending_it++) {
                  SetInputTensor(
                      "pinned async H2H", *pending_it, pinned_memory,
                      pending_pinned_byte_size, pinned_memory_type,
                      pinned_memory_type_id, offset,
                      TRITONSERVER_MEMORY_CPU_PINNED, false, false);
                  offset += pending_it->memory_desc_.byte_size_;
                }
                // The last segmented task will start the next phase of
                // the internal pinned buffer copy
                if (incomplete_count->fetch_sub(1) == 1) {
#ifdef TRITON_ENABLE_GPU
                  if (buffer_ready_event_ != nullptr) {
                    cudaEventSynchronize(buffer_ready_event_);
                    buffer_ready_event_ = nullptr;
                  }
#endif  // TRITON_ENABLE_GPU
                  completion_queue_.Put(deferred_pinned.Finalize(stream_));
                  delete incomplete_count;
                }
              }));
      if (err != nullptr) {
        // Task submission failed: fail every request covered by this
        // segment.
        for (; pending_it != end_it; pending_it++) {
          for (size_t idx = pending_it->start_request_idx_;
               idx <= pending_it->end_request_idx_; ++idx) {
            if ((*responses_)[idx] != nullptr) {
              LOG_IF_ERROR(
                  TRITONBACKEND_ResponseSend(
                      (*responses_)[idx], TRITONSERVER_RESPONSE_COMPLETE_FINAL,
                      err),
                  "failed to send error response");
              (*responses_)[idx] = nullptr;
            }
          }
        }
      }
      TRITONSERVER_ErrorDelete(err);
      offset = next_offset;
      pending_it = end_it;
    }
  }
}
//
// Pending pinned copies are handled...
pending_pinned_byte_size_ = 0;
pending_pinned_offset_ = 0;
pending_pinned_input_buffers_.clear();

// Need to hold on to the allocated pinned buffer as there are still
// copies in flight. Will delete it in finalize.
if (pinned_memory != nullptr) {
  in_use_memories_.emplace_back(backend_memory);
}

return cuda_copy;
}

// Computes the shape of the generated tensor for 'batch_input' across all
// currently collected requests.
// NOTE(review): template arguments on std::vector appear to have been
// stripped by extraction (likely std::vector<int64_t>) -- restore from
// upstream before compiling.
TRITONSERVER_Error*
BackendInputCollector::BatchInputShape(
    const BatchInput& batch_input, std::vector* shape)
{
*shape = std::vector{0};
switch (batch_input.BatchInputKind()) {
  case BatchInput::Kind::BATCH_ELEMENT_COUNT:
  case BatchInput::Kind::BATCH_ACCUMULATED_ELEMENT_COUNT: {
    // One value per request.
    (*shape)[0] = request_count_;
    break;
  }
  case BatchInput::Kind::BATCH_ACCUMULATED_ELEMENT_COUNT_WITH_ZERO: {
    // Leading zero plus one value per request.
    (*shape)[0] = request_count_ + 1;
    break;
  }
  case BatchInput::Kind::BATCH_MAX_ELEMENT_COUNT_AS_SHAPE: {
    // Shape is the maximum element count of the source input over all
    // requests.
    const auto& source_input = batch_input.SourceInputs()[0];
    for (size_t req_idx = 0; req_idx < request_count_; req_idx++) {
      TRITONBACKEND_Input* input;
      RETURN_IF_ERROR(TRITONBACKEND_RequestInput(
          requests_[req_idx], source_input.c_str(), &input));
      const int64_t* shape_arr;
      uint32_t dims_count;
      int64_t element_cnt = 0;
      RETURN_IF_ERROR(TRITONBACKEND_InputPropertiesForHostPolicy(
          input, host_policy_cstr_, nullptr, nullptr, &shape_arr, &dims_count,
          nullptr, nullptr));
      RETURN_IF_ERROR(GetElementCount(shape_arr, dims_count, &element_cnt));
      (*shape)[0] = std::max((*shape)[0], element_cnt);
    }
    break;
  }
  case BatchInput::Kind::BATCH_ITEM_SHAPE: {
    // 2D result: [total batch items, dims without batch dimension].
    shape->emplace_back(0);
    const auto& source_input = batch_input.SourceInputs()[0];
    for (size_t req_idx = 0; req_idx < request_count_; req_idx++) {
      TRITONBACKEND_Input* input;
      RETURN_IF_ERROR(TRITONBACKEND_RequestInput(
          requests_[req_idx], source_input.c_str(), &input));
      const int64_t* shape_arr;
      uint32_t dims_count;
      RETURN_IF_ERROR(TRITONBACKEND_InputPropertiesForHostPolicy(
          input, host_policy_cstr_, nullptr, nullptr, &shape_arr, &dims_count,
          nullptr, nullptr));
      // Assuming first dimension is batch size and ragged input is only set
      // for batching enabled model.
      (*shape)[0] += shape_arr[0];
      // The batch input tracks the shape without batch dimension for
      // each batch item
      (*shape)[1] = (dims_count - 1);
    }
    break;
  }
  case BatchInput::Kind::BATCH_ITEM_SHAPE_FLATTEN: {
    const auto& source_input = batch_input.SourceInputs()[0];
    for (size_t req_idx = 0; req_idx < request_count_; req_idx++) {
      TRITONBACKEND_Input* input;
      RETURN_IF_ERROR(TRITONBACKEND_RequestInput(
          requests_[req_idx], source_input.c_str(), &input));
      const int64_t* shape_arr;
      uint32_t dims_count;
      RETURN_IF_ERROR(TRITONBACKEND_InputPropertiesForHostPolicy(
          input, host_policy_cstr_, nullptr, nullptr, &shape_arr, &dims_count,
          nullptr, nullptr));
      // Assuming first dimension is batch size and ragged input is only set
      // for batching enabled model.
      // The batch input tracks the shape without batch dimension for
      // each batch item
      (*shape)[0] += (shape_arr[0] * (dims_count - 1));
    }
    break;
  }
  default:
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INTERNAL, "unsupported BatchInputKind received");
}
return nullptr;  // success
}

// Produces the contiguous buffer holding the generated batch-input data.
// If 'buffer' is null, a buffer is allocated from 'allowed_input_types'
// (tried in order); otherwise 'buffer' is used directly and
// 'allowed_input_types' must contain exactly the entry describing it.
// Outputs are returned through the dst_* parameters.
// NOTE(review): template arguments appear stripped by extraction (likely
// std::vector<std::pair<TRITONSERVER_MemoryType, int64_t>>,
// std::vector<int64_t> and reinterpret_cast<int64_t*>) -- restore from
// upstream before compiling.
TRITONSERVER_Error*
BackendInputCollector::ProcessBatchInput(
    const BatchInput& batch_input, char* buffer, const size_t buffer_byte_size,
    const std::vector>& allowed_input_types, const char** dst_buffer,
    size_t* dst_buffer_byte_size, TRITONSERVER_MemoryType* dst_memory_type,
    int64_t* dst_memory_type_id)
{
#ifdef TRITON_ENABLE_GPU
// The destination buffer may still be pending; wait before writing.
if (buffer_ready_event_ != nullptr) {
  cudaEventSynchronize(buffer_ready_event_);
  buffer_ready_event_ = nullptr;
}
#endif  // TRITON_ENABLE_GPU
if (buffer == nullptr) {
  if (allowed_input_types.size() == 0) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INTERNAL,
        "'allowed_input_types' must contain at least one pair of memory type "
        "and id");
  }
  // Calculate the byte size of the buffer
  std::vector shape;
  RETURN_IF_ERROR(BatchInputShape(batch_input, &shape));
  RETURN_IF_ERROR(GetByteSize(
      batch_input.DataType(), shape,
      reinterpret_cast(dst_buffer_byte_size)));
  BackendMemory* backend_memory = nullptr;
  // Try each allowed memory type/id in order until one allocates.
  for (const auto& allowed_type : allowed_input_types) {
    // NOTE(review): template argument appears stripped by extraction
    // (likely std::vector<BackendMemory::AllocationType>).
    std::vector alloc_types;
    const int64_t memory_type_id = allowed_type.second;
    switch (allowed_type.first) {
      case TRITONSERVER_MEMORY_GPU:
        alloc_types = {
            BackendMemory::AllocationType::GPU_POOL,
            BackendMemory::AllocationType::GPU};
        break;
      case TRITONSERVER_MEMORY_CPU_PINNED:
        alloc_types = {
            BackendMemory::AllocationType::CPU_PINNED_POOL,
            BackendMemory::AllocationType::CPU_PINNED};
        break;
      case TRITONSERVER_MEMORY_CPU:
        alloc_types = {BackendMemory::AllocationType::CPU};
        break;
    }
    auto err = BackendMemory::Create(
        memory_manager_, alloc_types, memory_type_id, *dst_buffer_byte_size,
        &backend_memory);
    if (err != nullptr) {
      // Allocation failure for this type is logged and the next allowed
      // type is tried.
      LOG_MESSAGE(
          TRITONSERVER_LOG_VERBOSE,
          (std::string("unable to create backend memory for type: ") +
           TRITONSERVER_MemoryTypeString(allowed_type.first) +
           " id: " + std::to_string(memory_type_id) + ": " +
           TRITONSERVER_ErrorMessage(err))
              .c_str());
      TRITONSERVER_ErrorDelete(err);
    } else {
      in_use_memories_.emplace_back(backend_memory);
      break;
    }
  }
  if (backend_memory == nullptr) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INTERNAL,
        (std::string(
             "failed to allocate contiguous buffer for batch input '") +
         batch_input.TargetNames()[0] + "'")
            .c_str());
  }
  buffer = backend_memory->MemoryPtr();
  *dst_buffer = backend_memory->MemoryPtr();
  *dst_buffer_byte_size = backend_memory->ByteSize();
  *dst_memory_type = backend_memory->MemoryType();
  *dst_memory_type_id = backend_memory->MemoryTypeId();
} else {
  // Caller-provided buffer: its memory type/id must be described by
  // exactly one entry.
  if (allowed_input_types.size() != 1) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INTERNAL,
        "'allowed_input_types' must only contain the memory type and id of "
        "'buffer'");
  }
  *dst_buffer = buffer;
  *dst_buffer_byte_size = buffer_byte_size;
  *dst_memory_type = allowed_input_types[0].first;
  *dst_memory_type_id = allowed_input_types[0].second;
}

char* input_buffer = buffer;
// NOTE(review): template argument appears stripped by extraction (likely
// std::unique_ptr<BackendMemory>).
std::unique_ptr internal_buffer;
// Need a CPU buffer for modifying the value
if
(*dst_memory_type == TRITONSERVER_MEMORY_GPU) {
  // Batch-input values are computed on host, then copied to the GPU
  // destination afterwards.
  BackendMemory* ib = nullptr;
  RETURN_IF_ERROR(BackendMemory::Create(
      memory_manager_,
      {BackendMemory::AllocationType::CPU_PINNED_POOL,
       BackendMemory::AllocationType::CPU},
      0, *dst_buffer_byte_size, &ib));
  internal_buffer.reset(ib);
  input_buffer = internal_buffer->MemoryPtr();
}
const auto& data_type = batch_input.DataType();
switch (batch_input.BatchInputKind()) {
  case BatchInput::Kind::BATCH_ELEMENT_COUNT: {
    const auto& source_input = batch_input.SourceInputs()[0];
    // NOTE(review): the FP32 and non-FP32 branches read identically here
    // because the SetElementCount template arguments (presumably <float>
    // vs <int32_t>) appear to have been stripped by extraction -- restore
    // from upstream.
    if (data_type == TRITONSERVER_TYPE_FP32) {
      RETURN_IF_ERROR(SetElementCount(
          source_input, input_buffer, *dst_buffer_byte_size));
    } else {
      RETURN_IF_ERROR(SetElementCount(
          source_input, input_buffer, *dst_buffer_byte_size));
    }
    break;
  }
  case BatchInput::Kind::BATCH_ACCUMULATED_ELEMENT_COUNT: {
    const auto& source_input = batch_input.SourceInputs()[0];
    if (data_type == TRITONSERVER_TYPE_FP32) {
      RETURN_IF_ERROR(SetAccumulatedElementCount(
          source_input, input_buffer, *dst_buffer_byte_size));
    } else {
      RETURN_IF_ERROR(SetAccumulatedElementCount(
          source_input, input_buffer, *dst_buffer_byte_size));
    }
    break;
  }
  case BatchInput::Kind::BATCH_ACCUMULATED_ELEMENT_COUNT_WITH_ZERO: {
    const auto& source_input = batch_input.SourceInputs()[0];
    if (data_type == TRITONSERVER_TYPE_FP32) {
      // Element [0] is the leading zero; the accumulated counts follow.
      // NOTE(review): reinterpret_cast template argument appears stripped
      // (likely reinterpret_cast<float*>).
      *reinterpret_cast(input_buffer) = 0;
      if (*dst_buffer_byte_size < sizeof(float)) {
        return TRITONSERVER_ErrorNew(
            TRITONSERVER_ERROR_INVALID_ARG,
            (std::string(
                 "Unexpected total byte size for batch input. Expect >= ") +
             std::to_string(sizeof(float)) + ", got " +
             std::to_string(*dst_buffer_byte_size))
                .c_str());
      }
      RETURN_IF_ERROR(SetAccumulatedElementCount(
          source_input, input_buffer + sizeof(float),
          *dst_buffer_byte_size - sizeof(float)));
    } else {
      *reinterpret_cast(input_buffer) = 0;
      if (*dst_buffer_byte_size < sizeof(int32_t)) {
        return TRITONSERVER_ErrorNew(
            TRITONSERVER_ERROR_INVALID_ARG,
            (std::string(
                 "Unexpected total byte size for batch input. 
Expect >= ") +
             std::to_string(sizeof(int32_t)) + ", got " +
             std::to_string(*dst_buffer_byte_size))
                .c_str());
      }
      RETURN_IF_ERROR(SetAccumulatedElementCount(
          source_input, input_buffer + sizeof(int32_t),
          *dst_buffer_byte_size - sizeof(int32_t)));
    }
    break;
  }
  case BatchInput::Kind::BATCH_MAX_ELEMENT_COUNT_AS_SHAPE: {
    // The batch input is described by the shape,
    // no data modification is needed
    return nullptr;  // success
  }
  case BatchInput::Kind::BATCH_ITEM_SHAPE:
  case BatchInput::Kind::BATCH_ITEM_SHAPE_FLATTEN: {
    // Use the same utilities for both types as the data will be the same,
    // only difference is the shape of the tensor.
    const auto& source_input = batch_input.SourceInputs()[0];
    // NOTE(review): reinterpret_cast / SetBatchItemShape template
    // arguments appear stripped by extraction (likely <float> vs
    // <int32_t>) -- restore from upstream.
    if (data_type == TRITONSERVER_TYPE_FP32) {
      *reinterpret_cast(input_buffer) = 0;
      RETURN_IF_ERROR(SetBatchItemShape(
          source_input, input_buffer, *dst_buffer_byte_size));
    } else {
      *reinterpret_cast(input_buffer) = 0;
      RETURN_IF_ERROR(SetBatchItemShape(
          source_input, input_buffer, *dst_buffer_byte_size));
    }
    break;
  }
}
if (*dst_memory_type == TRITONSERVER_MEMORY_GPU) {
  // Copy the host-computed values into the GPU destination buffer.
  bool cuda_used;
  RETURN_IF_ERROR(CopyBuffer(
      "batch input buffer", internal_buffer->MemoryType(),
      internal_buffer->MemoryTypeId(), *dst_memory_type, *dst_memory_type_id,
      *dst_buffer_byte_size, input_buffer, buffer, stream_, &cuda_used,
      copy_on_stream_));
  // Need to keep the backend memory alive in the case of async copy
  in_use_memories_.emplace_back(std::move(internal_buffer));
  need_sync_ |= cuda_used;
}
return nullptr;  // success
}

// Writes the per-request element count of 'source_input' into 'buffer',
// one value of type T per request; the remaining buffer space is
// zero-filled.
// NOTE(review): the template parameter list appears stripped by extraction
// (likely 'template <typename T>') -- restore from upstream.
template
TRITONSERVER_Error*
BackendInputCollector::SetElementCount(
    const std::string& source_input, char* buffer,
    const size_t buffer_byte_size)
{
size_t buffer_offset = 0;
for (size_t req_idx = 0; req_idx < request_count_; req_idx++) {
  // Guard against writing past the provided buffer.
  if (buffer_offset + sizeof(T) > buffer_byte_size) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INVALID_ARG,
        "unexpected total byte size for batch input");
  }
  TRITONBACKEND_Input* input;
  RETURN_IF_ERROR(TRITONBACKEND_RequestInput(
      requests_[req_idx],
      source_input.c_str(), &input));
  const int64_t* shape;
  uint32_t dims_count;
  int64_t element_cnt = 0;
  RETURN_IF_ERROR(TRITONBACKEND_InputPropertiesForHostPolicy(
      input, host_policy_cstr_, nullptr, nullptr, &shape, &dims_count,
      nullptr, nullptr));
  RETURN_IF_ERROR(GetElementCount(shape, dims_count, &element_cnt));
  // NOTE(review): reinterpret_cast template argument appears stripped by
  // extraction (likely reinterpret_cast<T*>).
  *(reinterpret_cast(buffer) + req_idx) = element_cnt;
  buffer_offset += sizeof(T);
}
// Set the rest of the buffer to 0
for (; buffer_offset + sizeof(T) <= buffer_byte_size;
     buffer_offset += sizeof(T)) {
  *reinterpret_cast(buffer + buffer_offset) = 0;
}
return nullptr;  // success
}

// Writes the running (inclusive) accumulated element count of
// 'source_input' across requests into 'buffer', one value of type T per
// request; remaining space is filled with the final total.
// NOTE(review): the template parameter list appears stripped by extraction
// (likely 'template <typename T>') -- restore from upstream.
template
TRITONSERVER_Error*
BackendInputCollector::SetAccumulatedElementCount(
    const std::string& source_input, char* buffer,
    const size_t buffer_byte_size)
{
size_t accumulated_element_count = 0;
size_t buffer_offset = 0;
for (size_t req_idx = 0; req_idx < request_count_; req_idx++) {
  if (buffer_offset + sizeof(T) > buffer_byte_size) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INVALID_ARG,
        "unexpected total byte size for batch input");
  }
  TRITONBACKEND_Input* input;
  RETURN_IF_ERROR(TRITONBACKEND_RequestInput(
      requests_[req_idx], source_input.c_str(), &input));
  const int64_t* shape;
  uint32_t dims_count;
  int64_t element_cnt = 0;
  RETURN_IF_ERROR(TRITONBACKEND_InputPropertiesForHostPolicy(
      input, host_policy_cstr_, nullptr, nullptr, &shape, &dims_count,
      nullptr, nullptr));
  RETURN_IF_ERROR(GetElementCount(shape, dims_count, &element_cnt));
  accumulated_element_count += element_cnt;
  *(reinterpret_cast(buffer) + req_idx) = accumulated_element_count;
  buffer_offset += sizeof(T);
}
// Set the rest of the buffer to 'accumulated_element_count'
// (no increase in element count)
for (; buffer_offset + sizeof(T) <= buffer_byte_size;
     buffer_offset += sizeof(T)) {
  *reinterpret_cast(buffer + buffer_offset) = accumulated_element_count;
}
return nullptr;  // success
}

// Writes, for each batch item of each request, the item's shape (without
// the batch dimension) as values of type T into 'buffer'.
// NOTE(review): the template parameter list appears stripped by extraction
// (likely 'template <typename T>') -- restore from upstream.
template
TRITONSERVER_Error*
BackendInputCollector::SetBatchItemShape(
    const std::string& source_input, char* buffer, const size_t
    buffer_byte_size)
{
size_t buffer_offset = 0;
for (size_t req_idx = 0; req_idx < request_count_; req_idx++) {
  TRITONBACKEND_Input* input;
  RETURN_IF_ERROR(TRITONBACKEND_RequestInput(
      requests_[req_idx], source_input.c_str(), &input));
  const int64_t* shape;
  uint32_t dims_count;
  RETURN_IF_ERROR(TRITONBACKEND_InputPropertiesForHostPolicy(
      input, host_policy_cstr_, nullptr, nullptr, &shape, &dims_count,
      nullptr, nullptr));
  // Assuming first dimension is batch size and ragged input is only set
  // for batching enabled model.
  size_t batch_1_size = sizeof(T) * (dims_count - 1);
  if (buffer_offset + (size_t)shape[0] * batch_1_size > buffer_byte_size) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INVALID_ARG,
        (GetRequestId(requests_[req_idx]) +
         "unexpected total byte size for batch input")
            .c_str());
  }
  // The batch input tracks the shape without batch dimension for
  // each batch item
  for (size_t idx = 1; idx < dims_count; ++idx) {
    // Need to set the element explicitly for type conversion
    *(reinterpret_cast(buffer + buffer_offset) + (idx - 1)) = shape[idx];
  }
  // memcpy the data repeatedly if the request has batch size > 1
  for (int64_t idx = 1; idx < shape[0]; ++idx) {
    memcpy(
        buffer + buffer_offset + idx * batch_1_size, buffer + buffer_offset,
        batch_1_size);
  }
  buffer_offset += batch_1_size * (size_t)shape[0];
}
return nullptr;  // success
}

// Flushes the inputs queued for the batched gather kernel. The kernel is
// launched only when enough buffers are pending to give good GPU
// utilization; otherwise (or on kernel failure) each buffer falls back to
// a direct copy. Returns true if any CUDA work was issued.
bool
BackendInputCollector::FlushPendingCopyKernel(
    char* tensor_buffer, const size_t tensor_buffer_byte_size,
    const TRITONSERVER_MemoryType tensor_memory_type,
    const int64_t tensor_memory_type_id)
{
if (pending_copy_kernel_input_buffers_.size() == 0) {
  return false;
}

bool cuda_copy = false;
TRITONSERVER_Error* error = nullptr;
// Only try to launch kernel if buffer count is large enough for
// good GPU utilization
if (pending_copy_kernel_input_buffer_counts_ >= kernel_buffer_threshold_) {
  error = LaunchCopyKernel(
      tensor_buffer, tensor_buffer_byte_size, tensor_memory_type,
      tensor_memory_type_id);
  cuda_copy = (error == nullptr);
  LOG_MESSAGE(
      TRITONSERVER_LOG_VERBOSE,
      (std::string("gather kernel launched with status: ") +
       ((error == nullptr) ? "Success" : TRITONSERVER_ErrorMessage(error)))
          .c_str());
}
// If kernel can't be launched then just perform a direct copy.
if ((pending_copy_kernel_input_buffer_counts_ < kernel_buffer_threshold_) ||
    (error != nullptr)) {
  size_t offset = 0;
  for (auto& pr : pending_copy_kernel_input_buffers_) {
    cuda_copy |= SetInputTensor(
        "gather kernel fallback", pr, tensor_buffer, tensor_buffer_byte_size,
        tensor_memory_type, tensor_memory_type_id,
        pending_copy_kernel_buffer_offset_ + offset,
        TRITONSERVER_MEMORY_CPU_PINNED, false, true);
    offset += pr.memory_desc_.byte_size_;
  }
}
TRITONSERVER_ErrorDelete(error);

// Pending kernel copies are handled...
pending_copy_kernel_buffer_byte_size_ = 0;
pending_copy_kernel_buffer_offset_ = 0;
pending_copy_kernel_input_buffer_counts_ = 0;
pending_copy_kernel_input_buffers_.clear();

return cuda_copy;
}

// Launches the gather kernel that concatenates all pending input buffers
// into 'tensor_buffer' on stream_. Builds three device-visible parameter
// arrays (source pointers, byte sizes, destination offsets) and passes
// them to RunGatherKernel. Returns UNSUPPORTED when built without
// TRITON_ENABLE_GPU.
// NOTE(review): template arguments on the host-side std::vector members
// appear stripped by extraction -- restore from upstream.
TRITONSERVER_Error*
BackendInputCollector::LaunchCopyKernel(
    char* tensor_buffer, const size_t tensor_buffer_byte_size,
    const TRITONSERVER_MemoryType tensor_memory_type,
    const int64_t tensor_memory_type_id)
{
#ifdef TRITON_ENABLE_GPU
// Host-side staging vectors; kept alive in the *_buffer_host_ lists until
// the collector is finalized.
input_ptr_buffer_host_.emplace_back(new std::vector());
byte_size_buffer_host_.emplace_back(new std::vector());
byte_size_offset_buffer_host_.emplace_back(new std::vector());
auto& input_ptr_buffer_host = *input_ptr_buffer_host_.back();
auto& byte_size_buffer_host = *byte_size_buffer_host_.back();
auto& byte_size_offset_buffer_host = *byte_size_offset_buffer_host_.back();
input_ptr_buffer_host.reserve(pending_copy_kernel_input_buffer_counts_);
byte_size_buffer_host.reserve(pending_copy_kernel_input_buffer_counts_);
byte_size_offset_buffer_host.reserve(
    pending_copy_kernel_input_buffer_counts_);
size_t byte_size_offset = 0;
for (const auto& response_input : pending_copy_kernel_input_buffers_) {
  const auto& input = response_input.memory_desc_;
  input_ptr_buffer_host.emplace_back(
      const_cast(reinterpret_cast(input.buffer_)));
  byte_size_buffer_host.emplace_back(input.byte_size_);
  byte_size_offset_buffer_host.emplace_back(byte_size_offset);
  byte_size_offset += input.byte_size_;
}

BackendMemory* backend_memory = nullptr;
// NOTE(review): template argument appears stripped by extraction (likely
// std::vector<BackendMemory::AllocationType>).
std::vector alloc_types;
switch (tensor_memory_type) {
  case TRITONSERVER_MEMORY_GPU:
    alloc_types = {
        BackendMemory::AllocationType::GPU_POOL,
        BackendMemory::AllocationType::GPU};
    break;
  case TRITONSERVER_MEMORY_CPU_PINNED:
    alloc_types = {
        BackendMemory::AllocationType::CPU_PINNED_POOL,
        BackendMemory::AllocationType::CPU_PINNED};
    break;
  case TRITONSERVER_MEMORY_CPU:
    alloc_types = {BackendMemory::AllocationType::CPU};
    break;
}

// input_ptr_buffer
size_t input_ptr_buffer_byte_size =
    pending_copy_kernel_input_buffer_counts_ * sizeof(int8_t*);
auto err = BackendMemory::Create(
    memory_manager_, alloc_types, tensor_memory_type_id,
    input_ptr_buffer_byte_size, &backend_memory);
if (err != nullptr) {
  LOG_MESSAGE(
      TRITONSERVER_LOG_VERBOSE,
      (std::string("unable to create backend memory for type: ") +
       TRITONSERVER_MemoryTypeString(tensor_memory_type) +
       " id: " + std::to_string(tensor_memory_type_id) + ": " +
       TRITONSERVER_ErrorMessage(err))
          .c_str());
  TRITONSERVER_ErrorDelete(err);
} else {
  in_use_memories_.emplace_back(backend_memory);
}
// The allocation must land on the exact requested memory type/id for the
// kernel to read it; pool fallback to a different type is rejected.
if (backend_memory == nullptr ||
    (backend_memory->MemoryType() != tensor_memory_type) ||
    (backend_memory->MemoryTypeId() != tensor_memory_type_id)) {
  return TRITONSERVER_ErrorNew(
      TRITONSERVER_ERROR_INTERNAL,
      "Failed to obtain memory buffer for copy kernel input");
}
char* input_ptr_buffer = backend_memory->MemoryPtr();

// byte_size_buffer
size_t byte_size_buffer_byte_size =
    pending_copy_kernel_input_buffer_counts_ * sizeof(size_t);
err = BackendMemory::Create(
    memory_manager_, alloc_types, tensor_memory_type_id,
    byte_size_buffer_byte_size, &backend_memory);
if (err != nullptr) {
  LOG_MESSAGE(
      TRITONSERVER_LOG_VERBOSE,
      (std::string("unable to create backend memory for type: ") +
       TRITONSERVER_MemoryTypeString(tensor_memory_type) +
       " id: " + std::to_string(tensor_memory_type_id) + ": " +
       TRITONSERVER_ErrorMessage(err))
          .c_str());
  TRITONSERVER_ErrorDelete(err);
} else {
  in_use_memories_.emplace_back(backend_memory);
}
if (backend_memory == nullptr ||
    (backend_memory->MemoryType() != tensor_memory_type) ||
    (backend_memory->MemoryTypeId() != tensor_memory_type_id)) {
  return TRITONSERVER_ErrorNew(
      TRITONSERVER_ERROR_INTERNAL,
      "Failed to obtain memory buffer for copy kernel input");
}
char* byte_size_buffer = backend_memory->MemoryPtr();

// byte_size_offset_buffer
size_t byte_size_offset_buffer_byte_size =
    pending_copy_kernel_input_buffer_counts_ * sizeof(size_t);
err = BackendMemory::Create(
    memory_manager_, alloc_types, tensor_memory_type_id,
    byte_size_offset_buffer_byte_size, &backend_memory);
if (err != nullptr) {
  LOG_MESSAGE(
      TRITONSERVER_LOG_VERBOSE,
      (std::string("unable to create backend memory for type: ") +
       TRITONSERVER_MemoryTypeString(tensor_memory_type) +
       " id: " + std::to_string(tensor_memory_type_id) + ": " +
       TRITONSERVER_ErrorMessage(err))
          .c_str());
  TRITONSERVER_ErrorDelete(err);
} else {
  in_use_memories_.emplace_back(backend_memory);
}
if (backend_memory == nullptr ||
    (backend_memory->MemoryType() != tensor_memory_type) ||
    (backend_memory->MemoryTypeId() != tensor_memory_type_id)) {
  return TRITONSERVER_ErrorNew(
      TRITONSERVER_ERROR_INTERNAL,
      "Failed to obtain memory buffer for copy kernel input");
}
char* byte_size_offset_buffer = backend_memory->MemoryPtr();

// Stage the three parameter arrays on stream_.
// NOTE(review): the return codes of these cudaMemcpyAsync calls are not
// checked -- a failure here would only surface at the kernel launch.
cudaMemcpyAsync(
    input_ptr_buffer, input_ptr_buffer_host.data(),
    pending_copy_kernel_input_buffer_counts_ * sizeof(int8_t*),
    cudaMemcpyDefault, stream_);
cudaMemcpyAsync(
    byte_size_buffer, byte_size_buffer_host.data(),
    pending_copy_kernel_input_buffer_counts_ * sizeof(size_t),
    cudaMemcpyDefault, stream_);
cudaMemcpyAsync(
    byte_size_offset_buffer, byte_size_offset_buffer_host.data(),
    pending_copy_kernel_input_buffer_counts_ * sizeof(size_t),
    cudaMemcpyDefault, stream_);
if
(buffer_ready_event_ != nullptr) {
  // Destination buffer must be ready before the gather kernel writes it.
  cudaEventSynchronize(buffer_ready_event_);
  buffer_ready_event_ = nullptr;
}
RETURN_IF_CUDA_ERROR(
    RunGatherKernel(
        (const int8_t**)input_ptr_buffer, (const size_t*)byte_size_buffer,
        (const size_t*)byte_size_offset_buffer,
        (int8_t*)tensor_buffer + pending_copy_kernel_buffer_offset_,
        pending_copy_kernel_input_buffer_counts_, stream_),
    TRITONSERVER_ERROR_INTERNAL,
    std::string("Failed to launch gather kernel"));
return nullptr;
#else
return TRITONSERVER_ErrorNew(
    TRITONSERVER_ERROR_UNSUPPORTED,
    "Copy kernel can not be launched with TRITON_ENABLE_GPU=OFF");
#endif  // TRITON_ENABLE_GPU
}

}}  // namespace triton::backend


================================================
FILE: src/backend_memory.cc
================================================
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//  * Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
//  * Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//  * Neither the name of NVIDIA CORPORATION nor the names of its
//    contributors may be used to endorse or promote products derived
//    from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED.
// IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "triton/backend/backend_memory.h"

// NOTE(review): the target of the following #include appears to have been
// stripped by extraction (an angle-bracket system header) -- restore from
// upstream.
#include

#include "triton/backend/backend_common.h"

namespace triton { namespace backend {

// Allocates 'byte_size' bytes using the single given allocation type.
// CPU_PINNED and GPU are allocated directly via the CUDA runtime; the
// pool types and plain CPU go through the Triton memory manager. On
// success '*mem' owns the allocation.
TRITONSERVER_Error*
BackendMemory::Create(
    TRITONBACKEND_MemoryManager* manager, const AllocationType alloc_type,
    const int64_t memory_type_id, const size_t byte_size, BackendMemory** mem)
{
  *mem = nullptr;

  void* ptr = nullptr;
  switch (alloc_type) {
    case AllocationType::CPU_PINNED: {
#ifdef TRITON_ENABLE_GPU
      RETURN_IF_CUDA_ERROR(
          cudaHostAlloc(&ptr, byte_size, cudaHostAllocPortable),
          TRITONSERVER_ERROR_UNAVAILABLE,
          std::string("failed to allocate pinned system memory"));
#else
      return TRITONSERVER_ErrorNew(
          TRITONSERVER_ERROR_UNSUPPORTED,
          "pinned-memory allocation not supported");
#endif  // TRITON_ENABLE_GPU
      break;
    }
    case AllocationType::GPU: {
#ifdef TRITON_ENABLE_GPU
      // Allocate on the device given by 'memory_type_id', restoring the
      // caller's current device afterwards.
      int current_device;
      // NOTE(review): '¤t_device' below is a mojibake artifact of
      // extraction ('&curr' was eaten as an HTML entity); upstream reads
      // '&current_device' -- restore before compiling.
      RETURN_IF_CUDA_ERROR(
          cudaGetDevice(¤t_device), TRITONSERVER_ERROR_INTERNAL,
          std::string("failed to get device"));
      bool overridden = (current_device != memory_type_id);
      if (overridden) {
        RETURN_IF_CUDA_ERROR(
            cudaSetDevice(memory_type_id), TRITONSERVER_ERROR_INTERNAL,
            std::string("failed to set device"));
      }
      auto err = cudaMalloc(&ptr, byte_size);
      if (overridden) {
        LOG_IF_CUDA_ERROR(
            cudaSetDevice(current_device), "failed to set CUDA device");
      }
      RETURN_ERROR_IF_FALSE(
          err == cudaSuccess, TRITONSERVER_ERROR_UNAVAILABLE,
          std::string("failed to allocate GPU memory: ") +
              cudaGetErrorString(err));
#else
      return
  TRITONSERVER_ErrorNew(
          TRITONSERVER_ERROR_UNSUPPORTED, "GPU allocation not supported");
#endif  // TRITON_ENABLE_GPU
      break;
    }
    case AllocationType::CPU:
    case AllocationType::CPU_PINNED_POOL:
    case AllocationType::GPU_POOL:
      // Pool-backed (and plain CPU) allocations go through the Triton
      // memory manager.
      RETURN_IF_ERROR(TRITONBACKEND_MemoryManagerAllocate(
          manager, &ptr, AllocTypeToMemoryType(alloc_type), memory_type_id,
          byte_size));
      break;
  }

  // NOTE(review): reinterpret_cast template argument appears stripped by
  // extraction (likely reinterpret_cast<char*>).
  *mem = new BackendMemory(
      manager, alloc_type, memory_type_id, reinterpret_cast(ptr), byte_size);

  return nullptr;  // success
}

// Tries each allocation type in order and returns the first success; if
// all fail, returns an UNAVAILABLE error aggregating every per-type
// failure message.
// NOTE(review): template arguments on std::vector / std::unordered_map
// appear stripped by extraction -- restore from upstream.
TRITONSERVER_Error*
BackendMemory::Create(
    TRITONBACKEND_MemoryManager* manager,
    const std::vector& alloc_types, const int64_t memory_type_id,
    const size_t byte_size, BackendMemory** mem)
{
  *mem = nullptr;

  RETURN_ERROR_IF_TRUE(
      alloc_types.size() == 0, TRITONSERVER_ERROR_INVALID_ARG,
      std::string("BackendMemory::Create, at least one allocation type must be "
                  "specified"));

  bool success = false;
  std::unordered_map errors;
  for (const AllocationType alloc_type : alloc_types) {
    TRITONSERVER_Error* err =
        Create(manager, alloc_type, memory_type_id, byte_size, mem);
    if (err == nullptr) {
      success = true;
      break;
    }
    // Remember the failure so it can be reported (or cleaned up) later.
    errors.insert({alloc_type, err});
  }

  // If allocation failed for all allocation types then display all
  // the error messages and show the entire allocation request as
  // failing.
  if (!success) {
    std::string msg = "BackendMemory::Create, all allocation types failed:";
    for (const auto& pr : errors) {
      const AllocationType alloc_type = pr.first;
      TRITONSERVER_Error* err = pr.second;
      msg += std::string("\n\t") + AllocTypeString(alloc_type) + ": " +
             TRITONSERVER_ErrorMessage(err);
      TRITONSERVER_ErrorDelete(err);
    }
    return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_UNAVAILABLE, msg.c_str());
  }

  // If it succeeded we might have to clean up errors associated with
  // attempts that failed
  for (const auto& pr : errors) {
    TRITONSERVER_ErrorDelete(pr.second);
  }

  return nullptr;  // success
}

// Wraps a caller-owned buffer; the returned BackendMemory does not free
// it on destruction (owns_buffer is false).
// NOTE(review): reinterpret_cast template argument appears stripped by
// extraction (likely reinterpret_cast<char*>).
TRITONSERVER_Error*
BackendMemory::Create(
    TRITONBACKEND_MemoryManager* manager, const AllocationType alloc_type,
    const int64_t memory_type_id, void* buffer, const size_t byte_size,
    BackendMemory** mem)
{
  *mem = new BackendMemory(
      manager, alloc_type, memory_type_id, reinterpret_cast(buffer),
      byte_size, false /* owns_buffer */);
  return nullptr;  // success
}

// Frees the underlying buffer according to how it was allocated; no-op
// for wrapped (non-owned) buffers.
BackendMemory::~BackendMemory()
{
  if (owns_buffer_) {
    switch (alloctype_) {
      case AllocationType::CPU_PINNED:
#ifdef TRITON_ENABLE_GPU
        if (buffer_ != nullptr) {
          LOG_IF_CUDA_ERROR(
              cudaFreeHost(buffer_), "failed to free pinned memory");
        }
#endif  // TRITON_ENABLE_GPU
        break;
      case AllocationType::GPU:
#ifdef TRITON_ENABLE_GPU
        if (buffer_ != nullptr) {
          LOG_IF_CUDA_ERROR(cudaFree(buffer_), "failed to free CUDA memory");
        }
#endif  // TRITON_ENABLE_GPU
        break;
      case AllocationType::CPU:
      case AllocationType::CPU_PINNED_POOL:
      case AllocationType::GPU_POOL:
        LOG_IF_ERROR(
            TRITONBACKEND_MemoryManagerFree(
                manager_, buffer_, AllocTypeToMemoryType(alloctype_),
                memtype_id_),
            "failed to free memory buffer");
        break;
    }
  }
}

// Maps an allocation type to the Triton memory type it provides.
TRITONSERVER_MemoryType
BackendMemory::AllocTypeToMemoryType(const AllocationType a)
{
  switch (a) {
    case AllocationType::CPU:
      return TRITONSERVER_MEMORY_CPU;
    case AllocationType::CPU_PINNED:
    case AllocationType::CPU_PINNED_POOL:
      return TRITONSERVER_MEMORY_CPU_PINNED;
    case AllocationType::GPU:
    case AllocationType::GPU_POOL:
      return TRITONSERVER_MEMORY_GPU;
  }

  return TRITONSERVER_MEMORY_CPU;  // unreachable
}

// Human-readable name of an allocation type, used in aggregated error
// messages.
const char*
BackendMemory::AllocTypeString(const AllocationType a)
{
  switch (a) {
    case AllocationType::CPU:
      return "CPU";
    case AllocationType::CPU_PINNED:
      return "CPU_PINNED";
    case AllocationType::GPU:
      return "GPU";
    case AllocationType::CPU_PINNED_POOL:
      return "CPU_PINNED_POOL";
    case AllocationType::GPU_POOL:
      return "GPU_POOL";
  }

  return "";
}

}}  // namespace triton::backend


================================================
FILE: src/backend_model.cc
================================================
// Copyright 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//  * Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
//  * Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//  * Neither the name of NVIDIA CORPORATION nor the names of its
//    contributors may be used to endorse or promote products derived
//    from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED.
// IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "triton/backend/backend_model.h"

#include "triton/backend/backend_common.h"

namespace triton { namespace backend {

//
// BackendModel
//

// Capture model identity (name, version, repository path), the server and
// memory-manager handles, and the parsed model configuration. Any failure
// is reported by throwing BackendModelException (via the THROW_IF_* macros).
BackendModel::BackendModel(
    TRITONBACKEND_Model* triton_model, const bool allow_optional)
    : triton_model_(triton_model), allow_optional_(allow_optional)
{
  const char* model_name;
  THROW_IF_BACKEND_MODEL_ERROR(
      TRITONBACKEND_ModelName(triton_model, &model_name));
  name_ = model_name;

  THROW_IF_BACKEND_MODEL_ERROR(
      TRITONBACKEND_ModelVersion(triton_model, &version_));

  // Only filesystem-based model repositories are supported here.
  const char* repository_path = nullptr;
  TRITONBACKEND_ArtifactType repository_artifact_type;
  THROW_IF_BACKEND_MODEL_ERROR(TRITONBACKEND_ModelRepository(
      triton_model, &repository_artifact_type, &repository_path));
  if (repository_artifact_type != TRITONBACKEND_ARTIFACT_FILESYSTEM) {
    throw BackendModelException(TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_UNSUPPORTED,
        (std::string("unsupported repository artifact type for model '") +
         model_name + "'")
            .c_str()));
  }
  repository_path_ = repository_path;

  THROW_IF_BACKEND_MODEL_ERROR(
      TRITONBACKEND_ModelServer(triton_model, &triton_server_));

  TRITONBACKEND_Backend* backend;
  THROW_IF_BACKEND_MODEL_ERROR(
      TRITONBACKEND_ModelBackend(triton_model, &backend));
  THROW_IF_BACKEND_MODEL_ERROR(
      TRITONBACKEND_BackendMemoryManager(backend, &triton_memory_manager_));

  THROW_IF_BACKEND_MODEL_ERROR(ParseModelConfig());
}

// Fetch the model configuration from Triton core and populate the cached
// fields derived from it: max batch size, pinned-memory optimization flags,
// batch inputs/outputs, ragged inputs and optional inputs. Returns nullptr
// on success, otherwise an error the caller owns.
TRITONSERVER_Error*
BackendModel::ParseModelConfig()
{
  TRITONSERVER_Message* config_message;
  RETURN_IF_ERROR(TRITONBACKEND_ModelConfig(
      triton_model_, 1 /* config_version */, &config_message));

  // Get the model configuration as a json string from
  // config_message. We use TritonJson, which is a wrapper that
  // returns nice errors (currently the underlying implementation is
  // rapidjson... but others could be added).
  const char* buffer;
  size_t byte_size;
  RETURN_IF_ERROR(
      TRITONSERVER_MessageSerializeToJson(config_message, &buffer, &byte_size));

  // Parse before deleting the message (the buffer belongs to the message),
  // but delete the message even if parsing failed.
  TRITONSERVER_Error* err = model_config_.Parse(buffer, byte_size);
  RETURN_IF_ERROR(TRITONSERVER_MessageDelete(config_message));
  RETURN_IF_ERROR(err);

  int64_t mbs = 0;
  RETURN_IF_ERROR(model_config_.MemberAsInt("max_batch_size", &mbs));
  max_batch_size_ = mbs;

  // Pinned-memory usage for input/output transfers defaults to disabled
  // unless enabled in the 'optimization' section of the config.
  enable_pinned_input_ = false;
  enable_pinned_output_ = false;
  {
    common::TritonJson::Value optimization;
    if (model_config_.Find("optimization", &optimization)) {
      common::TritonJson::Value pinned_memory;
      if (optimization.Find("input_pinned_memory", &pinned_memory)) {
        RETURN_IF_ERROR(
            pinned_memory.MemberAsBool("enable", &enable_pinned_input_));
      }
      if (optimization.Find("output_pinned_memory", &pinned_memory)) {
        RETURN_IF_ERROR(
            pinned_memory.MemberAsBool("enable", &enable_pinned_output_));
      }
    }
  }

  RETURN_IF_ERROR(
      BatchInput::ParseFromModelConfig(model_config_, &batch_inputs_));
  RETURN_IF_ERROR(
      BatchOutput::ParseFromModelConfig(model_config_, &batch_outputs_));
  // Index batch outputs by each of their target tensor names for fast
  // lookup in FindBatchOutput().
  for (const auto& batch_output : batch_outputs_) {
    for (const auto& name : batch_output.TargetNames()) {
      batch_output_map_.emplace(name, &batch_output);
    }
  }

  // Record which inputs allow ragged batches and which are optional.
  triton::common::TritonJson::Value config_inputs;
  RETURN_IF_ERROR(model_config_.MemberAsArray("input", &config_inputs));
  for (size_t i = 0; i < config_inputs.ArraySize(); i++) {
    triton::common::TritonJson::Value io;
    RETURN_IF_ERROR(config_inputs.IndexAsObject(i, &io));
    std::string io_name;
    RETURN_IF_ERROR(io.MemberAsString("name", &io_name));
    triton::common::TritonJson::Value input_property_json;
    bool allow_ragged_batch = false;
    if (io.Find("allow_ragged_batch", &input_property_json)) {
      RETURN_IF_ERROR(input_property_json.AsBool(&allow_ragged_batch));
    }
    if (allow_ragged_batch) {
      ragged_inputs_.emplace(io_name);
    }
    bool optional = false;
    if (io.Find("optional", &input_property_json)) {
      RETURN_IF_ERROR(input_property_json.AsBool(&optional));
    }
    if (optional) {
      if (allow_optional_) {
        optional_inputs_.emplace(io_name);
      } else {
        // 'optional' inputs are rejected when the backend model was
        // constructed with allow_optional == false.
        RETURN_IF_ERROR(TRITONSERVER_ErrorNew(
            TRITONSERVER_ERROR_INVALID_ARG,
            (std::string("'optional' is set to true for input '") + io_name +
             "' while the backend model doesn't support optional input")
                .c_str()));
      }
    }
  }

  return nullptr;
}

// Push the (possibly modified) model configuration back to Triton core,
// then re-read it so that any normalization done by the core is reflected
// in the cached fields.
TRITONSERVER_Error*
BackendModel::SetModelConfig()
{
  triton::common::TritonJson::WriteBuffer json_buffer;
  RETURN_IF_ERROR(ModelConfig().Write(&json_buffer));

  TRITONSERVER_Message* message;
  RETURN_IF_ERROR(TRITONSERVER_MessageNewFromSerializedJson(
      &message, json_buffer.Base(), json_buffer.Size()));
  RETURN_IF_ERROR(TRITONBACKEND_ModelSetConfig(
      triton_model_, 1 /* config_version */, message));
  RETURN_IF_ERROR(TRITONSERVER_MessageDelete(message));

  // Triton core can normalize the missing config settings
  // in the above call. We must retrieve the updated model
  // configuration from the core.
  RETURN_IF_ERROR(ParseModelConfig());

  return nullptr;
}

// A max_batch_size > 0 in the config means the first dimension is the
// batch dimension.
TRITONSERVER_Error*
BackendModel::SupportsFirstDimBatching(bool* supports)
{
  *supports = max_batch_size_ > 0;
  return nullptr;
}

// Look up the BatchOutput that targets 'output_name'; nullptr when none.
const BatchOutput*
BackendModel::FindBatchOutput(const std::string& output_name) const
{
  const auto it = batch_output_map_.find(output_name);
  return ((it == batch_output_map_.end()) ? nullptr : it->second);
}

}}  // namespace triton::backend


================================================
FILE: src/backend_model_instance.cc
================================================
// Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "triton/backend/backend_model_instance.h"

// NOTE(review): the original header name on the next line was lost in
// extraction (bare "#include"); <vector> matches the std::vector usage
// below — confirm against the upstream source.
#include <vector>

#include "triton/backend/backend_common.h"
#include "triton/backend/backend_model.h"

namespace triton { namespace backend {

//
// BackendModelInstance
//

// Capture instance identity (name, kind, device), resolve the artifact
// filename from 'default_model_filename' / 'cc_model_filenames', create a
// CUDA stream for GPU instances, and extract the instance's host policy
// name. Failures are reported by throwing exceptions via the THROW_IF_*
// macros.
BackendModelInstance::BackendModelInstance(
    BackendModel* backend_model,
    TRITONBACKEND_ModelInstance* triton_model_instance)
    : backend_model_(backend_model),
      triton_model_instance_(triton_model_instance)
{
  const char* instance_name;
  THROW_IF_BACKEND_INSTANCE_ERROR(
      TRITONBACKEND_ModelInstanceName(triton_model_instance, &instance_name));
  name_ = instance_name;

  THROW_IF_BACKEND_INSTANCE_ERROR(
      TRITONBACKEND_ModelInstanceKind(triton_model_instance, &kind_));
  THROW_IF_BACKEND_INSTANCE_ERROR(
      TRITONBACKEND_ModelInstanceDeviceId(triton_model_instance, &device_id_));

  common::TritonJson::Value& model_config = backend_model->ModelConfig();

  // If the model configuration specifies a 'default_model_filename'
  // and/or specifies 'cc_model_filenames' then determine the
  // appropriate 'artifact_filename' value. If model configuration
  // does not specify then just leave 'artifact_filename' empty and
  // the backend can then provide its own logic for determine the
  // filename if that is appropriate.
  THROW_IF_BACKEND_INSTANCE_ERROR(model_config.MemberAsString(
      "default_model_filename", &artifact_filename_));
  switch (kind_) {
    case TRITONSERVER_INSTANCEGROUPKIND_CPU: {
      LOG_MESSAGE(
          TRITONSERVER_LOG_VERBOSE,
          (std::string("Creating instance ") + name_ +
           " on CPU using artifact '" + artifact_filename_ + "'")
              .c_str());
      break;
    }
    case TRITONSERVER_INSTANCEGROUPKIND_MODEL: {
      LOG_MESSAGE(
          TRITONSERVER_LOG_VERBOSE,
          (std::string("Creating instance ") + name_ +
           " on model-specified devices using artifact '" +
           artifact_filename_ + "'")
              .c_str());
      break;
    }
    case TRITONSERVER_INSTANCEGROUPKIND_GPU: {
#if defined(TRITON_ENABLE_GPU)
      // Use the device's compute capability ("major.minor") to select a
      // per-CC artifact from 'cc_model_filenames' when one is configured.
      cudaDeviceProp cuprops;
      cudaError_t cuerr = cudaGetDeviceProperties(&cuprops, device_id_);
      if (cuerr != cudaSuccess) {
        throw BackendModelInstanceException(TRITONSERVER_ErrorNew(
            TRITONSERVER_ERROR_INTERNAL,
            (std::string("unable to get CUDA device properties for ") +
             name_ + ": " + cudaGetErrorString(cuerr))
                .c_str()));
      }

      const std::string cc =
          std::to_string(cuprops.major) + "." + std::to_string(cuprops.minor);

      common::TritonJson::Value cc_names;
      common::TritonJson::Value cc_name;
      if ((model_config.Find("cc_model_filenames", &cc_names)) &&
          (cc_names.Find(cc.c_str(), &cc_name))) {
        cc_name.AsString(&artifact_filename_);
      }

      LOG_MESSAGE(
          TRITONSERVER_LOG_VERBOSE,
          (std::string("Creating instance ") + name_ + " on GPU " +
           std::to_string(device_id_) + " (" + cc + ") using artifact '" +
           artifact_filename_ + "'")
              .c_str());
#elif !defined(TRITON_ENABLE_MALI_GPU)
      throw BackendModelInstanceException(TRITONSERVER_ErrorNew(
          TRITONSERVER_ERROR_INTERNAL, "GPU instances not supported"));
#endif  // TRITON_ENABLE_GPU
      break;
    }
    default: {
      throw BackendModelInstanceException(TRITONSERVER_ErrorNew(
          TRITONSERVER_ERROR_INTERNAL,
          (std::string("unexpected instance kind for ") + name_).c_str()));
    }
  }

  stream_ = nullptr;
  if (kind_ == TRITONSERVER_INSTANCEGROUPKIND_GPU) {
    THROW_IF_BACKEND_INSTANCE_ERROR(
        CreateCudaStream(device_id_, 0 /* cuda_stream_priority */, &stream_));
  }

  // Get the host policy setting as a json string from message,
  // and extract the host policy name for the instance.
  TRITONSERVER_Message* message = nullptr;
  THROW_IF_BACKEND_MODEL_ERROR(
      TRITONBACKEND_ModelInstanceHostPolicy(triton_model_instance_, &message));
  const char* buffer;
  size_t byte_size;
  THROW_IF_BACKEND_MODEL_ERROR(
      TRITONSERVER_MessageSerializeToJson(message, &buffer, &byte_size));

  common::TritonJson::Value host_policy;
  TRITONSERVER_Error* err = host_policy.Parse(buffer, byte_size);
  THROW_IF_BACKEND_MODEL_ERROR(err);
  // Restored template argument lost in extraction. The host policy JSON
  // object is expected to contain exactly one member: the policy name.
  std::vector<std::string> host_policy_name;
  THROW_IF_BACKEND_MODEL_ERROR(host_policy.Members(&host_policy_name));
  if (host_policy_name.size() != 1) {
    throw BackendModelInstanceException(TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INTERNAL,
        (std::string("unexpected no host policy for ") + name_).c_str()));
  }
  host_policy_name_ = host_policy_name[0];
}

// Destroy the CUDA stream created for GPU instances; errors are logged
// but not propagated (destructors must not throw).
BackendModelInstance::~BackendModelInstance()
{
#ifdef TRITON_ENABLE_GPU
  if (stream_ != nullptr) {
    cudaError_t err = cudaStreamDestroy(stream_);
    if (err != cudaSuccess) {
      TRITONSERVER_LogMessage(
          TRITONSERVER_LOG_ERROR, __FILE__, __LINE__,
          (std::string("~BackendModelInstance: ") + name_ +
           " failed to destroy cuda stream: " + cudaGetErrorString(err))
              .c_str());
    }
    stream_ = nullptr;
  }
#endif  // TRITON_ENABLE_GPU
}

}}  // namespace triton::backend


================================================
FILE: src/backend_output_responder.cc
================================================
// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//  * Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
//  * Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "triton/backend/backend_output_responder.h" #include "triton/backend/backend_common.h" #include "triton/backend/backend_model.h" #include "triton/backend/backend_model_instance.h" namespace triton { namespace backend { // // BackendOutputResponder // BackendOutputResponder::~BackendOutputResponder() { for (auto& pinned_memory : pinned_memories_) { LOG_IF_ERROR( TRITONBACKEND_MemoryManagerFree( memory_manager_, reinterpret_cast(pinned_memory), TRITONSERVER_MEMORY_CPU_PINNED, 0), "failed to free pinned memory"); } } void BackendOutputResponder::ProcessTensor( const std::string& output_name, const TRITONSERVER_DataType datatype, std::vector& batchn_shape, const char* buffer, const TRITONSERVER_MemoryType memory_type, const int64_t memory_type_id) { // A value of CPU_PINNED indicates that pinned memory buffer is not // needed for this tensor. Any other value indicates that a pinned // memory buffer is needed when the target memory type matches // 'use_pinned_memory_type'. 
TRITONSERVER_MemoryType use_pinned_memory_type = TRITONSERVER_MEMORY_CPU_PINNED; if (pinned_enabled_) { use_pinned_memory_type = GetUsePinnedMemoryType(memory_type); } const int64_t batchn_batch_size = batchn_shape[0]; int64_t batch_size_offset = 0; size_t tensor_offset = 0; for (size_t idx = 0; idx < responses_->size(); idx++) { auto& request = requests_[idx]; auto& response = (*responses_)[idx]; // If then pending copies are from tensor buffer that is not // contiguous with 'response's part of that buffer, then need to // go ahead and perform the pending copies so that can start a // new contiguous region if necessary. if ((pending_pinned_byte_size_ > 0) && (tensor_offset != (pending_pinned_byte_size_ + pending_pinned_offset_))) { need_sync_ |= FlushPendingPinned(buffer, memory_type, memory_type_id); } // Override shape to be correct for this response. if (first_dim_batching_) { TRITONBACKEND_Input* input; TRITONBACKEND_RequestInputByIndex(request, 0, &input); const int64_t* shape; TRITONBACKEND_InputProperties( input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr); if ((batchn_batch_size != -1) && ((batch_size_offset + shape[0]) > batchn_batch_size)) { if (response != nullptr) { RESPOND_AND_SET_NULL_IF_ERROR( &response, TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_UNSUPPORTED, std::string( GetRequestId(request) + "failed to split the output tensor '" + output_name + "' in responses: expected batch size of at least " + std::to_string(batch_size_offset + shape[0]) + " in model output, got " + std::to_string(batchn_batch_size)) .c_str())); } } batchn_shape[0] = shape[0]; batch_size_offset += shape[0]; } int64_t tensor_byte_size = 0; RESPOND_AND_SET_NULL_IF_ERROR( &response, GetByteSize(datatype, batchn_shape, &tensor_byte_size)); TRITONBACKEND_Output* response_output; if (response != nullptr) { uint32_t output_count; RESPOND_AND_SET_NULL_IF_ERROR( &response, TRITONBACKEND_RequestOutputCount(request, &output_count)); if (response != nullptr) { for (uint32_t 
output_idx = 0; output_idx < output_count; output_idx++) { const char* name; RESPOND_AND_SET_NULL_IF_ERROR( &response, TRITONBACKEND_RequestOutputName(request, output_idx, &name)); if ((response != nullptr) && (output_name == name)) { RESPOND_AND_SET_NULL_IF_ERROR( &response, TRITONBACKEND_ResponseOutput( response, &response_output, name, datatype, batchn_shape.data(), batchn_shape.size())); if (response != nullptr) { need_sync_ |= SetFixedSizeBuffer( &response, response_output, output_name, tensor_byte_size, tensor_offset, buffer, memory_type, memory_type_id, use_pinned_memory_type, false /* state */); } break; } } } } tensor_offset += tensor_byte_size; } // Done with the tensor, flush any pending pinned copies. need_sync_ |= FlushPendingPinned(buffer, memory_type, memory_type_id); #ifdef TRITON_ENABLE_GPU if (need_sync_ && (event_ != nullptr)) { cudaEventRecord(event_, stream_); } #endif // TRITON_ENABLE_GPU } std::vector BackendOutputResponder::ProcessStateTensor( const std::string& output_state_name, const TRITONSERVER_DataType datatype, std::vector& batchn_shape, const char* buffer, const TRITONSERVER_MemoryType memory_type, const int64_t memory_type_id) { // A value of CPU_PINNED indicates that pinned memory buffer is not // needed for this tensor. Any other value indicates that a pinned // memory buffer is needed when the target memory type matches // 'use_pinned_memory_type'. 
TRITONSERVER_MemoryType use_pinned_memory_type = TRITONSERVER_MEMORY_CPU_PINNED; if (pinned_enabled_) { use_pinned_memory_type = GetUsePinnedMemoryType(memory_type); } std::vector states; const int64_t batchn_batch_size = batchn_shape[0]; int64_t batch_size_offset = 0; size_t tensor_offset = 0; for (size_t idx = 0; idx < responses_->size(); idx++) { auto& request = requests_[idx]; auto& response = (*responses_)[idx]; // If then pending copies are from tensor buffer that is not // contiguous with 'response's part of that buffer, then need to // go ahead and perform the pending copies so that can start a // new contiguous region if necessary. if ((pending_pinned_byte_size_ > 0) && (tensor_offset != (pending_pinned_byte_size_ + pending_pinned_offset_))) { need_sync_ |= FlushPendingPinned(buffer, memory_type, memory_type_id); } // Override shape to be correct for this response. if (first_dim_batching_) { TRITONBACKEND_Input* input; TRITONBACKEND_RequestInputByIndex(request, 0, &input); const int64_t* shape; TRITONBACKEND_InputProperties( input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr); if ((batchn_batch_size != -1) && ((batch_size_offset + shape[0]) > batchn_batch_size)) { if (response != nullptr) { RESPOND_AND_SET_NULL_IF_ERROR( &response, TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_UNSUPPORTED, std::string( GetRequestId(request) + "failed to split the output state tensor '" + output_state_name + "' in responses: expected batch size of at least " + std::to_string(batch_size_offset + shape[0]) + " in model output, got " + std::to_string(batchn_batch_size)) .c_str())); } } batchn_shape[0] = shape[0]; batch_size_offset += shape[0]; } int64_t tensor_byte_size = 0; RESPOND_AND_SET_NULL_IF_ERROR( &response, GetByteSize(datatype, batchn_shape, &tensor_byte_size)); TRITONBACKEND_State* output_state; if (response != nullptr) { RESPOND_AND_SET_NULL_IF_ERROR( &response, TRITONBACKEND_StateNew( &output_state, request, output_state_name.c_str(), datatype, 
batchn_shape.data(), batchn_shape.size())); if (response != nullptr) { states.push_back(output_state); need_sync_ |= SetFixedSizeBuffer( &response, output_state, output_state_name, tensor_byte_size, tensor_offset, buffer, memory_type, memory_type_id, use_pinned_memory_type, true /* state */); } } tensor_offset += tensor_byte_size; } // Done with the tensor, flush any pending pinned copies. need_sync_ |= FlushPendingPinned(buffer, memory_type, memory_type_id); #ifdef TRITON_ENABLE_GPU if (need_sync_ && (event_ != nullptr)) { cudaEventRecord(event_, stream_); } #endif // TRITON_ENABLE_GPU return states; } bool BackendOutputResponder::Finalize() { #ifdef TRITON_ENABLE_GPU if ((!deferred_pinned_.empty()) && need_sync_) { if (event_ != nullptr) { cudaEventSynchronize(event_); } else { cudaStreamSynchronize(stream_); } need_sync_ = false; } #endif // TRITON_ENABLE_GPU // After the above sync all the GPU->pinned copies are complete. Any // deferred copies of pinned->CPU can now be done. for (auto& def : deferred_pinned_) { auto pinned_memory_type = TRITONSERVER_MEMORY_CPU_PINNED; int64_t pinned_memory_id = 0; char* pinned_buffer = def.pinned_memory_; size_t offset = 0; for (auto& pr : def.responses_) { auto& response = pr.first; auto& response_output = pr.second; bool cuda_used = false; RESPOND_AND_SET_NULL_IF_ERROR( response, CopyBuffer( response_output.name_, pinned_memory_type, pinned_memory_id, response_output.memory_type_, response_output.memory_type_id_, response_output.buffer_byte_size_, pinned_buffer + offset, const_cast(response_output.buffer_), stream_, &cuda_used, copy_on_stream_)); need_sync_ |= cuda_used; offset += response_output.buffer_byte_size_; } } #ifdef TRITON_ENABLE_GPU // Record the new event location if deferred copies occur if ((!deferred_pinned_.empty()) && need_sync_ && (event_ != nullptr)) { cudaEventRecord(event_, stream_); } #endif // TRITON_ENABLE_GPU deferred_pinned_.clear(); return need_sync_; } bool 
BackendOutputResponder::SetFixedSizeBuffer( TRITONBACKEND_Response** response, void* response_output_or_state, const std::string& output_name, const size_t tensor_byte_size, const size_t tensor_offset, const char* tensor_buffer, const TRITONSERVER_MemoryType tensor_memory_type, const int64_t tensor_memory_type_id, const TRITONSERVER_MemoryType use_pinned_memory_type, bool state) { void* buffer = nullptr; bool cuda_copy = false; TRITONSERVER_MemoryType actual_memory_type = tensor_memory_type; int64_t actual_memory_type_id = tensor_memory_type_id; if (state) { TRITONBACKEND_State* response_state = reinterpret_cast(response_output_or_state); auto err = TRITONBACKEND_StateBuffer( response_state, &buffer, tensor_byte_size, &actual_memory_type, &actual_memory_type_id); if (err != nullptr) { RESPOND_AND_SET_NULL_IF_ERROR(response, err); return cuda_copy; } } else { TRITONBACKEND_Output* response_output = reinterpret_cast(response_output_or_state); auto err = TRITONBACKEND_OutputBuffer( response_output, &buffer, tensor_byte_size, &actual_memory_type, &actual_memory_type_id); if (err != nullptr) { RESPOND_AND_SET_NULL_IF_ERROR(response, err); return cuda_copy; } } // If the response buffer matches the memory type that should use an // intermediate pinned memory buffer for the transfer, then just // record the response as pending and increase the size required for // the intermediate pinned buffer. if ((use_pinned_memory_type != TRITONSERVER_MEMORY_CPU_PINNED) && (actual_memory_type == use_pinned_memory_type)) { if (pending_pinned_byte_size_ == 0) { pending_pinned_offset_ = tensor_offset; } pending_pinned_byte_size_ += tensor_byte_size; pending_pinned_outputs_.push_back(std::make_pair( response, OutputData( output_name, buffer, tensor_byte_size, actual_memory_type, actual_memory_type_id))); } else { // Direct copy without intermediate pinned memory. 
bool cuda_used = false; auto err = CopyBuffer( output_name, tensor_memory_type, tensor_memory_type_id, actual_memory_type, actual_memory_type_id, tensor_byte_size, tensor_buffer + tensor_offset, buffer, stream_, &cuda_used, copy_on_stream_); cuda_copy |= cuda_used; if (err != nullptr) { RESPOND_AND_SET_NULL_IF_ERROR(response, err); return cuda_copy; } } return cuda_copy; } bool BackendOutputResponder::FlushPendingPinned( const char* tensor_buffer, const TRITONSERVER_MemoryType tensor_memory_type, const int64_t tensor_memory_type_id) { bool cuda_copy = false; // Will be copying from CPU->pinned->GPU or GPU->pinned->CPU // Attempt to allocate a pinned buffer to use for staging the // copy... if we fail to allocated the pinned buffer then we just // directly go CPU->GPU or GPU->CPU. char* pinned_memory = nullptr; if (pending_pinned_byte_size_ > 0) { TRITONSERVER_Error* err = TRITONBACKEND_MemoryManagerAllocate( memory_manager_, reinterpret_cast(&pinned_memory), TRITONSERVER_MEMORY_CPU_PINNED, 0 /* memory_type_id */, pending_pinned_byte_size_); if (err != nullptr) { pinned_memory = nullptr; TRITONSERVER_ErrorDelete(err); } } // If the pinned buffer wasn't actually allocated then just perform // a direct copy. if (pinned_memory == nullptr) { size_t offset = 0; for (auto& pr : pending_pinned_outputs_) { auto& response = pr.first; auto& response_output = pr.second; bool cuda_used = false; RESPOND_AND_SET_NULL_IF_ERROR( response, CopyBuffer( response_output.name_, tensor_memory_type, tensor_memory_type_id, response_output.memory_type_, response_output.memory_type_id_, response_output.buffer_byte_size_, tensor_buffer + pending_pinned_offset_ + offset, const_cast(response_output.buffer_), stream_, &cuda_used, copy_on_stream_)); cuda_copy |= cuda_used; offset += response_output.buffer_byte_size_; } } // We have a pinned buffer so do a single copy of a block of tensor // data to the pinned buffer. 
else { // pinned_memory_type == TRITONSERVER_MEMORY_CPU_PINNED bool cuda_used = false; auto err = CopyBuffer( "pinned buffer", tensor_memory_type, tensor_memory_type_id, TRITONSERVER_MEMORY_CPU_PINNED, 0 /* memory_type_id */, pending_pinned_byte_size_, tensor_buffer + pending_pinned_offset_, pinned_memory, stream_, &cuda_used, copy_on_stream_); cuda_copy |= cuda_used; // If something goes wrong with the copy all the pending // responses fail... if (err != nullptr) { for (auto& pr : pending_pinned_outputs_) { auto& response = pr.first; if (*response != nullptr) { LOG_IF_ERROR( TRITONBACKEND_ResponseSend( *response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, err), "failed to send TensorFlow error response"); *response = nullptr; } } TRITONSERVER_ErrorDelete(err); } // If the copy was not async (i.e. if tensor was in CPU so a // CPU->CPU-PINNED copy was performed above), then the pinned // buffer now holds the tensor contents and we can immediately // issue the copies from the pinned buffer to the // responses. // // Otherwise the GPU->CPU-PINNED async copies are in flight and we // simply remember the pinned buffer and the corresponding // response outputs so that we can do the pinned->CPU copies in // finalize after we have waited for all async copies to complete. 
if (!cuda_used) { size_t offset = 0; for (auto& pr : pending_pinned_outputs_) { auto& response = pr.first; auto& response_output = pr.second; bool cuda_used = false; RESPOND_AND_SET_NULL_IF_ERROR( response, CopyBuffer( response_output.name_, TRITONSERVER_MEMORY_CPU_PINNED, 0 /* memory_type_id */, response_output.memory_type_, response_output.memory_type_id_, response_output.buffer_byte_size_, pinned_memory + offset, const_cast(response_output.buffer_), stream_, &cuda_used, copy_on_stream_)); cuda_copy |= cuda_used; offset += response_output.buffer_byte_size_; } } else { deferred_pinned_.emplace_back( pinned_memory, pending_pinned_byte_size_, std::move(pending_pinned_outputs_)); } } // Pending pinned copies are handled... pending_pinned_byte_size_ = 0; pending_pinned_offset_ = 0; pending_pinned_outputs_.clear(); // Need to hold on to the allocated pinned buffer as there are still // copies in flight. Will delete it in finalize. if (pinned_memory != nullptr) { pinned_memories_.push_back(pinned_memory); } return cuda_copy; } void BackendOutputResponder::ProcessBatchOutput( const std::string& name, const BatchOutput& batch_output, const char* buffer, const TRITONSERVER_MemoryType memory_type, const int64_t memory_type_id) { // A value of CPU_PINNED indicates that pinned memory buffer is not // needed for this tensor. Any other value indicates that a pinned // memory buffer is needed when the target memory type matches // 'use_pinned_memory_type'. 
TRITONSERVER_MemoryType use_pinned_memory_type =
      TRITONSERVER_MEMORY_CPU_PINNED;
  if (pinned_enabled_) {
    use_pinned_memory_type = GetUsePinnedMemoryType(memory_type);
  }

  // Batch output may be processed differently based on the kind
  switch (batch_output.BatchOutputKind()) {
    case BatchOutput::Kind::BATCH_SCATTER_WITH_INPUT_SHAPE: {
      const auto& output_name = batch_output.TargetNames()[0];
      const auto& input_name = batch_output.SourceInputs()[0];
      const auto& datatype = batch_output.DataType();
      size_t tensor_offset = 0;

      for (size_t idx = 0; idx < responses_->size(); idx++) {
        auto& request = requests_[idx];
        auto& response = (*responses_)[idx];

        // If then pending copies are from tensor buffer that is not
        // contiguous with 'response's part of that buffer, then need to
        // go ahead and perform the pending copies so that can start a
        // new contiguous region if necessary.
        if ((pending_pinned_byte_size_ > 0) &&
            (tensor_offset !=
             (pending_pinned_byte_size_ + pending_pinned_offset_))) {
          need_sync_ |=
              FlushPendingPinned(buffer, memory_type, memory_type_id);
        }

        // Override shape to be correct for this response, with a naive
        // assumption that the dynamic dimension in output is mapped to the
        // same dimension in the input
        auto output_batchn_shape = batch_output.OutputShape();
        {
          TRITONBACKEND_Input* input;
          TRITONBACKEND_RequestInput(request, input_name.c_str(), &input);
          const int64_t* shape;
          TRITONBACKEND_InputProperties(
              input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr);
          // Fill each -1 (dynamic) output dimension from the same-indexed
          // dimension of this request's source input.
          for (size_t dim_idx = 0; dim_idx < output_batchn_shape.size();
               dim_idx++) {
            if (output_batchn_shape[dim_idx] == -1) {
              output_batchn_shape[dim_idx] = shape[dim_idx];
            }
          }
        }

        int64_t tensor_byte_size = 0;
        RESPOND_AND_SET_NULL_IF_ERROR(
            &response,
            GetByteSize(datatype, output_batchn_shape, &tensor_byte_size));

        // Only create the output in the response if the request asked for
        // it by name.
        TRITONBACKEND_Output* response_output;
        if (response != nullptr) {
          uint32_t output_count;
          RESPOND_AND_SET_NULL_IF_ERROR(
              &response,
              TRITONBACKEND_RequestOutputCount(request, &output_count));
          if (response != nullptr) {
            for (uint32_t output_idx = 0; output_idx < output_count;
                 output_idx++) {
              const char* name;
              RESPOND_AND_SET_NULL_IF_ERROR(
                  &response,
                  TRITONBACKEND_RequestOutputName(request, output_idx, &name));
              if ((response != nullptr) && (output_name == name)) {
                RESPOND_AND_SET_NULL_IF_ERROR(
                    &response,
                    TRITONBACKEND_ResponseOutput(
                        response, &response_output, name, datatype,
                        output_batchn_shape.data(),
                        output_batchn_shape.size()));
                if (response != nullptr) {
                  need_sync_ |= SetFixedSizeBuffer(
                      &response, response_output, output_name,
                      tensor_byte_size, tensor_offset, buffer, memory_type,
                      memory_type_id, use_pinned_memory_type,
                      false /* state */);
                }
                break;
              }
            }
          }
        }

        tensor_offset += tensor_byte_size;
      }
      break;
    }
  }

  // Done with the tensor, flush any pending pinned copies.
  need_sync_ |= FlushPendingPinned(buffer, memory_type, memory_type_id);
#ifdef TRITON_ENABLE_GPU
  if (need_sync_ && (event_ != nullptr)) {
    cudaEventRecord(event_, stream_);
  }
#endif  // TRITON_ENABLE_GPU
}

}}  // namespace triton::backend


================================================
FILE: src/device_memory_tracker.cc
================================================
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//  * Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
//  * Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//  * Neither the name of NVIDIA CORPORATION nor the names of its
//    contributors may be used to endorse or promote products derived
//    from this software without specific prior written permission.
// // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "triton/backend/device_memory_tracker.h" #include #include #include "triton/core/tritonserver.h" namespace triton { namespace backend { std::unique_ptr DeviceMemoryTracker::tracker_{nullptr}; // Boilerplate from CUPTI examples namespace { #define LOG_IF_CUPTI_ERR(call) \ do { \ CUptiResult _status = call; \ if (_status != CUPTI_SUCCESS) { \ const char* errstr; \ cuptiGetResultString(_status, &errstr); \ LOG_ERROR << #call << " failed with error: " << errstr; \ } \ } while (0) #define THROW_IF_CUPTI_ERR(call) \ do { \ CUptiResult _status = call; \ if (_status != CUPTI_SUCCESS) { \ const char* errstr; \ cuptiGetResultString(_status, &errstr); \ throw std::runtime_error( \ std::string(#call) + " failed with error: " + errstr); \ } \ } while (0) #define BUF_SIZE (32 * 1024) #define ALIGN_SIZE (8) #define ALIGN_BUFFER(buffer, align) \ (((uintptr_t)(buffer) & ((align)-1)) \ ? 
((buffer) + (align) - ((uintptr_t)(buffer) & ((align)-1))) \ : (buffer)) void bufferRequested(uint8_t** buffer, size_t* size, size_t* maxNumRecords) { uint8_t* bfr = (uint8_t*)malloc(BUF_SIZE + ALIGN_SIZE); if (bfr != nullptr) { *size = BUF_SIZE; *buffer = ALIGN_BUFFER(bfr, ALIGN_SIZE); *maxNumRecords = 0; } else { LOG_ERROR << "Failed to allocate buffer for CUPTI: out of memory"; } } void bufferCompleted( CUcontext ctx, uint32_t streamId, uint8_t* buffer, size_t size, size_t validSize) { CUptiResult status; CUpti_Activity* record = nullptr; if (validSize > 0) { do { status = cuptiActivityGetNextRecord(buffer, validSize, &record); if (status == CUPTI_SUCCESS) { DeviceMemoryTracker::TrackActivity(record); } else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) break; else { LOG_IF_CUPTI_ERR(status); } } while (1); // report any records dropped from the queue size_t dropped = 0; LOG_IF_CUPTI_ERR( cuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped)); if (dropped != 0) { LOG_WARNING << "Dropped " << dropped << " activity records"; } } free(buffer); } } // namespace DeviceMemoryTracker::DeviceMemoryTracker() { cudaError_t cuerr = cudaGetDeviceCount(&device_cnt_); if ((cuerr == cudaErrorNoDevice) || (cuerr == cudaErrorInsufficientDriver)) { device_cnt_ = 0; } else if (cuerr != cudaSuccess) { throw std::runtime_error( "Unexpected failure on getting CUDA device count."); } // Use 'cuptiSubscribe' to check if the cupti has been initialized // elsewhere. Due to cupti limitation, there can only be one cupti client // within the process, so in the case of per-backend memory tracking, we // have to make the assumption that the other cupti client is using the same // memory tracker implementation so that the backend may use the cupti // configuration that is external to the backend without issue. 
auto cupti_res = cuptiSubscribe(&subscriber_, nullptr, nullptr); switch (cupti_res) { case CUPTI_SUCCESS: { THROW_IF_CUPTI_ERR( cuptiActivityRegisterCallbacks(bufferRequested, bufferCompleted)); THROW_IF_CUPTI_ERR(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME)); THROW_IF_CUPTI_ERR(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMORY2)); THROW_IF_CUPTI_ERR( cuptiActivityEnable(CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION)); break; } case CUPTI_ERROR_MULTIPLE_SUBSCRIBERS_NOT_SUPPORTED: { LOG_WARNING << "CUPTI has been initialized elsewhere, assuming the " "implementation is the same"; break; } default: { // other error, should propagate and disable memory tracking for the // backend const char* errstr; cuptiGetResultString(cupti_res, &errstr); throw std::runtime_error( std::string("Unexpected failure on configuring CUPTI: ") + errstr); } } } DeviceMemoryTracker::~DeviceMemoryTracker() { if (subscriber_) { cuptiUnsubscribe(subscriber_); } } int DeviceMemoryTracker::CudaDeviceCount() { if (tracker_) { return tracker_->device_cnt_; } throw std::runtime_error( "DeviceMemoryTracker::Init() must be called before using any " "DeviceMemoryTracker features."); } bool DeviceMemoryTracker::Init() { if (tracker_ == nullptr) { try { tracker_.reset(new DeviceMemoryTracker()); } catch (const std::runtime_error& ex) { // Fail initialization LOG_ERROR << ex.what(); return false; } } return true; } void DeviceMemoryTracker::Fini() { tracker_.reset(); } void DeviceMemoryTracker::TrackThreadMemoryUsage(MemoryUsage* usage) { if (!usage) { return; } if (tracker_) { THROW_IF_CUPTI_ERR(cuptiActivityPushExternalCorrelationId( CUPTI_EXTERNAL_CORRELATION_KIND_UNKNOWN, reinterpret_cast(&usage->cupti_tracker_))); usage->tracked_ = true; } else { throw std::runtime_error( "DeviceMemoryTracker::Init() must be called before using any " "DeviceMemoryTracker features."); } } void DeviceMemoryTracker::UntrackThreadMemoryUsage(MemoryUsage* usage) { if (!usage) { return; } if (tracker_) { 
THROW_IF_CUPTI_ERR(cuptiActivityFlushAll(0)); uint64_t id = 0; THROW_IF_CUPTI_ERR(cuptiActivityPopExternalCorrelationId( CUPTI_EXTERNAL_CORRELATION_KIND_UNKNOWN, &id)); usage->tracked_ = false; } else { throw std::runtime_error( "DeviceMemoryTracker::Init() must be called before using any " "DeviceMemoryTracker features."); } } void DeviceMemoryTracker::TrackActivityInternal(CUpti_Activity* record) { switch (record->kind) { case CUPTI_ACTIVITY_KIND_MEMORY2: { CUpti_ActivityMemory3* memory_record = (CUpti_ActivityMemory3*)record; TRITONBACKEND_CuptiTracker* usage = nullptr; { std::lock_guard lk(mtx_); auto it = activity_to_memory_usage_.find(memory_record->correlationId); if (it != activity_to_memory_usage_.end()) { usage = reinterpret_cast(it->second); activity_to_memory_usage_.erase(it); } } const bool is_allocation = (memory_record->memoryOperationType == CUPTI_ACTIVITY_MEMORY_OPERATION_TYPE_ALLOCATION); const bool is_release = (memory_record->memoryOperationType == CUPTI_ACTIVITY_MEMORY_OPERATION_TYPE_RELEASE); // Ignore memory record that is not associated with a // TRITONBACKEND_CuptiTracker object or not related to allocations if ((usage == nullptr) || (!usage->valid_) || (!is_allocation && !is_release)) { break; } switch (memory_record->memoryKind) { case CUPTI_ACTIVITY_MEMORY_KIND_DEVICE: { usage->valid_ = UpdateMemoryTypeUsage( memory_record, is_allocation, usage->cuda_memory_usage_byte_, usage->cuda_array_len_); break; } case CUPTI_ACTIVITY_MEMORY_KIND_PINNED: { usage->valid_ = UpdateMemoryTypeUsage( memory_record, is_allocation, usage->pinned_memory_usage_byte_, usage->pinned_array_len_); break; } case CUPTI_ACTIVITY_MEMORY_KIND_PAGEABLE: { usage->valid_ = UpdateMemoryTypeUsage( memory_record, is_allocation, usage->system_memory_usage_byte_, usage->system_array_len_); break; } default: LOG_WARNING << "Unrecognized type of memory is allocated, kind " << memory_record->memoryKind; usage->valid_ = false; break; } break; } case 
CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION: { CUpti_ActivityExternalCorrelation* corr = (CUpti_ActivityExternalCorrelation*)record; if (CUPTI_EXTERNAL_CORRELATION_KIND_UNKNOWN == corr->externalKind) { std::lock_guard lk(mtx_); activity_to_memory_usage_[corr->correlationId] = static_cast(corr->externalId); } break; } case CUPTI_ACTIVITY_KIND_RUNTIME: { // DO NOTHING, runtime API will be captured and reported to properly // initialize records for CUPTI_ACTIVITY_KIND_MEMORY2. break; } default: LOG_ERROR << "Unexpected capture of cupti record, kind: " << record->kind; break; } } inline bool DeviceMemoryTracker::UpdateMemoryTypeUsage( CUpti_ActivityMemory3* memory_record, const bool is_allocation, int64_t* memory_usage, uint32_t usage_len) { if (memory_record->deviceId >= usage_len) { return false; } if (is_allocation) { memory_usage[memory_record->deviceId] += memory_record->bytes; } else { memory_usage[memory_record->deviceId] -= memory_record->bytes; } return true; } }} // namespace triton::backend ================================================ FILE: src/kernel.cu ================================================ // Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. 
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// NOTE(review): this include target was lost in extraction (angle brackets
// stripped); <stdint.h> covers the fixed-width types used below — confirm
// against upstream.
#include <stdint.h>

#include "kernel.h"

#define THREADBLOCK_SIZE 512

// Gather per-request input buffers into one contiguous output buffer.
// Launch layout (see RunGatherKernel): one thread block per request, so
// blockIdx.x is the request index and THREADBLOCK_SIZE threads stride over
// that request's bytes. When the byte size and both pointers are 4-byte
// aligned the copy is widened to 32-bit loads/stores; otherwise it falls
// back to byte copies through the read-only cache (__ldg).
__launch_bounds__(THREADBLOCK_SIZE) __global__ void
TritonGatherKernel(
    const int8_t** __restrict input_ptr_buffer,
    const size_t* __restrict byte_size_buffer,
    const size_t* __restrict byte_size_offset_buffer,
    int8_t* __restrict output_buffer)
{
  int request_idx = blockIdx.x;
  int lane_id = threadIdx.x;
  const int8_t* request_input_buffer = input_ptr_buffer[request_idx];
  // NOTE(review): size_t narrowed to int here (as in the original) — assumes
  // each request's byte size and offset fit in 31 bits; confirm upstream.
  int byte_size = byte_size_buffer[request_idx];
  int byte_size_offset = byte_size_offset_buffer[request_idx];
  int8_t* output_buffer_with_offset = output_buffer + byte_size_offset;
  if (((byte_size % 4) == 0) && (((uint64_t)request_input_buffer % 4) == 0) &&
      (((uint64_t)output_buffer_with_offset % 4) == 0)) {
    // Fast path: vectorized 4-byte copy.
    int32_t* input_4 = (int32_t*)request_input_buffer;
    int32_t* output_4 = (int32_t*)output_buffer_with_offset;
    int element_count = byte_size / 4;
    for (int elem_id = lane_id; elem_id < element_count;
         elem_id += THREADBLOCK_SIZE) {
      output_4[elem_id] = input_4[elem_id];
    }
  } else {
    // Slow path: byte-wise copy.
    for (int elem_id = lane_id; elem_id < byte_size;
         elem_id += THREADBLOCK_SIZE) {
      output_buffer_with_offset[elem_id] =
          __ldg(request_input_buffer + elem_id);
    }
  }
}

#ifdef __cplusplus
extern "C" {
#endif

// Launch TritonGatherKernel on 'stream' with one block per request. All
// pointer arguments must be device pointers. Returns the launch status via
// cudaGetLastError(); the copy itself completes asynchronously on 'stream'.
cudaError_t
RunGatherKernel(
    const int8_t** input_ptr_buffer, const size_t* byte_size_buffer,
    const size_t* byte_size_offset_buffer, int8_t* output_buffer,
    size_t request_count, cudaStream_t stream)
{
  // The launch configuration was stripped by extraction; restored here:
  // grid = request_count blocks (the kernel indexes requests by blockIdx.x),
  // block = THREADBLOCK_SIZE threads, no dynamic shared memory, on 'stream'.
  TritonGatherKernel<<<request_count, THREADBLOCK_SIZE, 0, stream>>>(
      input_ptr_buffer, byte_size_buffer, byte_size_offset_buffer,
      output_buffer);
  return cudaGetLastError();
}

#ifdef __cplusplus
}
#endif


================================================
FILE: src/kernel.h
================================================
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//  * Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
//  * Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//  * Neither the name of NVIDIA CORPORATION nor the names of its
//    contributors may be used to endorse or promote products derived
//    from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma once #include #include #ifdef __cplusplus extern "C" { #endif cudaError_t RunGatherKernel( const int8_t** input_ptr_buffer, const size_t* byte_size_buffer, const size_t* byte_size_offset_buffer, int8_t* output_buffer, size_t request_count, cudaStream_t stream); #ifdef __cplusplus } #endif