Repository: pytorch/tensorpipe Branch: main Commit: b4b77d1006e7 Files: 292 Total size: 1.2 MB Directory structure: gitextract_wzzfsv6c/ ├── .circleci/ │ ├── Dockerfile.cuda10.1 │ ├── Dockerfile.cuda10.2 │ ├── Dockerfile.cuda11.0 │ ├── Dockerfile.cuda11.1 │ ├── Dockerfile.cuda9.2 │ └── config.yml ├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE.txt ├── README.md ├── cmake/ │ ├── FindPackageHandleStandardArgs.cmake │ ├── FindPackageMessage.cmake │ ├── Finduv.cmake │ ├── MiscCheck.cmake │ ├── Options.cmake │ └── Sanitize.cmake ├── docs/ │ ├── cuda_gotchas.md │ ├── development.md │ ├── linux_support.md │ ├── shm.md │ └── thread_model.md ├── setup.py ├── tensorpipe/ │ ├── .clang-format │ ├── .clang-tidy │ ├── CMakeLists.txt │ ├── benchmark/ │ │ ├── CMakeLists.txt │ │ ├── benchmark_pipe.cc │ │ ├── benchmark_transport.cc │ │ ├── channel_registry.cc │ │ ├── channel_registry.h │ │ ├── measurements.h │ │ ├── options.cc │ │ ├── options.h │ │ ├── registry.h │ │ ├── transport_registry.cc │ │ └── transport_registry.h │ ├── channel/ │ │ ├── basic/ │ │ │ ├── channel_impl.cc │ │ │ ├── channel_impl.h │ │ │ ├── context_impl.cc │ │ │ ├── context_impl.h │ │ │ ├── factory.cc │ │ │ └── factory.h │ │ ├── channel.h │ │ ├── channel_boilerplate.h │ │ ├── channel_impl_boilerplate.h │ │ ├── cma/ │ │ │ ├── channel_impl.cc │ │ │ ├── channel_impl.h │ │ │ ├── context_impl.cc │ │ │ ├── context_impl.h │ │ │ ├── factory.cc │ │ │ └── factory.h │ │ ├── context.h │ │ ├── context_boilerplate.h │ │ ├── context_impl_boilerplate.h │ │ ├── cuda_basic/ │ │ │ ├── channel_impl.cc │ │ │ ├── channel_impl.h │ │ │ ├── constants.h │ │ │ ├── context_impl.cc │ │ │ ├── context_impl.h │ │ │ ├── factory.cc │ │ │ └── factory.h │ │ ├── cuda_gdr/ │ │ │ ├── channel_impl.cc │ │ │ ├── channel_impl.h │ │ │ ├── constants.h │ │ │ ├── context_impl.cc │ │ │ ├── context_impl.h │ │ │ ├── error.h │ │ │ ├── factory.cc │ │ │ └── factory.h │ │ ├── cuda_ipc/ │ │ │ ├── 
channel_impl.cc │ │ │ ├── channel_impl.h │ │ │ ├── constants.h │ │ │ ├── context_impl.cc │ │ │ ├── context_impl.h │ │ │ ├── factory.cc │ │ │ └── factory.h │ │ ├── cuda_xth/ │ │ │ ├── channel_impl.cc │ │ │ ├── channel_impl.h │ │ │ ├── context_impl.cc │ │ │ ├── context_impl.h │ │ │ ├── factory.cc │ │ │ └── factory.h │ │ ├── error.cc │ │ ├── error.h │ │ ├── helpers.cc │ │ ├── helpers.h │ │ ├── mpt/ │ │ │ ├── channel_impl.cc │ │ │ ├── channel_impl.h │ │ │ ├── context_impl.cc │ │ │ ├── context_impl.h │ │ │ ├── factory.cc │ │ │ ├── factory.h │ │ │ └── nop_types.h │ │ └── xth/ │ │ ├── channel_impl.cc │ │ ├── channel_impl.h │ │ ├── context_impl.cc │ │ ├── context_impl.h │ │ ├── factory.cc │ │ └── factory.h │ ├── common/ │ │ ├── address.cc │ │ ├── address.h │ │ ├── allocator.cc │ │ ├── allocator.h │ │ ├── buffer.h │ │ ├── busy_polling_loop.h │ │ ├── callback.h │ │ ├── cpu_buffer.h │ │ ├── cuda.h │ │ ├── cuda_buffer.cc │ │ ├── cuda_buffer.h │ │ ├── cuda_lib.h │ │ ├── cuda_loop.cc │ │ ├── cuda_loop.h │ │ ├── deferred_executor.h │ │ ├── defs.h │ │ ├── device.h │ │ ├── dl.h │ │ ├── epoll_loop.cc │ │ ├── epoll_loop.h │ │ ├── error.cc │ │ ├── error.h │ │ ├── error_macros.h │ │ ├── fd.cc │ │ ├── fd.h │ │ ├── ibv.cc │ │ ├── ibv.h │ │ ├── ibv_lib.h │ │ ├── memory.h │ │ ├── nop.h │ │ ├── nvml_lib.h │ │ ├── optional.h │ │ ├── queue.h │ │ ├── ringbuffer.h │ │ ├── ringbuffer_read_write_ops.h │ │ ├── ringbuffer_role.h │ │ ├── shm_ringbuffer.h │ │ ├── shm_segment.cc │ │ ├── shm_segment.h │ │ ├── socket.cc │ │ ├── socket.h │ │ ├── state_machine.h │ │ ├── stream_read_write_ops.h │ │ ├── strings.h │ │ ├── system.cc │ │ └── system.h │ ├── config.h.in │ ├── config_cuda.h.in │ ├── core/ │ │ ├── context.cc │ │ ├── context.h │ │ ├── context_impl.cc │ │ ├── context_impl.h │ │ ├── error.cc │ │ ├── error.h │ │ ├── listener.cc │ │ ├── listener.h │ │ ├── listener_impl.cc │ │ ├── listener_impl.h │ │ ├── message.h │ │ ├── nop_types.h │ │ ├── pipe.cc │ │ ├── pipe.h │ │ ├── pipe_impl.cc │ │ └── 
pipe_impl.h │ ├── misc/ │ │ ├── CMakeLists.txt │ │ └── dump_state_machine.cc │ ├── python/ │ │ ├── CMakeLists.txt │ │ └── tensorpipe.cc │ ├── tensorpipe.h │ ├── tensorpipe_cuda.h │ ├── test/ │ │ ├── CMakeLists.txt │ │ ├── channel/ │ │ │ ├── basic/ │ │ │ │ └── basic_test.cc │ │ │ ├── channel_test.cc │ │ │ ├── channel_test.h │ │ │ ├── channel_test_cpu.cc │ │ │ ├── channel_test_cpu.h │ │ │ ├── channel_test_cuda.cc │ │ │ ├── channel_test_cuda.h │ │ │ ├── channel_test_cuda_multi_gpu.cc │ │ │ ├── channel_test_cuda_xdtt.cc │ │ │ ├── cma/ │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── cma_test.cc │ │ │ │ ├── docker_tests.sh │ │ │ │ ├── probe.cc │ │ │ │ └── probe_report_checker.py │ │ │ ├── cuda_basic/ │ │ │ │ └── cuda_basic_test.cc │ │ │ ├── cuda_gdr/ │ │ │ │ └── cuda_gdr_test.cc │ │ │ ├── cuda_helpers.h │ │ │ ├── cuda_ipc/ │ │ │ │ └── cuda_ipc_test.cc │ │ │ ├── cuda_xth/ │ │ │ │ └── cuda_xth_test.cc │ │ │ ├── kernel.cu │ │ │ ├── kernel.cuh │ │ │ ├── mpt/ │ │ │ │ └── mpt_test.cc │ │ │ └── xth/ │ │ │ └── xth_test.cc │ │ ├── common/ │ │ │ ├── cuda_test.cc │ │ │ ├── defs_test.cc │ │ │ ├── epoll_loop_test.cc │ │ │ ├── ringbuffer_test.cc │ │ │ ├── shm_ringbuffer_test.cc │ │ │ ├── shm_segment_test.cc │ │ │ └── system_test.cc │ │ ├── core/ │ │ │ ├── context_test.cc │ │ │ ├── listener_test.cc │ │ │ ├── pipe_cuda_test.cc │ │ │ ├── pipe_test.cc │ │ │ └── pipe_test.h │ │ ├── peer_group.h │ │ ├── python/ │ │ │ └── tensorpipe.py │ │ ├── test.cc │ │ ├── test_environment.cc │ │ ├── test_environment.h │ │ └── transport/ │ │ ├── connection_test.cc │ │ ├── context_test.cc │ │ ├── ibv/ │ │ │ ├── connection_test.cc │ │ │ ├── context_test.cc │ │ │ ├── ibv_test.cc │ │ │ ├── ibv_test.h │ │ │ └── sockaddr_test.cc │ │ ├── listener_test.cc │ │ ├── shm/ │ │ │ ├── connection_test.cc │ │ │ ├── listener_test.cc │ │ │ ├── reactor_test.cc │ │ │ ├── shm_test.cc │ │ │ ├── shm_test.h │ │ │ └── sockaddr_test.cc │ │ ├── transport_test.h │ │ └── uv/ │ │ ├── connection_test.cc │ │ ├── context_test.cc │ │ ├── 
loop_test.cc │ │ ├── sockaddr_test.cc │ │ ├── uv_test.cc │ │ └── uv_test.h │ └── transport/ │ ├── connection.h │ ├── connection_boilerplate.h │ ├── connection_impl_boilerplate.h │ ├── context.h │ ├── context_boilerplate.h │ ├── context_impl_boilerplate.h │ ├── error.cc │ ├── error.h │ ├── ibv/ │ │ ├── connection_impl.cc │ │ ├── connection_impl.h │ │ ├── constants.h │ │ ├── context_impl.cc │ │ ├── context_impl.h │ │ ├── error.cc │ │ ├── error.h │ │ ├── factory.cc │ │ ├── factory.h │ │ ├── listener_impl.cc │ │ ├── listener_impl.h │ │ ├── reactor.cc │ │ ├── reactor.h │ │ ├── sockaddr.cc │ │ ├── sockaddr.h │ │ ├── utility.cc │ │ └── utility.h │ ├── listener.h │ ├── listener_boilerplate.h │ ├── listener_impl_boilerplate.h │ ├── shm/ │ │ ├── connection_impl.cc │ │ ├── connection_impl.h │ │ ├── context_impl.cc │ │ ├── context_impl.h │ │ ├── factory.cc │ │ ├── factory.h │ │ ├── listener_impl.cc │ │ ├── listener_impl.h │ │ ├── reactor.cc │ │ ├── reactor.h │ │ ├── sockaddr.cc │ │ └── sockaddr.h │ └── uv/ │ ├── connection_impl.cc │ ├── connection_impl.h │ ├── context_impl.cc │ ├── context_impl.h │ ├── error.cc │ ├── error.h │ ├── factory.cc │ ├── factory.h │ ├── listener_impl.cc │ ├── listener_impl.h │ ├── loop.cc │ ├── loop.h │ ├── sockaddr.cc │ ├── sockaddr.h │ ├── utility.cc │ ├── utility.h │ └── uv.h └── third_party/ └── README.md ================================================ FILE CONTENTS ================================================ ================================================ FILE: .circleci/Dockerfile.cuda10.1 ================================================ FROM nvidia/cuda:10.1-devel-ubuntu18.04 # Install APT packages. RUN apt-get update && \ apt-get install -y build-essential cmake COPY . /tensorpipe WORKDIR /tensorpipe ================================================ FILE: .circleci/Dockerfile.cuda10.2 ================================================ FROM nvidia/cuda:10.2-devel-ubuntu18.04 # Install APT packages. 
RUN apt-get update && \ apt-get install -y build-essential cmake COPY . /tensorpipe WORKDIR /tensorpipe ================================================ FILE: .circleci/Dockerfile.cuda11.0 ================================================ FROM nvidia/cuda:11.0-devel-ubuntu18.04 # Install APT packages. RUN apt-get update && \ apt-get install -y build-essential cmake COPY . /tensorpipe WORKDIR /tensorpipe ================================================ FILE: .circleci/Dockerfile.cuda11.1 ================================================ FROM nvidia/cuda:11.1-devel-ubuntu18.04 # Install APT packages. RUN apt-get update && \ apt-get install -y build-essential cmake COPY . /tensorpipe WORKDIR /tensorpipe ================================================ FILE: .circleci/Dockerfile.cuda9.2 ================================================ FROM nvidia/cuda:9.2-devel-ubuntu18.04 # Install APT packages. RUN apt-get update && \ apt-get install -y build-essential cmake COPY . /tensorpipe WORKDIR /tensorpipe ================================================ FILE: .circleci/config.yml ================================================ version: 2.1 jobs: build: parameters: docker_image: type: string default: "" apt_get: type: string default: "" c_compiler: type: string default: "" cxx_compiler: type: string default: "" cmake_args: type: string default: "" nproc: type: integer default: 20 docker: - image: << parameters.docker_image >> steps: - checkout - run: name: Install apt packages command: | apt-get update apt-get install -y git-core build-essential cmake << parameters.apt_get >> - run: name: Initialize submodules command: | git submodule init git submodule update - run: name: Build command: | mkdir build cd build cmake ../ \ -DCMAKE_C_FLAGS="-Werror -Wno-deprecated-declarations" \ -DCMAKE_CXX_FLAGS="-Werror -Wno-deprecated-declarations" \ -DCMAKE_C_COMPILER=<< parameters.c_compiler >> \ -DCMAKE_CXX_COMPILER=<< parameters.cxx_compiler >> \ -DTP_ENABLE_CMA=OFF \ 
-DTP_ENABLE_CUDA_IPC=OFF \ -DTP_ENABLE_IBV=OFF \ -DTP_BUILD_TESTING=ON \ << parameters.cmake_args >> make -j<> - run: name: Test command: | cd build ./tensorpipe/test/tensorpipe_test - run: name: Install command: | cd build make install build_gpu: parameters: cuda_version: type: string exclude_tests: type: string default: "" machine: resource_class: gpu.nvidia.small.multi image: ubuntu-1604-cuda-10.1:201909-23 docker_layer_caching: true steps: - checkout - run: name: Initialize submodules command: | git submodule init git submodule update - run: name: Build/test command: | docker build -t tensorpipe -f .circleci/Dockerfile.cuda<< parameters.cuda_version >> . docker run --gpus all --pid=host tensorpipe sh -c " mkdir build && cd build && cmake ../ \ -DCMAKE_C_FLAGS=\"-Werror -Wno-deprecated-declarations\" \ -DCMAKE_CXX_FLAGS=\"-Werror -Wno-deprecated-declarations\" \ -DCUDA_NVCC_FLAGS=\"-gencode arch=compute_61,code=sm_61\" \ -DTP_ENABLE_SHM=OFF \ -DTP_ENABLE_CMA=OFF \ -DTP_USE_CUDA=ON \ -DTP_ENABLE_CUDA_IPC=ON \ -DTP_ENABLE_IBV=OFF \ -DTP_BUILD_TESTING=ON && make -j20 && ./tensorpipe/test/tensorpipe_test --gtest_filter='-<< parameters.exclude_tests >>' && make install" bare_metal: parameters: image: type: string default: "" apt_get: type: string default: "" c_compiler: type: string default: "" cxx_compiler: type: string default: "" cmake_args: type: string default: "" nproc: type: integer default: 20 machine: image: << parameters.image >> steps: - checkout - run: name: Install apt packages command: | sudo apt-get update sudo apt-get install -y git-core build-essential cmake libibverbs1 rdma-core linux-modules-extra-$(uname -r) << parameters.apt_get >> - run: name: Initialize submodules command: | git submodule init git submodule update - run: name: Build command: | mkdir build cd build cmake ../ \ -DCMAKE_C_FLAGS="-Werror -Wno-deprecated-declarations" \ -DCMAKE_CXX_FLAGS="-Werror -Wno-deprecated-declarations" \ -DCMAKE_C_COMPILER=<< parameters.c_compiler >> \ 
-DCMAKE_CXX_COMPILER=<< parameters.cxx_compiler >> \ -DTP_ENABLE_CUDA_IPC=OFF \ -DTP_ENABLE_IBV=ON \ -DTP_BUILD_TESTING=ON \ << parameters.cmake_args >> make -j<> - run: name: Configure Soft-RoCE (RXE) InfiniBand interface command: | # Find the name of the first non-loopback IP interface INTERFACE_NAME=$(ip link | grep '^2: ' | sed -re 's/2: ([a-z0-9]+): .*/\1/') sudo rdma link add rxe0 type rxe netdev $INTERFACE_NAME - run: name: Test command: | cd build ./tensorpipe/test/tensorpipe_test - run: name: Test CMA channel autodetection with Docker command: | bash -eo pipefail tensorpipe/test/channel/cma/docker_tests.sh - run: name: Install command: | cd build sudo make install build_osx: macos: xcode: 12.4.0 steps: - checkout - run: name: Install homebrew packages command: | brew install cmake - run: name: Initialize submodules command: | git submodule init git submodule update - run: name: Build command: | mkdir build cd build cmake ../ \ -DCMAKE_C_FLAGS="-Werror -Wno-deprecated-declarations" \ -DCMAKE_CXX_FLAGS="-Werror -Wno-deprecated-declarations" \ -DTP_BUILD_TESTING=ON make -j - run: name: Test command: | cd build ./tensorpipe/test/tensorpipe_test - run: name: Install command: | cd build make install python: parameters: docker_image: type: string default: "" apt_get: type: string default: "" docker: - image: << parameters.docker_image >> steps: - checkout - run: name: Install apt packages command: | apt-get update apt-get install -y git-core build-essential cmake python3-dev python3-venv << parameters.apt_get >> - run: name: Initialize submodules command: | git submodule init git submodule update - run: name: Build command: | python3 -m venv venv source venv/bin/activate TP_ENABLE_CMA=OFF TP_ENABLE_CUDA_IPC=OFF TP_ENABLE_IBV=OFF python3 setup.py install - run: name: Test command: | source venv/bin/activate python3 tensorpipe/test/python/tensorpipe.py format: docker: - image: ubuntu:18.04 steps: - checkout - run: name: Install clang-format command: | apt-get 
update apt-get install -y git-core clang-format-10 - run: name: Verify clang-format command: | git ls-files | grep -E '\.(cc|h)$' | xargs clang-format-10 -i if git diff --quiet; then echo "Formatting OK!" else echo "Formatting not OK!" echo "------------------" git --no-pager diff --color exit 1 fi workflows: build: jobs: - build: name: gcc5 docker_image: ubuntu:18.04 apt_get: "gcc-5 g++-5" c_compiler: gcc-5 cxx_compiler: g++-5 - build: name: gcc7 docker_image: ubuntu:18.04 apt_get: "gcc-7 g++-7" c_compiler: gcc-7 cxx_compiler: g++-7 - build: name: clang6 docker_image: ubuntu:18.04 apt_get: "clang-6.0" c_compiler: clang-6.0 cxx_compiler: clang++-6.0 - build: name: gcc7-asan docker_image: ubuntu:18.04 apt_get: "gcc-7 g++-7" c_compiler: gcc-7 cxx_compiler: g++-7 cmake_args: -DSANITIZE=address - build: name: gcc7-tsan docker_image: ubuntu:18.04 apt_get: "gcc-7 g++-7" c_compiler: gcc-7 cxx_compiler: g++-7 cmake_args: -DSANITIZE=thread - bare_metal: name: bare-metal image: ubuntu-2004:202008-01 apt_get: "gcc-7 g++-7" c_compiler: gcc-7 cxx_compiler: g++-7 - build_gpu: name: GPU (CUDA 9.2) cuda_version: "9.2" # Excluding CudaGdr for lack of InfiniBand hardware, and CudaIpc on # multi GPU for lack of p2p capabilities. exclude_tests: "CudaGdr*:CudaIpc/CudaMultiGPUChannelTestSuite*" - build_gpu: name: GPU (CUDA 10.1) cuda_version: "10.1" # Excluding CudaGdr for lack of InfiniBand hardware, and CudaIpc on # multi GPU for lack of p2p capabilities. exclude_tests: "CudaGdr*:CudaIpc/CudaMultiGPUChannelTestSuite*" - build_gpu: name: GPU (CUDA 10.2) cuda_version: "10.2" # Excluding CudaGdr for lack of InfiniBand hardware, and CudaIpc on # multi GPU for lack of p2p capabilities. exclude_tests: "CudaGdr*:CudaIpc/CudaMultiGPUChannelTestSuite*" - build_gpu: name: GPU (CUDA 11.0) cuda_version: "11.0" # Excluding CudaGdr for lack of InfiniBand hardware, and CudaIpc on # multi GPU for lack of p2p capabilities. 
exclude_tests: "CudaGdr*:CudaIpc/CudaMultiGPUChannelTestSuite*" - build_gpu: name: GPU (CUDA 11.1) cuda_version: "11.1" # Excluding CudaGdr for lack of InfiniBand hardware, and CudaIpc on # multi GPU for lack of p2p capabilities, and CudaBasic/CudaMultiGPUChannelTestSuite.SendAcrossNonDefaultDevices/0 # because it does not work with CUDA 11.1 (cf. https://github.com/pytorch/tensorpipe/issues/368). exclude_tests: "CudaGdr*:CudaIpc/CudaMultiGPUChannelTestSuite*:CudaBasic/CudaMultiGPUChannelTestSuite.SendAcrossNonDefaultDevices/0" - build_osx: name: OSX - python: name: python docker_image: ubuntu:18.04 apt_get: "clang-6.0" - format: name: clang-format ================================================ FILE: .gitignore ================================================ *~ .DS_Store /build/ /cmake-build-debug/ ================================================ FILE: .gitmodules ================================================ [submodule "third_party/pybind11"] path = third_party/pybind11 url = https://github.com/pybind/pybind11.git [submodule "third_party/libuv"] path = third_party/libuv url = https://github.com/libuv/libuv.git branch = v1.x [submodule "third_party/googletest"] path = third_party/googletest url = https://github.com/google/googletest.git [submodule "third_party/libnop"] path = third_party/libnop url = https://github.com/google/libnop.git ================================================ FILE: CMakeLists.txt ================================================ # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. cmake_minimum_required(VERSION 3.18 FATAL_ERROR) project(tensorpipe LANGUAGES C CXX) set(CMAKE_CXX_STANDARD 17) list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") # Expose build options. include(Options) # Define sanitizer option, if specified. 
include(Sanitize) # Misc checks to cope with various compiler modes. include(MiscCheck) add_subdirectory(tensorpipe) install(EXPORT TensorpipeTargets DESTINATION share/cmake/Tensorpipe FILE TensorpipeTargets.cmake) ================================================ FILE: CODE_OF_CONDUCT.md ================================================ # Code of Conduct ## Our Pledge In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to make participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation. ## Our Standards Examples of behavior that contributes to creating a positive environment include: * Using welcoming and inclusive language * Being respectful of differing viewpoints and experiences * Gracefully accepting constructive criticism * Focusing on what is best for the community * Showing empathy towards other community members Examples of unacceptable behavior by participants include: * The use of sexualized language or imagery and unwelcome sexual attention or advances * Trolling, insulting/derogatory comments, and personal or political attacks * Public or private harassment * Publishing others' private information, such as a physical or electronic address, without explicit permission * Other conduct which could reasonably be considered inappropriate in a professional setting ## Our Responsibilities Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 
Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. ## Scope This Code of Conduct applies within all project spaces, and it also applies when an individual is representing the project or its community in public spaces. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. ## Enforcement Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at . All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 
## Attribution This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html [homepage]: https://www.contributor-covenant.org For answers to common questions about this code of conduct, see https://www.contributor-covenant.org/faq ================================================ FILE: CONTRIBUTING.md ================================================ # Contributing to TensorPipe We want to make contributing to this project as easy and transparent as possible. ## Our Development Process This project's source-of-truth is the version in Facebook's internal codebase, which is continuously synced with the GitHub mirror using [ShipIt](https://github.com/facebook/fbshipit). Pull requests on GitHub are copied over using ImportIt (a companion tool for ShipIt). ## Pull Requests We actively welcome your pull requests. 1. Fork the repo and create your branch from `main`. 2. If you've added code that should be tested, add tests. 3. If you've changed APIs, update the documentation. 4. Ensure the test suite passes. 5. Make sure your code lints. 6. If you haven't already, complete the Contributor License Agreement ("CLA"). ## Contributor License Agreement ("CLA") In order to accept your pull request, we need you to submit a CLA. You only need to do this once to work on any of Facebook's open source projects. Complete your CLA here: ## Issues We use GitHub issues to track public bugs. Please ensure your description is clear and has sufficient instructions to be able to reproduce the issue. Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe disclosure of security bugs. In those cases, please go through the process outlined on that page and do not file a public issue. ## Coding Style This source code is formatted using `clang-format`, with project-specific rules recorded in the `.clang-format` file. 
## License By contributing to TensorPipe, you agree that your contributions will be licensed under the LICENSE.txt file in the root directory of this source tree. ================================================ FILE: LICENSE.txt ================================================ BSD License For TensorPipe software Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name Meta nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
================================================ FILE: README.md ================================================ # TensorPipe The TensorPipe project provides a tensor-aware channel to transfer rich objects from one process to another while using the fastest transport for the tensors contained therein (e.g., CUDA device-to-device copy). > :warning: Update (2025-12) tensorpipe is in maintenance mode and no new changes are planned beyond minimal build fixes. Please see https://github.com/meta-pytorch/torchcomms and https://github.com/meta-pytorch/monarch for alternatives. ## Getting started First clone the repository: ```shell $ git clone --recursive https://github.com/pytorch/tensorpipe ``` Then, build as follows (using ninja instead of make): ``` shell $ cd tensorpipe $ mkdir build $ cd build $ cmake ../ -GNinja $ ninja ``` You can find test executables in `build/tensorpipe/test`. ## Interface There are four classes you need to know about: - `tensorpipe::Context`, which keeps track of the global state of the system, such as thread pools, open file descriptors, etc. - `tensorpipe::Listener`, which allows one process to open an entry point for other processes to connect to. - `tensorpipe::Pipe`, the one communication primitive that this entire project is about. You can obtain one either by connecting to the listener of another process or from such a listener when another process connects to it. Once you have a pipe, you can send messages on it, and that's the whole point. - `tensorpipe::Message`, which is the language that pipes read and write in. Pipes are streams of structured messages (not just raw byte buffers), and a message is composed of a "core" payload (memory living on CPU) plus a list of tensors (memory living on any device, like GPUs). Sending a message from one end of the pipe to the other can be achieved using the `write` method, which takes a message (with the data to send) and a callback which will be invoked once the sending has completed. 
This callback will be invoked with an error (if one happened) and with the message. Receiving a message takes two steps: on an incoming message, first the pipe asks you to provide some memory to hold the message in, and then you ask the pipe to read the data into that memory. In order to do this, first you must register a callback that will be notified for incoming messages. This is performed by calling the `readDescriptor` method with said callback. The callback will be invoked with a so-called descriptor, which can be seen as a "message skeleton", i.e., a message with no buffers attached to it (they are set to null pointers). The job of this callback is filling in those buffers, either by allocating the required memory or by obtaining it from somewhere else (from a cache, as a slice of a batch that's being assembled, ...). This descriptor also contains some metadata, given by the sender, which can be used to provide allocation hints or any other information that can help the receiver determine where to store the data. Once the message's buffers are ready, you can tell the pipe to go ahead and fill them in with the incoming data by passing the message to the `read` method, together with a callback which will be called when all the data has been received and stored. As when writing, this callback will be given a (possibly empty) error and the original message. The `readDescriptor` callback is one-shot, which means that after it fires it "expires" and will not be called again. It must be re-armed for a new event to be received. When you pass a message to the pipe, to send it or to receive into it, you must not tamper with the underlying memory until the callback has completed, even if the `write` or `read` call already returned. (The `write` and `read` calls, and all other calls, are non-blocking so that it's easier to schedule asynchronous parallel transfers without having to use threads). 
This means you can not deallocate the memory or alter it in any way, as the pipe may still be reading or modifying it. In other terms, you relinquish control over the memory when you pass a message to the pipe, only to reacquire it once the message is given back to you in the callback. This contract is encoded by the requirement to move the messages into and out of the pipe (using rvalue references). Also, because of this agreement, all callbacks will always be called, even if the pipe is closed or if it errors, in order to give back the memory. The order in which messages are written to a pipe is preserved when these messages are read on the other side. Moreover, for a given pipe endpoint, the callbacks of the performed operations are executed in the same order that these operations were scheduled, even if the operations are performed asynchronously or out-of-band and thus may overlap or occur out of order. What this means is that if two write operations are scheduled one after the other back-to-back, even if the second one completes before the first one, its callback is delayed until the first one also completes and its callback is invoked. The same applies for reads. All the callbacks of all the pipes in a given context are called from the same per-context thread and thus no two callbacks will occur at the same time. However, different contexts will use different threads and their callbacks may thus overlap. All the callbacks are invoked with an error reference. This may be "empty", i.e., indicate that no error has in fact occurred. In this case, the error object evaluates to false. In case of an actual error it will instead evaluate to true. When invoked with an error, the remaining arguments of the callback may be meaningless. For the `read` and `write` callbacks they will still contain the message that these methods will be invoked with, but the `readDescriptor` one will be an empty or invalid message. It should not be used. 
There is no expectation for the `readDescriptor` callback to be armed at all times. Similarly, it is not necessary to call the `read` method immediately after a descriptor has been read. Both these possibilities are by design, in order to allow the user of the pipe to apply some backpressure in case it's receiving messages at a faster rate than it can handle, or for any other reason. This backpressure will be propagated to the lower-level components as far down as possible (e.g., by stopping listening for readability events on the socket file descriptor). ## Transports and channels TensorPipe aims to be "backend-agnostic": it doesn't want to be restricted to a single way of copying data around but wants to be able to choose the fastest medium from a library of backends, based on the circumstances (e.g., are the two processes on the same machine?) and on the available hardware (e.g., are the GPUs connected with NVLink?). TensorPipe strives to have the largest selection of backends, enabling users to implement specific backends for their systems (should the default ones prove limited) and encouraging contributions. The two processes that are establishing a pipe will automatically negotiate during setup to determine which of the backends they have at their disposal can be used and how well they would perform, in order to choose the best one in a way that is completely transparent to the user. Backends come in two flavors: - Transports are the connections used by the pipes to transfer control messages, and the (smallish) core payloads. They are meant to be lightweight and low-latency. The most basic transport is a simple TCP one, which should work in all scenarios. A more optimized one, for example, is based on a ring buffer allocated in shared memory, which two processes on the same machine can use to communicate by performing just a memory copy, without passing through the kernel. 
- Channels are where the heavy lifting takes place, as they take care of
  copying the (larger) tensor data. High bandwidths are a requirement. Examples
  include multiplexing chunks of data across multiple TCP sockets and
  processes, so as to saturate the NIC's bandwidth. Or using a CUDA memcpy call
  to transfer memory from one GPU to another using NVLink.

These different usage patterns promote different design choices when
implementing transports and channels, which means the two are not perfectly
interchangeable. For example, a TCP-based transport is best implemented using a
single connection, whereas a TCP-based channel will benefit from using multiple
connections, chunking and multiplexing the payload over them in order to
saturate the bandwidth even on the most powerful NICs. Moreover, the APIs of
transports and channels put different constraints on them, which demand and
permit different approaches. As a rule of thumb, we require more from the
transports: the only out-of-band information they can use is a simple address,
which is all they can use to bootstrap the connection, and they need to include
some "signaling" capabilities (a write on one side "wakes up" the other side by
causing a read). Channels, on the other hand, have much looser requirements:
they basically just need to implement a `memcpy` and, for anything beyond that,
they can leverage a transport that the pipe gives to them for support.

## License

TensorPipe is BSD licensed, as found in the [LICENSE.txt](LICENSE.txt) file.

================================================ FILE: cmake/FindPackageHandleStandardArgs.cmake ================================================

# Copyright 2000-2020 Kitware, Inc. and Contributors
# All rights reserved.
#
# Distributed under the OSI-approved BSD 3-Clause License. See
# https://cmake.org/licensing for details.
#[=======================================================================[.rst: FindPackageHandleStandardArgs ----------------------------- This module provides a function intended to be used in :ref:`Find Modules` implementing :command:`find_package()` calls. It handles the ``REQUIRED``, ``QUIET`` and version-related arguments of ``find_package``. It also sets the ``_FOUND`` variable. The package is considered found if all variables listed contain valid results, e.g. valid filepaths. .. command:: find_package_handle_standard_args There are two signatures:: find_package_handle_standard_args( (DEFAULT_MSG|) ... ) find_package_handle_standard_args( [FOUND_VAR ] [REQUIRED_VARS ...] [VERSION_VAR ] [HANDLE_COMPONENTS] [CONFIG_MODE] [FAIL_MESSAGE ] ) The ``_FOUND`` variable will be set to ``TRUE`` if all the variables ``...`` are valid and any optional constraints are satisfied, and ``FALSE`` otherwise. A success or failure message may be displayed based on the results and on whether the ``REQUIRED`` and/or ``QUIET`` option was given to the :command:`find_package` call. The options are: ``(DEFAULT_MSG|)`` In the simple signature this specifies the failure message. Use ``DEFAULT_MSG`` to ask for a default message to be computed (recommended). Not valid in the full signature. ``FOUND_VAR `` Obsolete. Specifies either ``_FOUND`` or ``_FOUND`` as the result variable. This exists only for compatibility with older versions of CMake and is now ignored. Result variables of both names are always set for compatibility. ``REQUIRED_VARS ...`` Specify the variables which are required for this package. These may be named in the generated failure message asking the user to set the missing variable values. Therefore these should typically be cache entries such as ``FOO_LIBRARY`` and not output variables like ``FOO_LIBRARIES``. ``VERSION_VAR `` Specify the name of a variable that holds the version of the package that has been found. 
This version will be checked against the (potentially) specified required version given to the :command:`find_package` call, including its ``EXACT`` option. The default messages include information about the required version and the version which has been actually found, both if the version is ok or not. ``HANDLE_COMPONENTS`` Enable handling of package components. In this case, the command will report which components have been found and which are missing, and the ``_FOUND`` variable will be set to ``FALSE`` if any of the required components (i.e. not the ones listed after the ``OPTIONAL_COMPONENTS`` option of :command:`find_package`) are missing. ``CONFIG_MODE`` Specify that the calling find module is a wrapper around a call to ``find_package( NO_MODULE)``. This implies a ``VERSION_VAR`` value of ``_VERSION``. The command will automatically check whether the package configuration file was found. ``FAIL_MESSAGE `` Specify a custom failure message instead of using the default generated message. Not recommended. Example for the simple signature: .. code-block:: cmake find_package_handle_standard_args(LibXml2 DEFAULT_MSG LIBXML2_LIBRARY LIBXML2_INCLUDE_DIR) The ``LibXml2`` package is considered to be found if both ``LIBXML2_LIBRARY`` and ``LIBXML2_INCLUDE_DIR`` are valid. Then also ``LibXml2_FOUND`` is set to ``TRUE``. If it is not found and ``REQUIRED`` was used, it fails with a :command:`message(FATAL_ERROR)`, independent whether ``QUIET`` was used or not. If it is found, success will be reported, including the content of the first ````. On repeated CMake runs, the same message will not be printed again. Example for the full signature: .. code-block:: cmake find_package_handle_standard_args(LibArchive REQUIRED_VARS LibArchive_LIBRARY LibArchive_INCLUDE_DIR VERSION_VAR LibArchive_VERSION) In this case, the ``LibArchive`` package is considered to be found if both ``LibArchive_LIBRARY`` and ``LibArchive_INCLUDE_DIR`` are valid. 
Also the version of ``LibArchive`` will be checked by using the version
contained in ``LibArchive_VERSION``. Since no ``FAIL_MESSAGE`` is given,
the default messages will be printed.

Another example for the full signature:

.. code-block:: cmake

  find_package(Automoc4 QUIET NO_MODULE HINTS /opt/automoc4)
  find_package_handle_standard_args(Automoc4 CONFIG_MODE)

In this case, a ``FindAutmoc4.cmake`` module wraps a call to
``find_package(Automoc4 NO_MODULE)`` and adds an additional search
directory for ``automoc4``. Then the call to
``find_package_handle_standard_args`` produces a proper success/failure
message.
#]=======================================================================]

include(${CMAKE_CURRENT_LIST_DIR}/FindPackageMessage.cmake)

# internal helper macro
# Reports a failure: fatal if the package was REQUIRED, a status message
# otherwise (suppressed under QUIET).
macro(_FPHSA_FAILURE_MESSAGE _msg)
  if (${_NAME}_FIND_REQUIRED)
    message(FATAL_ERROR "${_msg}")
  else ()
    if (NOT ${_NAME}_FIND_QUIETLY)
      message(STATUS "${_msg}")
    endif ()
  endif ()
endmacro()

# internal helper macro to generate the failure message when used in CONFIG_MODE:
macro(_FPHSA_HANDLE_FAILURE_CONFIG_MODE)
  # _CONFIG is set, but FOUND is false, this means that some other of the REQUIRED_VARS was not found:
  if(${_NAME}_CONFIG)
    _FPHSA_FAILURE_MESSAGE("${FPHSA_FAIL_MESSAGE}: missing:${MISSING_VARS} (found ${${_NAME}_CONFIG} ${VERSION_MSG})")
  else()
    # If _CONSIDERED_CONFIGS is set, the config-file has been found, but no suitable version.
    # List them all in the error message:
    if(${_NAME}_CONSIDERED_CONFIGS)
      set(configsText "")
      list(LENGTH ${_NAME}_CONSIDERED_CONFIGS configsCount)
      math(EXPR configsCount "${configsCount} - 1")
      foreach(currentConfigIndex RANGE ${configsCount})
        list(GET ${_NAME}_CONSIDERED_CONFIGS ${currentConfigIndex} filename)
        list(GET ${_NAME}_CONSIDERED_VERSIONS ${currentConfigIndex} version)
        # FIX: this line previously appended the literal text "$(unknown)",
        # leaving the `filename` variable retrieved above unused; interpolate
        # the config file name as the upstream CMake module does.
        string(APPEND configsText "    ${filename} (version ${version})\n")
      endforeach()
      if (${_NAME}_NOT_FOUND_MESSAGE)
        string(APPEND configsText "    Reason given by package: ${${_NAME}_NOT_FOUND_MESSAGE}\n")
      endif()
      _FPHSA_FAILURE_MESSAGE("${FPHSA_FAIL_MESSAGE} ${VERSION_MSG}, checked the following files:\n${configsText}")
    else()
      # Simple case: No Config-file was found at all:
      _FPHSA_FAILURE_MESSAGE("${FPHSA_FAIL_MESSAGE}: found neither ${_NAME}Config.cmake nor ${_NAME_LOWER}-config.cmake ${VERSION_MSG}")
    endif()
  endif()
endmacro()

function(FIND_PACKAGE_HANDLE_STANDARD_ARGS _NAME _FIRST_ARG)

  # Set up the arguments for `cmake_parse_arguments`.
  set(options CONFIG_MODE HANDLE_COMPONENTS)
  set(oneValueArgs FAIL_MESSAGE VERSION_VAR FOUND_VAR)
  set(multiValueArgs REQUIRED_VARS)

  # Check whether we are in 'simple' or 'extended' mode:
  set(_KEYWORDS_FOR_EXTENDED_MODE ${options} ${oneValueArgs} ${multiValueArgs} )
  list(FIND _KEYWORDS_FOR_EXTENDED_MODE "${_FIRST_ARG}" INDEX)

  if(${INDEX} EQUAL -1)
    # Simple signature: first arg is the fail message, rest are required vars.
    set(FPHSA_FAIL_MESSAGE ${_FIRST_ARG})
    set(FPHSA_REQUIRED_VARS ${ARGN})
    set(FPHSA_VERSION_VAR)
  else()
    # Full signature: parse keyword arguments.
    cmake_parse_arguments(FPHSA "${options}" "${oneValueArgs}" "${multiValueArgs}" ${_FIRST_ARG} ${ARGN})

    if(FPHSA_UNPARSED_ARGUMENTS)
      message(FATAL_ERROR "Unknown keywords given to FIND_PACKAGE_HANDLE_STANDARD_ARGS(): \"${FPHSA_UNPARSED_ARGUMENTS}\"")
    endif()

    if(NOT FPHSA_FAIL_MESSAGE)
      set(FPHSA_FAIL_MESSAGE "DEFAULT_MSG")
    endif()

    # In config-mode, we rely on the variable _CONFIG, which is set by find_package()
    # when it successfully found the config-file, including version checking:
    if(FPHSA_CONFIG_MODE)
      list(INSERT FPHSA_REQUIRED_VARS 0 ${_NAME}_CONFIG)
      list(REMOVE_DUPLICATES FPHSA_REQUIRED_VARS)
      set(FPHSA_VERSION_VAR ${_NAME}_VERSION)
    endif()

    if(NOT FPHSA_REQUIRED_VARS)
      message(FATAL_ERROR "No REQUIRED_VARS specified for FIND_PACKAGE_HANDLE_STANDARD_ARGS()")
    endif()
  endif()

  # now that we collected all arguments, process them

  if("x${FPHSA_FAIL_MESSAGE}" STREQUAL "xDEFAULT_MSG")
    set(FPHSA_FAIL_MESSAGE "Could NOT find ${_NAME}")
  endif()

  list(GET FPHSA_REQUIRED_VARS 0 _FIRST_REQUIRED_VAR)

  string(TOUPPER ${_NAME} _NAME_UPPER)
  string(TOLOWER ${_NAME} _NAME_LOWER)

  if(FPHSA_FOUND_VAR)
    # Only <name>_FOUND or <NAME>_FOUND are accepted; both are set anyway.
    if(FPHSA_FOUND_VAR MATCHES "^${_NAME}_FOUND$" OR FPHSA_FOUND_VAR MATCHES "^${_NAME_UPPER}_FOUND$")
      set(_FOUND_VAR ${FPHSA_FOUND_VAR})
    else()
      message(FATAL_ERROR "The argument for FOUND_VAR is \"${FPHSA_FOUND_VAR}\", but only \"${_NAME}_FOUND\" and \"${_NAME_UPPER}_FOUND\" are valid names.")
    endif()
  else()
    set(_FOUND_VAR ${_NAME_UPPER}_FOUND)
  endif()

  # collect all variables which were not found, so they can be printed, so the
  # user knows better what went wrong (#6375)
  set(MISSING_VARS "")
  set(DETAILS "")
  # check if all passed variables are valid
  set(FPHSA_FOUND_${_NAME} TRUE)
  foreach(_CURRENT_VAR ${FPHSA_REQUIRED_VARS})
    if(NOT ${_CURRENT_VAR})
      set(FPHSA_FOUND_${_NAME} FALSE)
      string(APPEND MISSING_VARS " ${_CURRENT_VAR}")
    else()
      string(APPEND DETAILS "[${${_CURRENT_VAR}}]")
    endif()
  endforeach()
  if(FPHSA_FOUND_${_NAME})
    set(${_NAME}_FOUND TRUE)
    set(${_NAME_UPPER}_FOUND TRUE)
  else()
    set(${_NAME}_FOUND FALSE)
    set(${_NAME_UPPER}_FOUND FALSE)
  endif()

  # component handling
  unset(FOUND_COMPONENTS_MSG)
  unset(MISSING_COMPONENTS_MSG)

  if(FPHSA_HANDLE_COMPONENTS)
    foreach(comp ${${_NAME}_FIND_COMPONENTS})
      if(${_NAME}_${comp}_FOUND)
        if(NOT DEFINED FOUND_COMPONENTS_MSG)
          set(FOUND_COMPONENTS_MSG "found components: ")
        endif()
        string(APPEND FOUND_COMPONENTS_MSG " ${comp}")
      else()
        if(NOT DEFINED MISSING_COMPONENTS_MSG)
          set(MISSING_COMPONENTS_MSG "missing components: ")
        endif()
        string(APPEND MISSING_COMPONENTS_MSG " ${comp}")
        # Only required (non-OPTIONAL_COMPONENTS) components affect FOUND.
        if(${_NAME}_FIND_REQUIRED_${comp})
          set(${_NAME}_FOUND FALSE)
          string(APPEND MISSING_VARS " ${comp}")
        endif()
      endif()
    endforeach()
    set(COMPONENT_MSG "${FOUND_COMPONENTS_MSG} ${MISSING_COMPONENTS_MSG}")
    string(APPEND DETAILS "[c${COMPONENT_MSG}]")
  endif()

  # version handling:
  set(VERSION_MSG "")
  set(VERSION_OK TRUE)

  # check with DEFINED here as the requested or found version may be "0"
  if (DEFINED ${_NAME}_FIND_VERSION)
    if(DEFINED ${FPHSA_VERSION_VAR})
      set(_FOUND_VERSION ${${FPHSA_VERSION_VAR}})

      if(${_NAME}_FIND_VERSION_EXACT)       # exact version required
        # count the dots in the version string
        string(REGEX REPLACE "[^.]" "" _VERSION_DOTS "${_FOUND_VERSION}")
        # add one dot because there is one dot more than there are components
        string(LENGTH "${_VERSION_DOTS}." _VERSION_DOTS)
        if (_VERSION_DOTS GREATER ${_NAME}_FIND_VERSION_COUNT)
          # Because of the C++ implementation of find_package() ${_NAME}_FIND_VERSION_COUNT
          # is at most 4 here. Therefore a simple lookup table is used.
          if (${_NAME}_FIND_VERSION_COUNT EQUAL 1)
            set(_VERSION_REGEX "[^.]*")
          elseif (${_NAME}_FIND_VERSION_COUNT EQUAL 2)
            set(_VERSION_REGEX "[^.]*\\.[^.]*")
          elseif (${_NAME}_FIND_VERSION_COUNT EQUAL 3)
            set(_VERSION_REGEX "[^.]*\\.[^.]*\\.[^.]*")
          else ()
            set(_VERSION_REGEX "[^.]*\\.[^.]*\\.[^.]*\\.[^.]*")
          endif ()
          string(REGEX REPLACE "^(${_VERSION_REGEX})\\..*" "\\1" _VERSION_HEAD "${_FOUND_VERSION}")
          unset(_VERSION_REGEX)
          if (NOT ${_NAME}_FIND_VERSION VERSION_EQUAL _VERSION_HEAD)
            set(VERSION_MSG "Found unsuitable version \"${_FOUND_VERSION}\", but required is exact version \"${${_NAME}_FIND_VERSION}\"")
            set(VERSION_OK FALSE)
          else ()
            set(VERSION_MSG "(found suitable exact version \"${_FOUND_VERSION}\")")
          endif ()
          unset(_VERSION_HEAD)
        else ()
          if (NOT ${_NAME}_FIND_VERSION VERSION_EQUAL _FOUND_VERSION)
            set(VERSION_MSG "Found unsuitable version \"${_FOUND_VERSION}\", but required is exact version \"${${_NAME}_FIND_VERSION}\"")
            set(VERSION_OK FALSE)
          else ()
            set(VERSION_MSG "(found suitable exact version \"${_FOUND_VERSION}\")")
          endif ()
        endif ()
        unset(_VERSION_DOTS)
      else()     # minimum version specified:
        if (${_NAME}_FIND_VERSION VERSION_GREATER _FOUND_VERSION)
          set(VERSION_MSG "Found unsuitable version \"${_FOUND_VERSION}\", but required is at least \"${${_NAME}_FIND_VERSION}\"")
          set(VERSION_OK FALSE)
        else ()
          set(VERSION_MSG "(found suitable version \"${_FOUND_VERSION}\", minimum required is \"${${_NAME}_FIND_VERSION}\")")
        endif ()
      endif()
    else()
      # if the package was not found, but a version was given, add that to the output:
      if(${_NAME}_FIND_VERSION_EXACT)
        set(VERSION_MSG "(Required is exact version \"${${_NAME}_FIND_VERSION}\")")
      else()
        set(VERSION_MSG "(Required is at least version \"${${_NAME}_FIND_VERSION}\")")
      endif()
    endif()
  else ()
    # Check with DEFINED as the found version may be 0.
    if(DEFINED ${FPHSA_VERSION_VAR})
      set(VERSION_MSG "(found version \"${${FPHSA_VERSION_VAR}}\")")
    endif()
  endif ()

  if(VERSION_OK)
    string(APPEND DETAILS "[v${${FPHSA_VERSION_VAR}}(${${_NAME}_FIND_VERSION})]")
  else()
    set(${_NAME}_FOUND FALSE)
  endif()

  # print the result:
  if (${_NAME}_FOUND)
    FIND_PACKAGE_MESSAGE(${_NAME} "Found ${_NAME}: ${${_FIRST_REQUIRED_VAR}} ${VERSION_MSG} ${COMPONENT_MSG}" "${DETAILS}")
  else ()
    if(FPHSA_CONFIG_MODE)
      _FPHSA_HANDLE_FAILURE_CONFIG_MODE()
    else()
      if(NOT VERSION_OK)
        _FPHSA_FAILURE_MESSAGE("${FPHSA_FAIL_MESSAGE}: ${VERSION_MSG} (found ${${_FIRST_REQUIRED_VAR}})")
      else()
        _FPHSA_FAILURE_MESSAGE("${FPHSA_FAIL_MESSAGE} (missing:${MISSING_VARS}) ${VERSION_MSG}")
      endif()
    endif()
  endif ()

  # Propagate the result variables to the caller's scope.
  set(${_NAME}_FOUND ${${_NAME}_FOUND} PARENT_SCOPE)
  set(${_NAME_UPPER}_FOUND ${${_NAME}_FOUND} PARENT_SCOPE)
endfunction()

================================================ FILE: cmake/FindPackageMessage.cmake ================================================

# Copyright 2000-2020 Kitware, Inc. and Contributors
# All rights reserved.
#
# Distributed under the OSI-approved BSD 3-Clause License. See
# https://cmake.org/licensing for details.

#.rst:
# FindPackageMessage
# ------------------
#
#
#
# FIND_PACKAGE_MESSAGE( "message for user" "find result details")
#
# This macro is intended to be used in FindXXX.cmake modules files.  It
# will print a message once for each unique find result.  This is useful
# for telling the user where a package was found.  The first argument
# specifies the name (XXX) of the package.  The second argument
# specifies the message to display.  The third argument lists details
# about the find result so that if they change the message will be
# displayed again.  The macro also obeys the QUIET argument to the
# find_package command.
#
# Example:
#
# ::
#
#   if(X11_FOUND)
#     FIND_PACKAGE_MESSAGE(X11 "Found X11: ${X11_X11_LIB}"
#       "[${X11_X11_LIB}][${X11_INCLUDE_DIR}]")
#   else()
#    ...
#   endif()

function(FIND_PACKAGE_MESSAGE pkg msg details)
  # Avoid printing a message repeatedly for the same find result.
  if(NOT ${pkg}_FIND_QUIETLY)
    string(REPLACE "\n" "" details "${details}")
    set(DETAILS_VAR FIND_PACKAGE_MESSAGE_DETAILS_${pkg})
    if(NOT "${details}" STREQUAL "${${DETAILS_VAR}}")
      # The message has not yet been printed.
      message(STATUS "${msg}")
      # Save the find details in the cache to avoid printing the same
      # message again.
      set("${DETAILS_VAR}" "${details}"
          CACHE INTERNAL "Details about finding ${pkg}")
    endif()
  endif()
endfunction()

================================================ FILE: cmake/Finduv.cmake ================================================

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
#
# Finduv
# ------
#
# Imported Targets
# ^^^^^^^^^^^^^^^^
#
# An imported target named ``uv::uv`` is provided if libuv has been found.
#
# Result Variables
# ^^^^^^^^^^^^^^^^
#
# This module defines the following variables:
#
# ``uv_FOUND``
#   True if libuv was found, false otherwise.
# ``uv_LIBRARY_DIRS``
#   The path(s) to uv libraries.
# ``uv_VERSION``
#   The version of libuv found.
#

# First, try to find a system libuv via pkg-config, unless the user explicitly
# forced the bundled submodule with TP_BUILD_LIBUV.
find_package(PkgConfig QUIET)
if((NOT TP_BUILD_LIBUV) AND PkgConfig_FOUND)
  pkg_check_modules(uv QUIET IMPORTED_TARGET GLOBAL libuv)
  if(uv_FOUND)
    add_library(uv::uv ALIAS PkgConfig::uv)
  endif()
endif()

# Fall back to building libuv from the vendored submodule.
if(NOT uv_FOUND)
  set(uv_VERSION "1.51.0")
  set(uv_LIBRARY_DIRS "submodule")
  set(libuv_DIR ${PROJECT_SOURCE_DIR}/third_party/libuv)
  add_subdirectory(${libuv_DIR} ${PROJECT_BINARY_DIR}/third_party/libuv
                   EXCLUDE_FROM_ALL)
  # This hack duplicates the `uv_a` target, so that we can call
  # install(TARGETS ... EXPORT) on it, which is not possible when the target is
  # defined in a subdirectory in CMake 3.5.
  get_target_property(_uv_sources uv_a SOURCES)
  set(_uv_sources_abs)
  foreach(_uv_src ${_uv_sources})
    list(APPEND _uv_sources_abs "${libuv_DIR}/${_uv_src}")
  endforeach()
  add_library(tensorpipe_uv STATIC ${_uv_sources_abs})
  if(BUILD_SHARED_LIBS)
    set_target_properties(tensorpipe_uv PROPERTIES POSITION_INDEPENDENT_CODE 1)
  endif()
  # Mirror uv_a's link libraries, include dirs, definitions and options onto
  # the duplicated target.
  get_target_property(_link_libs uv_a LINK_LIBRARIES)
  target_link_libraries(tensorpipe_uv PRIVATE ${_link_libs})
  get_target_property(_include_dirs uv_a INCLUDE_DIRECTORIES)
  target_include_directories(tensorpipe_uv PRIVATE ${_include_dirs})
  # FIX: this line previously read `target_include_directories(tensorpipe_uv
  # PUBLIC $)` — a bare `$` is not a valid argument; restore the
  # BUILD_INTERFACE generator expression exposing libuv's public headers to
  # consumers of the build tree.
  target_include_directories(tensorpipe_uv PUBLIC
                             $<BUILD_INTERFACE:${libuv_DIR}/include>)
  get_target_property(_compile_definitions uv_a COMPILE_DEFINITIONS)
  target_compile_definitions(tensorpipe_uv PRIVATE ${_compile_definitions})
  get_target_property(_compile_options uv_a COMPILE_OPTIONS)
  target_compile_options(tensorpipe_uv PRIVATE ${_compile_options})
  install(TARGETS tensorpipe_uv EXPORT TensorpipeTargets
          ARCHIVE DESTINATION ${TP_INSTALL_LIBDIR})
  add_library(uv::uv ALIAS tensorpipe_uv)
endif()

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(uv
  REQUIRED_VARS uv_VERSION
  VERSION_VAR uv_VERSION)

================================================ FILE: cmake/MiscCheck.cmake ================================================

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

include(CheckCXXSourceCompiles)
include(CMakePushCheckState)

# We use the [[nodiscard]] attribute, which GCC 5 complains about.
# Silence this warning if GCC 5 is used.
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
  if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 6)
    add_definitions("-Wno-attributes")
  endif()
endif()

================================================ FILE: cmake/Options.cmake ================================================

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
  set(LINUX ON)
else()
  set(LINUX OFF)
endif()

# Declares a cache option `name` for a backend that is only available when
# `condition` (a space-separated boolean expression) holds, defaulting it to
# whether the condition is satisfied on this system.
macro(TP_CONDITIONAL_BACKEND name docstring condition)
  # No clue why this monstrosity is needed. But cmake_dependent_option has it,
  # and the code doesn't seem to work without it.
  # NOTE(review): the REGEX REPLACE appears to turn the space-separated
  # condition string into a semicolon-separated CMake list so that if() can
  # evaluate it as an expression, mirroring what cmake_dependent_option does
  # — confirm against the CMakeDependentOption module.
  string(REGEX REPLACE " +" ";" TP_CONDITIONAL_BACKEND_CONDITION "${condition}")
  if(${TP_CONDITIONAL_BACKEND_CONDITION})
    set(TP_CONDITIONAL_BACKEND_CAN_ENABLE ON)
  else()
    set(TP_CONDITIONAL_BACKEND_CAN_ENABLE OFF)
  endif()
  # Expose the backend as a user-settable cache option, defaulting to the
  # auto-detected availability.
  set(${name} ${TP_CONDITIONAL_BACKEND_CAN_ENABLE} CACHE BOOL ${docstring})
  # Fail loudly if the user forced the backend ON but its prerequisites are
  # not met, rather than silently configuring a broken build.
  if(${name} AND NOT ${TP_CONDITIONAL_BACKEND_CAN_ENABLE})
    message(FATAL_ERROR "${name} was explicitly set, but that can't be honored")
  endif()
endmacro()

# Try to auto-detect the presence of some libraries in order to enable/disable
# the transports/channels that make use of them.

# TODO Add CUDA to this list, in order to fix the TODO below

# TODO: Default to ON if CUDA available.
option(TP_USE_CUDA "Enable support for CUDA tensors" OFF)

# Optional features
option(TP_BUILD_BENCHMARK "Build benchmarks" OFF)
option(TP_BUILD_MISC "Build misc tools" OFF)
option(TP_BUILD_PYTHON "Build python bindings" OFF)
option(TP_BUILD_TESTING "Build tests" OFF)

# Whether to build a static or shared library
if(BUILD_SHARED_LIBS)
  set(TP_STATIC_OR_SHARED SHARED CACHE STRING "")
else()
  set(TP_STATIC_OR_SHARED STATIC CACHE STRING "")
endif()
mark_as_advanced(TP_STATIC_OR_SHARED)

# Force to build libuv from the included submodule
option(TP_BUILD_LIBUV "Build libuv from source" OFF)

# Directories
include(GNUInstallDirs)
set(TP_INSTALL_LIBDIR ${CMAKE_INSTALL_LIBDIR} CACHE STRING
    "Directory in which to install libraries")
mark_as_advanced(TP_INSTALL_LIBDIR)
set(TP_INSTALL_INCLUDEDIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE STRING
    "Directory in which to install public headers")
mark_as_advanced(TP_INSTALL_INCLUDEDIR)

================================================ FILE: cmake/Sanitize.cmake ================================================

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# If SANITIZE is set (e.g. "address" or "thread"), enable the corresponding
# sanitizer for both compilation and linking.
if(SANITIZE)
  add_definitions("-fsanitize=${SANITIZE}")
  add_definitions("-fno-omit-frame-pointer")
  # add_definitions only affects compilation; the linker needs the flag too.
  set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=${SANITIZE}")
endif()

================================================ FILE: docs/cuda_gotchas.md ================================================

# CUDA gotchas

While implementing CUDA channels we hit some undocumented "quirks" which forced
us to adapt our original designs. We collect them here for future reference
(although this list may not be exhaustive). Please add more items whenever we
learn new things "the hard way". We’re mostly interested in unexpected
behaviors that could entail substantial design changes, although smaller
technical pitfalls are welcome too.
## Most functions initialize a context on the current device A lot of CUDA functions cause a CUDA context to be initialized for the "current" device (which is a thread-local variable managed by CUDA). This consumes on-device memory (plus it can cause deadlocks when combined with NCCL). By invoking CUDA functions without first explicitly setting the current device we risk accidentally initializing CUDA contexts on devices on which we weren’t supposed to (especially device 0, since it’s the "default"). In order to avoid this, a device guard should be used for *all* operations. They are very cheap, hence don’t be shy! At times it’s not clear which device should be used in such guard, for example during initialization, however we must only use devices that the user has explicitly provided, hence we may have to lazily delay initialization in those cases. ## Querying the device of a pointer can fail By choice, TensorPipe doesn’t ask users to provide the device index when they pass in a CUDA pointer, for simplicity, since it would be redundant as the device index can be extracted from the pointer. This "extraction" is thus the only CUDA operation for which we can’t possibly set up a device guard. This has proven to be a problem because, due to a bug in CUDA, the extraction would fail if the current device had been *explicitly* set to an invalid (uninitialized) device. (A default "unset" current device would work). This occurred often, because if we used a device guard when the current device was unset, its destructor would explicitly reset the current device to 0. Our investigation seemed to show that an unset current device in the CUDA runtime corresponded to a null current context in the CUDA driver, whereas an invalid current device corresponded to an invalid non-null context. Thus our workaround was to use the driver API directly and first reset its current context to null (in a sense, use a "reverse" device guard, which temporarily "unsets" the current device). 
## Releasing shared resources implicitly synchronizes Some CUDA operations perform an implicit device synchronization: they block the CPU thread until the GPU "catches up", that is, it waits for *all* previously-launched kernels for that device (on any stream) to complete. Such functions also cause later kernels (enqueued by another concurrent thread) to delay their launch on the device until the blocking function returns (we’ve occasionally been calling this a "kernel fence"). This is bad because it would mean that an internal TensorPipe operation can interfere with the user’s scheduling of kernels and thus degrade GPU utilization. The [CUDA programming guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#implicit-synchronization) mentions such a behavior (in section 3.2.6.5.4), however we’ve found out that the list of circumstances where this occurs is incomplete and incorrect. As a rule of thumb, we’ve seen this behavior happen mainly when *releasing* a resource shared among kernels (e.g., device memory, pinned host memory, IPC memory handles), as if CUDA wanted to ensure there were no kernels using this resource anymore before freeing it. A mental model could be to imagine that kernels acquire a shared lock to it, while freeing it needs a unique lock. The only solution to this limitation is to allocate a pool of these resources at the beginning and reuse them. ## Creating IPC events deadlocks Another CUDA bug we hit was that the creation of CUDA events with the interprocess flag would sometimes deadlock. [Here’s a (not so small) repro](https://gist.github.com/lw/f34836416e7674bbdda8b4925c2999f2). We couldn’t pin it down to a specific condition, or to a race with another call. NVIDIA confirmed the bug and supposedly fixed it in version 450 of the CUDA driver. Since we still need to support earlier versions, as a workaround we’re taking great care to create all our IPC events as early as possible (hoping to avoid whatever races) and reuse them. 
## Memory won’t be freed if there’s open IPC handles to it

Imagine that process B has received and opened an IPC handle to some device
memory allocated and owned by process A, and process A frees this memory
without B first closing its handle to it. The CUDA doc described this as
undefined behavior (hence we can’t complain), but in practice what we’ve
observed is that the memory will *not* be freed, that is, it will not be reused
for subsequent allocation requests, thus possibly causing OOMs. In a sense,
it’s as if that memory were "leaked". This is displayed rather confusingly in
`nvidia-smi`’s accounting: the memory appears as occupied in the device
statistics, but no process appears to be responsible for it.

## Cannot open same IPC handle more than once

There’s a limitation in older versions of CUDA where, if process A allocates
some memory, only *one* binding to it can be opened in process B using IPC
handles. Attempting to re-open the same handle a second time will fail. Note
that one cannot get multiple "different" handles for the same memory, as CUDA
always returns the same one. In practice it means that the user could pass some
memory for TensorPipe for which it has already manually created and shared a
handle, thus it’s unsafe for TensorPipe to also get and open a handle. We can
only safely do it for private memory that we’re managing ourselves. Also note
that this limitation was lifted in CUDA 11.1.

## The pointer for an opened IPC handle could be "offset" wrt the source pointer

The CUDA doc on this is clear albeit cryptic: given a pointer, CUDA returns the
IPC handle for its *allocation*. Hence if we allocate some memory at address p0
and ask for the IPC handle of address p1 = p0 + offset, we’ll get the IPC
handle for p0! This means that when we open the handle we need to add back that
offset. Luckily CUDA offers a function to query p0 given p1.
Note that this situation happens a lot in PyTorch due to the caching allocator sometimes returning slices from larger blocks. ## Not all pairs of GPUs can access each other’s memory Device to device (D2D) transfers are supported by CUDA only when peer-to-peer (P2P) capabilities exist between the two GPUs. This is handled transparently by CUDA, which will automatically select the most performant direct link. Concretely, it will use NVLink, but only if there’s a dedicated "cable" connecting those two devices. If the NVLink mesh is not a complete graph (as is often the case, e.g., hybrid-cube meshes (HCM) are very common), for the missing pairs CUDA will use PCIe transfers, but only if the two devices are attached to the same chipset/controller/host bridge. If there are multiple chipsets (which is also common, e.g., the DGX machines have two), then D2D transfers between some pairs of GPUs might just not be possible through CUDA! In principle this is easy enough to detect since CUDA offers a function for it (and `nvidia-smi topo` also displays it), however we can’t use it if the two devices aren’t both "visible" to the process (we’re referring to the `CUDA_VISIBLE_DEVICES` environment variable). For such cases the only option is to use the NVML library, which doesn’t honor that env var, but in turn adds the complexity of matching corresponding devices between CUDA and NVML (which is best done through their UUID). Moreover, additional complexity was required in TensorPipe to handle the case where some but not all pairs of GPUs between two processes supported P2P. ## Registering CUDA memory with IB is slow This is kinda known, but it’s better to repeat it: the registration and deregistration of memory with InfiniBand is considered a "setup" step, and is very slow, and should thus be avoided as much as possible during the "hot" data path, for example using a staging area or by caching these registrations. 
## Registering CUDA memory with IB requires an extra NVIDIA kernel module When we pass a pointer to InfiniBand for registration, InfiniBand needs to understand that this virtual address points to CUDA device memory and not to some CPU memory. For that it needs to be aware of CUDA, and it does so through so-called "peer memory client", which NVIDIA provides (through a separate kernel module) and registers with InfiniBand, and which is queried by InfiniBand before "falling back" to assuming the pointer points to CPU memory. This peer memory client feature is only available in Mellanox’s InfiniBand distribution (called OFED, OpenFabrics Enterprise Distribution), and not in vanilla upstream InfiniBand. On the client side (our side) luckily nothing changes in the API. ## Registering CUDA memory with IB occupies the PCIe window Each PCIe device has a handful of "memory windows" it exposes, through which the host or other devices can access and modify the device’s memory (both to issue commands and to send/retrieve data). These are called BARs (base address registers). In the case of NVIDIA GPUs the BAR that appears to map to the device’s main memory is BAR1. This is often sized much smaller than the memory itself (say, 256MB for a 16GB GPU), with the idea that it will just be used as a staging area. Also note that CUDA already reserves a few dozen MBs of that window. When registering CUDA device memory with InfiniBand, an additional mapping is created in that window (during the `ibv_reg_mr` call) and will thus fail if the window doesn’t have enough remaining space (e.g., if the buffer being registered is larger than the window). This means we can’t straightforwardly register the user-provided buffers. However, with the right combination of GPU and of CPU BIOS, the BAR1 can become as large as the GPU’s main memory itself, in which case this won’t be a problem anymore. 
## Registering CUDA memory with IB doesn’t leak it Contrary to IPC handles, freeing CUDA device memory while it’s still registered with InfiniBand does not appear to interfere with the deallocation, hence the memory will correctly become reusable. ## IB messages have a maximum size Each send/recv operation over InfiniBand can only handle up to a certain amount of data, usually at least 1GB, and will fail for larger amounts. This limit can be queried on the device, and chunking must be used for larger sizes. It appears that, at least on some NICs and with some drivers, there's also a "minimum size" of 32 bytes, with messages failing with odd errors for smaller sizes. It's still unclear whether it's a bug. ## GPUs need to be matched with the right IB NIC On some machine types there may be multiple GPUs and multiple InfiniBand devices and they need to be carefully matched. Using the same IB NIC for all GPUs will introduce a bottleneck while leaving all other NICs unused. Matching them up "randomly" means that the data paths over PCIe of different GPU-NIC pairs might cross each other (thus, again, causing a bottleneck), might traverse the host, or otherwise interfere. These machines are usually set up so that each GPU has one NIC that it’s "naturally" closest to, for example they share the same PCIe switch, thus we need a logic to be able to detect and implement this. ================================================ FILE: docs/development.md ================================================ # Development TensorPipe uses CMake for its build system. 
## Dependencies To build TensorPipe, you need: * C++14 compatible compiler (GCC >= 5.5 or Clang >= 6) ## Clone the repository Example: ``` shell git clone --recursive https://github.com/pytorch/tensorpipe ``` If you have updated an already cloned repository, make sure that the submodules are up to date: ``` shell git submodule sync git submodule update --init ``` It is imperative to check out the submodules before running CMake. Find the list of submodules and a description of what they're used for on [this page][third_party]. [third_party]: https://github.com/pytorch/tensorpipe/tree/main/third_party ## Using CMake Example: ``` shell mkdir build cd build cmake ../ -DCMAKE_BUILD_TYPE=Debug -DSANITIZE=thread make ``` You can specify CMake variables by passing them as arguments to the `cmake` command. Useful CMake variables: * `CMAKE_C_COMPILER` -- Define which C compiler to use. * `CMAKE_CXX_COMPILER` -- Define which C++ compiler to use. * `CMAKE_C_FLAGS` -- Additional flags for the C compiler. * `CMAKE_CXX_FLAGS` -- Additional flags for the C++ compiler. * `CMAKE_BUILD_TYPE` -- For example: `release`, `debug`. Useful TensorPipe specific variables: * `SANITIZE` -- configure the sanitizer to use (if any); for example: `address` or `thread`, to run with `asan` or `tsan`, respectively. ## Ninja To make CMake output something other than the default `Makefile`, see [`cmake-generators(7)`][cmake-generators]. We like to use the [Ninja][ninja] generator because it works well for incremental builds. On the command line, specify `-GNinja` to use it. [cmake-generators]: https://cmake.org/cmake/help/v3.4/manual/cmake-generators.7.html [ninja]: https://en.wikipedia.org/wiki/Ninja_(build_system) ================================================ FILE: docs/linux_support.md ================================================ This document is intended for developers and advanced users. 
It’s the kind of document that risks going out of date very quickly, hence take it with a grain of salt. In order to try to be as performant as possible, TensorPipe sometimes relies on new and advanced kernel features. This is causing issues to users who are building and/or running on old kernels. Hence, whenever we use such features, we should always “guard” them somehow, i.e., detect their availability at compile-time or (preferably) at runtime, and disable the backend or mark it non-viable. It is ok-ish for users with old kernels to not have access to all backends, as long as there’s always at least one backend they can use. ## Compile-time vs runtime, Linux vs glibc Unfortunately, both the kernel version used for building and the one used for running affect whether we can use a feature. This means that the availability of a function or flag during build doesn’t mean it will be supported at runtime (this is especially true for the official builds of PyTorch). On the other hand, it also means that even if the runtime kernel supports a feature, we may not be able to use it because we didn’t have access to a system header when building (e.g., to get a flag). While sometimes we can “polyfill” this information, it’s not always doable. An additional complication is added by the fact that we typically access syscalls through their glibc wrappers. First of all, this means we only get access to a syscall once glibc wraps it, which could happen years later. But it also means we link to a glibc symbol, and thus to a specific version of glibc’s shared object. With the kernel, using an unsupported feature results in a runtime error when first used, which we can catch; but with glibc we get a loader error due to missing symbols at startup, even if the user doesn’t use TensorPipe, even if we could “tolerate” these symbols’ absence. It is thus desirable at times to avoid the glibc wrappers. 
## Common tricks for how to guard/polyfill * Kernel flags are typically defined as preprocessor flags (i.e., `#define FOO`). This is stuff like `O_TMPFILE`, `MAP_SHARED_VALIDATE`, `PR_SET_PTRACER`, ... It’s easy to detect this in the code, with a `#ifdef FOO`, and since these flags are (usually?) constants, it’s also easy to define them ourselves. This “polyfill” allows us to build on an old kernel but still run on a new one. * For a new-ish syscall, we probably don’t want to use the glibc wrapper, for the problems described above, and because it’s hard to detect its availability (the best option is a CMake check whose result we inject as a preprocessor flag). An alternative is to invoke it through the generic `syscall` syscall, using the `SYS_foo` flags. This could bring a few issues on its own (especially for 32bit systems) but for now it hasn’t come to bite us. This way we skip glibc entirely, and simply end up getting ENOSYS if the runtime kernel doesn’t support the syscall. Those `SYS_foo` flags are defined by glibc, but it seems glibc defines them automatically for all the syscalls it “finds” in the kernel, and not just for the syscalls that glibc supports. Unfortunately we cannot “polyfill” the `SYS_foo` flags if we don’t find them, because they have different values on different architectures. ## What do others do? Since [Apr 2017](https://github.com/libuv/libuv/commit/4e6101388015c6d0879308d566f0a4b79edc0c13), libuv only supports Linux 2.6.32 (December 2009) and glibc 2.12 (May 2010). (This doesn’t mean earlier versions are necessarily broken, but that libuv reserves the right to break them). Libuv seems to be somewhat tied to the RedHat/CentOS releases, which are common and have a very long lifespan. It doesn’t make sense for us to support older versions than what libuv does, because if libuv decides to break them there’s nothing we can do. 
PyTorch tries to support the [manylinux2014 platform](https://www.python.org/dev/peps/pep-0599/) (defined by Python for use in PyPI/pip), which allows up to glibc 2.17 (December 2012). However, it’s not clear if we’re there yet, and the previous version is `manylinux2010` which comes with glibc 2.12. Hence a reasonable recommendation seems to be to draw the line at Linux 2.6.32 and glibc 2.12. However, people with older versions than those have already reported issues and asked for fixes, which we can probably consider on a case-by-case basis. ## Kernel features used by TensorPipe ### Linux 2.1.4 (October 1996) * The `getresuid` and `getresgid` syscalls. ### Linux 2.3.16 (September 1999) * The `/proc/sys/kernel/random/boot_id` file. See `random(4)`. No git hash as it predates the use of git by Linux https://github.com/torvalds/linux/blob/1da177e4c3f41524e886b7f1b8a0c1fc7321cac2/drivers/char/random.c#L1270-L1278 ### Linux 2.3.20 (October 1999) * The `PR_GET_DUMPABLE` flag for `prctl`. No git hash as it predates the use of git by Linux https://github.com/torvalds/linux/blob/1da177e4c3f41524e886b7f1b8a0c1fc7321cac2/include/linux/prctl.h#L10 ### Linux 2.6.26 (July 2008) * Version 3 of Linux capabilities. (Initial capability support, including the `capget` syscall, dates back to Linux 2.1.100, from May 1998). See `capget(2)`. https://github.com/torvalds/linux/commit/ca05a99a54db1db5bca72eccb5866d2a86f8517f ### Linux 3.2 (January 2012) * Cross-Memory Attach (i.e., the `process_vm_readv` syscall). See `process_vm_readv(2)`. https://github.com/torvalds/linux/commit/fcf634098c00dd9cd247447368495f0b79be12d1 ### Linux 3.4 (May 2012) * The YAMA security module, and thus the `/proc/sys/kernel/yama/ptrace_scope` file. This includes the `PR_SET_PTRACER` and the `PR_SET_PTRACER_ANY` flags for `prctl`. See `ptrace(2)`. 
https://github.com/torvalds/linux/commit/2d514487faf188938a4ee4fb3464eeecfbdcf8eb https://github.com/torvalds/linux/commit/bf06189e4d14641c0148bea16e9dd24943862215 ### Linux 3.8 (February 2013) * The `/proc/[pid]/ns/[ns]` files. Although that directory, and the `net` file therein, were already present in 3.0, the `pid` and `user` ones only arrived in 3.8 and, more importantly, the ability to identify a namespace by the inode number of those files came in 3.8 (when they stopped being hardlinks and became symlinks). See `proc(5)` and `namespaces(7)` and others. https://github.com/torvalds/linux/commit/6b4e306aa3dc94a0545eb9279475b1ab6209a31f https://github.com/torvalds/linux/commit/13b6f57623bc485e116344fe91fbcb29f149242b https://github.com/torvalds/linux/commit/57e8391d327609cbf12d843259c968b9e5c1838f https://github.com/torvalds/linux/commit/cde1975bc242f3e1072bde623ef378e547b73f91 https://github.com/torvalds/linux/commit/bf056bfa80596a5d14b26b17276a56a0dcb080e5 https://github.com/torvalds/linux/commit/98f842e675f96ffac96e6c50315790912b2812be ### Linux 3.11 (September 2013) * The `O_TMPFILE` flag for `open`. See `open(2)`. https://github.com/torvalds/linux/commit/60545d0d4610b02e55f65d141c95b18ccf855b6e ### Linux 3.17 (October 2014) * The `memfd_create` syscall. See `memfd_create(2)`. https://github.com/torvalds/linux/commit/9183df25fe7b194563db3fec6dc3202a5855839c ### Linux 4.11 (April 2017) * The `/sys/kernel/security/lsm` file in `securityfs` (a list of active Linux Security Modules). https://github.com/torvalds/linux/commit/d69dece5f5b6bc7a5e39d2b6136ddc69469331fe ### TODO * All that sysfs PCIe stuff done by CUDA GDR (e.g., resolving GPUs and NICs to PCIe paths, getting the BAR1 size, ...), plus checking the nv_mem_peer module ## Glibc features required by TensorPipe ### Glibc 2.2.5 (January 2002) * The `capget` function. ### Glibc 2.3.3 (December 2003) * The `dlinfo` function. 
(All of `dlopen`, `dlclose`, `dlsym` and `dlerror` were present since at least glibc 2.0). ### Glibc 2.12 (May 2010) * The `pthread_setname_np` function. ================================================ FILE: docs/shm.md ================================================ # The shm transport This document is an attempt to capture the design principles and inner working of the shm transport (see `tensorpipe/transport/shm`). Its performance makes it an efficient alternative to IP based transports for same-machine communication. At the core of a transport implementation lies a listener, a connection, and a context. Listeners accept connections. Contexts create listeners and can connect to remote listeners. ## Concepts ### Ring buffers Shared memory ring buffers are a core building block for the shm transport. They are implemented with split control and data sections. This means the data section can be fully aligned. The header section stores a read/write transaction flag and the head and tail offsets into the data section. Producers and consumers of the ring buffer use atomic instructions to mutate this header depending on their intent. ### File descriptors The header and data segments of a shared memory ring buffer are created as follows. First, a file is created in `/dev/shm` with the `O_TMPFILE` flag. This means that anything written to the resulting file is lost when the last file descriptor is closed, unless the file is given a name. Because we never give this file a name, the segment is automatically cleaned up when the last process that has its file descriptor terminates. Per above, creating a shared memory ring buffer yields 2 file descriptors, one for the header segment and one for the data segment. These file descriptors are shared over a Unix domain socket. ### The reactor This is a TensorPipe specific component. It uses a shared memory ring buffer to allow other processes to trigger functions. 
If a process wants another process to trigger a function, it registers this function with the reactor, and gets back a 32-bit token. Then, the file descriptors of the reactor's ring buffer, as well as the token, are sent to another process. The other process can now map the reactor ring buffer, and trigger the registered function by writing the token to the ring buffer. See [considerations](#considerations) below on why this was used. ### Unix domain sockets Coordination between process to bootstrap a connection that uses shared memory ring buffers is implemented using Unix domain sockets. The listening side of a connection binds and listens on an abstract socket address. A typical Unix domain socket "address" is a filesystem pathname. An abstract socket address, by contrast, is not visible on any filesystem. They exist in a single abstract socket namespace shared by all processes on the machine. Removing the filesystem dependency means two things: 1. (+) It is not necessary to purge stale Unix domain socket files. 2. (-) These sockets don't have permissions, so any process that has its name can connect. Read more about abstract domain sockets [here][1] and [here][2]. [1]: http://man7.org/linux/man-pages/man7/unix.7.html [2]: https://utcc.utoronto.ca/~cks/space/blog/linux/SocketAbstractNamespace Once processes have established a Unix domain socket, it is used to: 1. Pass the shared memory file descriptors to a peer process. 2. Signal peer termination (through eof on socket closure). 3. ... nothing else. All data moves through the ring buffers. **Note:** abstract socket addresses are a Linux specific feature. ## Bringing it together So, to establish one of these shared memory connections, we first listen on some unique abstract socket address. This address must be known to the process that wishes to connect. For a quick test we can use a pre-shared address. Otherwise, we can generate a UUID and share it with some out of band mechanism. 
The connecting process connects and the listening process accepts. We have now established a Unix domain socket and move on to the next step. Each process creates a new shared memory ring buffer specifically for this connection. We refer to this ring buffer as the _inbox_. We expect each process to be pinned to a specific NUMA node and perform the memory allocation in the same NUMA domain. The file descriptors of the inbox, the file descriptors of the reactor, and a token to trigger readability of the inbox, are shared over the socket. Each process receives file descriptors from their peer and initializes the corresponding ring buffers. The peer's inbox is referred to as the _outbox_. The token to trigger remote readability is referred to as the _outbox trigger_. The connection is now established! Writes are performed by writing directly into the outbox and triggering the outbox trigger. The trigger wakes up the peer's reactor and executes a function that notifies the connection of readability. Subsequently, the connection checks if there was a pending read operation, and processes it if so. When either process destructs the connection, or crashes, the original Unix domain socket is closed, which signals the peer process that it shouldn't expect more writes to its inbox and can destruct the connection as well. ## Considerations A single process may have multiple connections. Therefore, it may have multiple inbox ring buffers. One way to react to incoming writes is to simply check if there are any bytes to read. This requires checking all N inboxes for reads, which can become problematic if N gets large. To better solve this multiplexing problem we initially used an [`eventfd(2)`][eventfd] per inbox. This file descriptor was registered with the existing [`epoll(7)`][epoll] loop and would trigger the readability function when it became readable. To perform a write, the peer process would first write to the outbox and then write to the peer's eventfd. 
[eventfd]: http://man7.org/linux/man-pages/man2/eventfd.2.html [epoll]: http://man7.org/linux/man-pages/man7/epoll.7.html A simple ping/pong performance benchmark using this approach, with both processes pinned to the same NUMA node, showed a lower bound latency of ~12 microseconds. This seemed high for a pair of ring buffer writes, so we explored alternatives, and came up with the reactor approach. Now, the same benchmark runs with a lower bound latency of about ~1.7 microseconds, which is a 7x improvement over the `eventfd(2)`/`epoll(7)` approach. ================================================ FILE: docs/thread_model.md ================================================ # TensorPipe's thread model TensorPipe is spawning multiple threads internally. This is a design requirement as, for example, a single thread wouldn't manage to drive a modern network interface card (NIC) at capacity and saturate its bandwidth, even if it did nothing but write on the socket: multiple threads writing in parallel to multiple sockets are the only way to achieve that. Moreover, the possibility of spawning new threads when needed allows for a simpler architecture in the implementation of TensorPipe's modular approach to backends (transports and channels): if one of these backends needs to perform some heavy operation (a blocking syscall, an event loop, ...) it can launch a dedicated thread for it rather than having to schedule it on the user thread or on a shared thread pool, thus having to "fit" the operation into some framework. This heavy reliance on multi-threading poses of course challenges in coordination and robustness. This document aims to outline the patterns we've ended up adopting to have a structured and principled design around this. ## Callbacks TensorPipe uses callbacks to organize the control flow around asynchronous and deferred execution. 
While this may be an anti-pattern leading to so-called "spaghetti code" or "callback hell", we realized that it was the only approach that would yield the performance we need. Modern alternatives to callbacks (promises/futures, coroutines, ...) would have introduced an unacceptable overhead in some cases. Nearly all operations in TensorPipe are non-blocking and are performed asynchronously, in background, with their results notified through callbacks. This includes the creation of pipes and connections (the objects may still be performing initialization when they are given to the user and, although operations can be performed on them, these will be delayed until setup completes). And it also includes destruction, which means that internal resources may not be immediately freed when a user-facing object is deleted. The only synchronization point that allows the user to wait for such cleanup to finish is the context's `join` method. Some other methods that may occasionally wait are the ones that return a value, for example the ones to retrieve addresses. ## Shared pointers As soon as threads and callbacks enter the mix, race conditions start to pop up. Among the first ones, there's the problem of ownership: ideally we want a `unique_ptr`-style semantics, where each object has a clear owner who controls its lifetime. However, when this owner asks another thread to perform an operation on that object as part of a callback, that callback also (temporarily) needs access to the object. As there may be multiple operations with multiple callbacks at the same time, transferring ownership isn't an option, and sharing it is the only way to go. This however requires synchronization among the various users: if the "real" user had a `unique_ptr` and gave raw pointers to the callbacks, the real user may delete the object without the callbacks noticing or having any way to stop/delay it. This would then cause use-after-free errors. 
There must thus be a sort of "lock" that prevents the object from being deleted while someone is working on it, like a "semaphore" counting the users. It turns out a perfect tool for the job is `shared_ptr`. Acquiring a lock on the object corresponds to obtaining a `shared_ptr` instance, which increases the reference count. The object will only be deleted when its refcount reaches zero, which means all its users (the "real" ones and the callbacks) have stopped using the object. We have however solved a problem by creating an opposite one: a memory leak. Imagine an object (say, a pipe) that is the "real" owner of another one (say, a channel) from which it is expecting a callback, and that callback captures a `shared_ptr` to the first object in its closure. This is a reference cycle. It means that even if the "real" owner of the first object relinquishes its `shared_ptr`, the objects won't be destroyed until the callback fires (if ever). An easy solution to this is to have callbacks only keep a `shared_ptr` when they are running, not while they are waiting. Again, the standard library has the perfect tool for the job: the `weak_ptr`, which will keep the refcount unchanged but can be "locked" to obtain a real `shared_ptr` when needed (curious coincidence that the terminology aligns with ours). So, in short: the real owner of an object keeps a `shared_ptr` to it, it passes `weak_ptr`s to be stored in callbacks, and these are locked back to `shared_ptr`s just before running the callbacks. (If locking fails, the callback isn't run). ## Public objects vs private implementations It turns out that what we said above isn't always true: in some cases we may want a callback to keep the object alive until it has fired. 
This happens because some callbacks are one half of a "contract" regarding data ownership: throughout the API (at higher and lower levels), `read`, `write`, `send` and `recv` methods take some data (source or destination buffers), and by doing so the caller hands over control of the data to the object. The way for the object to yield ownership back to the caller is by invoking the callback. We must thus ensure that these callbacks are always called. However, we must also avoid calling them when we're not ready yet to give up access to the data. For a more concrete example, consider the user trying to destroy a pipe that has a pending write operation, while some other thread is simultaneously performing a memory copy as part of that write operation. If we invoke the write operation's callback before aborting the memory copy we're giving the user the right to deallocate the buffer, which may lead the other thread to segfault. Here is what needs to happen: when a user deletes a pipe, all its pending operations must be interrupted, which in turn also aborts the lower level operations; the pipe's callbacks, however, must not be fired and instead kept alive while waiting for the lower level operations to wrap up, and only then they can be triggered. This shows that a subset of the pipe, containing at least the callbacks, must survive the destruction of the whole pipe. In other words, the lifetime of the inner part must be detacheable from the one of the outer shell. In order to do so, most public objects are just thin wrappers around a single member field, which is just a pointer to an instance of a private "implementation" (abbreviated as impl), which is where everything happens. The impl is a `shared_ptr` so that its life cycle can be detached and extended with respect to the one of the public object. The callbacks that we must wait for in order to regain control of some resource also capture a `shared_ptr`. 
This way we can still get the "signal" from when the public object is deleted (and can start terminating pending operations) but we're also able to keep the impl around while waiting for the shutdown to complete. ## Locking Objects can be accessed and worked on from many threads, from all directions, above (user threads, higher up the stack) and below (low-level backend threads). To avoid race conditions on the internal state of these objects, we must have mutual exclusion between threads, using locks. While it may be possible to have separate fine-grained locks for different parts of some objects, in general it is safer and easier to have one mutex per object, and use it to lock all operations. That's easily said, but it just as easily leads to deadlocks, which in our experience come in two flavors: - When an object (holding its own lock) calls an "upward" callback which (inline/serially) tries to perform an operation on that same object, which tries to acquire the same lock. This is a perfectly legitimate behavior, since all of our callbacks are "one-shot", that is, they "burn out" after they fire and thus must be immediately rearmed. - When an object (holding its own lock) performs an operation on a lower level object, passing a callback to it, and this callback is called immediately (inline/serially) and tries to also acquire the lock of the first object. This typically happens when the lower level object is in an error state and can thus "shortcut" the operation and immediately trigger the callback instead of deferring it to a thread. Mitigations for these problems are possible but none is universal and they all have drawbacks. Examples are: - When calling upward callbacks, extract one from the object onto the stack, put the object in a consistent state, release its lock and then call the callback. This works but there's a racing risk which would cause callbacks to not be called in their intended order. - Have a dedicated thread from which to invoke callbacks. 
Therefore other threads, instead of triggering callbacks, push them to some queue that is consumed by this thread. This resembles the semi-future and executor pattern. We used to have such a pattern in place for calling the pipe callbacks but it was introducing an unacceptable latency overhead. - The backends already typically have a thread they can defer callbacks to, and for the most part they already do. However having such a thread isn't necessarily a requirement for a transport, and such threads may not be running at all times (e.g., once a backend has been joined). - We could replace regular locks with reentrant locks (also called recursive). This is typically considered bad practice, though, and when at some point we tried this we indeed hit problems. The next section presents a more disciplined way of dealing with races. ## Event loops A classic way of dealing with parallel I/O is event loops: repeatedly polling a set of file descriptors for readability/writability (blocking to wait for them to become ready), dealing with them, and repeating. Syscalls to do this are `select`, `epoll`, and more. The `libuv` library used by one of TensorPipe's transports is also based on an event loop. Event loops are typically single-threaded, and they allow to "simulate" parallelism by multiplexing thread if those threads would spend most of their time doing blocking I/O. The simplicity of event loops, their single-threaded safety and their established effectiveness prompted us to make them a foundation of our threading model. If an object already has a thread to which it offloads some operations (this is the case for most transports and some channels, but not the pipe) then we defer all operations to it. And we really mean all of them: all manipulation of the object (scheduling operations, querying information, running callbacks) must be done from within that event loop thread. 
All operations that are attempted on the object, either from another thread or from within the event loop thread (for example, by a callback in user code) are deferred, appended to a queue, and dealt with at a later iteration of the loop. This guarantees that we'll always have a single thread accessing such objects, thus ensuring thread safety without even using any locks. Note that such design isn't a requirement for transports, it's just the pattern that we've adopted for all our current transports. If, on the other hand, an object does not have access to a thread to use as an event loop, we'll "borrow" the caller's thread and temporarily use it as an event loop. We'll similarly have a queue of tasks, and the thread will consume them one by one, until none are left, at which point we'll stop occupying the thread and release it back to the caller. If any new operation is attempted by another thread while one of these temporary event loops is running, that operation is added to the queue and thus deferred to the already-running event loop, with the new thread immediately able to return to what it was doing. ================================================ FILE: setup.py ================================================ #!/usr/bin/env python3 # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
import os import subprocess import sys from pathlib import Path from setuptools import Extension, setup from setuptools.command.build_ext import build_ext class CMakeBuild(build_ext): def run(self): for ext in self.extensions: self.build_extension(ext) def build_extension(self, ext): if not os.path.exists(self.build_temp): os.makedirs(self.build_temp) source_path = Path(__file__).parent.resolve() output_path = Path(self.get_ext_fullpath(ext.name)).parent.resolve() build_type = "Debug" if self.debug else "Release" cmake_cmd = [ "cmake", f"{source_path}", f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={output_path}", f"-DPYTHON_EXECUTABLE={sys.executable}", f"-DCMAKE_BUILD_TYPE={build_type}", "-DCMAKE_C_COMPILER=clang-6.0", "-DCMAKE_CXX_COMPILER=clang++-6.0", "-DCMAKE_POSITION_INDEPENDENT_CODE=true", "-DTP_BUILD_PYTHON=true", ] for opt in os.environ: if opt.startswith("TP_"): cmake_cmd.append(f"-D{opt}={os.environ[opt]}") make_cmd = ["make", "-j", "pytensorpipe"] subprocess.check_call(cmake_cmd, cwd=self.build_temp) subprocess.check_call(make_cmd, cwd=self.build_temp) setup( name="tensorpipe", version="0.0.0", author="Facebook AI Research", ext_modules=[Extension("pytensorpipe", sources=[])], cmdclass={"build_ext": CMakeBuild}, zip_safe=False, ) ================================================ FILE: tensorpipe/.clang-format ================================================ --- AccessModifierOffset: -1 AlignAfterOpenBracket: AlwaysBreak AlignConsecutiveAssignments: false AlignConsecutiveDeclarations: false AlignEscapedNewlinesLeft: true AlignOperands: false AlignTrailingComments: false AllowAllParametersOfDeclarationOnNextLine: false AllowShortBlocksOnASingleLine: false AllowShortCaseLabelsOnASingleLine: false AllowShortFunctionsOnASingleLine: Empty AllowShortIfStatementsOnASingleLine: false AllowShortLoopsOnASingleLine: false AlwaysBreakAfterReturnType: None AlwaysBreakBeforeMultilineStrings: true AlwaysBreakTemplateDeclarations: true BinPackArguments: false BinPackParameters: 
false BraceWrapping: AfterClass: false AfterControlStatement: false AfterEnum: false AfterFunction: false AfterNamespace: false AfterObjCDeclaration: false AfterStruct: false AfterUnion: false BeforeCatch: false BeforeElse: false IndentBraces: false BreakBeforeBinaryOperators: None BreakBeforeBraces: Attach BreakBeforeTernaryOperators: true BreakConstructorInitializersBeforeComma: false BreakAfterJavaFieldAnnotations: false BreakStringLiterals: false ColumnLimit: 80 CommentPragmas: '^ IWYU pragma:' CompactNamespaces: false ConstructorInitializerAllOnOneLineOrOnePerLine: true ConstructorInitializerIndentWidth: 4 ContinuationIndentWidth: 4 Cpp11BracedListStyle: true DerivePointerAlignment: false DisableFormat: false ForEachMacros: [ FOR_EACH_RANGE, FOR_EACH, ] IncludeCategories: - Regex: '^<.*\.h(pp)?>' Priority: 1 - Regex: '^<.*' Priority: 2 - Regex: '.*' Priority: 3 IndentCaseLabels: true IndentWidth: 2 IndentWrappedFunctionNames: false KeepEmptyLinesAtTheStartOfBlocks: false MacroBlockBegin: '' MacroBlockEnd: '' MaxEmptyLinesToKeep: 1 NamespaceIndentation: None ObjCBlockIndentWidth: 2 ObjCSpaceAfterProperty: false ObjCSpaceBeforeProtocolList: false PenaltyBreakBeforeFirstCallParameter: 1 PenaltyBreakComment: 300 PenaltyBreakFirstLessLess: 120 PenaltyBreakString: 1000 PenaltyExcessCharacter: 1000000 PenaltyReturnTypeOnItsOwnLine: 2000000 PointerAlignment: Left ReflowComments: true SortIncludes: true SpaceAfterCStyleCast: false SpaceBeforeAssignmentOperators: true SpaceBeforeParens: ControlStatements SpaceInEmptyParentheses: false SpacesBeforeTrailingComments: 1 SpacesInAngles: false SpacesInContainerLiterals: true SpacesInCStyleCastParentheses: false SpacesInParentheses: false SpacesInSquareBrackets: false Standard: Cpp11 TabWidth: 8 UseTab: Never ... 
================================================ FILE: tensorpipe/.clang-tidy ================================================ --- InheritParentConfig: true Checks: ' readability-identifier-naming, readability-inconsistent-declaration-parameter-name, readability-named-parameter, ' FormatStyle: file CheckOptions: # Names of classes (and structs?) - { key: readability-identifier-naming.ClassCase, value: CamelCase } # Names of enums and enum classes - { key: readability-identifier-naming.EnumCase, value: CamelCase } # Names of members and methods of classes (and structs?) - { key: readability-identifier-naming.MemberCase, value: camelBack } - { key: readability-identifier-naming.PrivateMemberCase, value: camelBack } - { key: readability-identifier-naming.PrivateMemberSuffix, value: '_' } - { key: readability-identifier-naming.ProtectedMemberCase, value: camelBack } - { key: readability-identifier-naming.ProtectedMemberSuffix, value: '_' } - { key: readability-identifier-naming.MethodCase, value: camelBack } # Names of parameters and local variables - { key: readability-identifier-naming.LocalVariableCase, value: camelBack } - { key: readability-identifier-naming.ParameterCase, value: camelBack } # Names of constants - { key: readability-identifier-naming.GlobalConstantCase, value: CamelCase } - { key: readability-identifier-naming.GlobalConstantPrefix, value: 'k' } # FIXME scoped enums are only supported in clang-tidy 12. 
# Names of (non-class) enum members # - { key: readability-identifier-naming.EnumConstantCase, value: UPPER_CASE } # Names of enum class members # - { key: readability-identifier-naming.ScopedEnumConstantCase, value: CamelCase } # - { key: readability-identifier-naming.ScopedEnumConstantPrefix, value: 'k' } # Names of template parameters - { key: readability-identifier-naming.TemplateParameterCase, value: CamelCase } # Names of global functions - { key: readability-identifier-naming.FunctionCase, value: camelBack } # Names of namespaces - { key: readability-identifier-naming.NamespaceCase, value: lower_case } ... ================================================ FILE: tensorpipe/CMakeLists.txt ================================================ # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. # TP_SRCS is the list of source files that we need to build libtensorpipe. set(TP_SRCS) # TP_PUBLIC_HDRS is the list of public header files that we need to install. 
set(TP_PUBLIC_HDRS) # TP_LINK_LIBRARIES is the list of dependent libraries to be linked set(TP_LINK_LIBRARIES) # TP_INCLUDE_DIRS is the list of include paths to be used set(TP_INCLUDE_DIRS)
uv::uv) ### shm tp_conditional_backend( TP_ENABLE_SHM "Enable shared-memory transport" "LINUX") if(TP_ENABLE_SHM) list(APPEND TP_SRCS common/epoll_loop.cc common/shm_segment.cc transport/shm/connection_impl.cc transport/shm/context_impl.cc transport/shm/factory.cc transport/shm/listener_impl.cc transport/shm/reactor.cc transport/shm/sockaddr.cc) list(APPEND TP_PUBLIC_HDRS transport/shm/factory.h) set(TENSORPIPE_HAS_SHM_TRANSPORT 1) endif() ### ibv tp_conditional_backend( TP_ENABLE_IBV "Enable InfiniBand transport" "LINUX") if(TP_ENABLE_IBV) list(APPEND TP_SRCS common/epoll_loop.cc common/ibv.cc transport/ibv/connection_impl.cc transport/ibv/context_impl.cc transport/ibv/error.cc transport/ibv/factory.cc transport/ibv/listener_impl.cc transport/ibv/reactor.cc transport/ibv/sockaddr.cc transport/ibv/utility.cc) list(APPEND TP_PUBLIC_HDRS transport/ibv/error.h transport/ibv/factory.h transport/ibv/utility.h) set(TENSORPIPE_HAS_IBV_TRANSPORT 1) endif() ## MAC OS specific library deps if(APPLE) find_library(CF CoreFoundation) find_library(IOKIT IOKit) list(APPEND TP_LINK_LIBRARIES ${CF} ${IOKIT}) endif() ## Config configure_file(config.h.in config.h) ## Libnop # We should keep libnop headers private as they should not be exposed to downstream users, # but they're currently transitively included by tensorpipe/transport/connection.h (which # is still unclear whether it should be a public or private header). 
list(APPEND TP_INCLUDE_DIRS $) ## Target # Add the tensorpipe library target add_library(tensorpipe ${TP_STATIC_OR_SHARED} ${TP_SRCS}) # Set target properties if(BUILD_SHARED_LIBS) set_target_properties(tensorpipe PROPERTIES POSITION_INDEPENDENT_CODE 1) endif() # Add all the link libraries and include directories to the tensorpipe target and keeping the link PUBLIC target_link_libraries(tensorpipe PRIVATE ${TP_LINK_LIBRARIES}) target_include_directories(tensorpipe PUBLIC ${TP_INCLUDE_DIRS}) ## Install install(TARGETS tensorpipe EXPORT TensorpipeTargets LIBRARY DESTINATION ${TP_INSTALL_LIBDIR} ARCHIVE DESTINATION ${TP_INSTALL_LIBDIR}) foreach(_header_file ${TP_PUBLIC_HDRS}) get_filename_component(_TP_HEADER_SUBDIR "${_header_file}" DIRECTORY) install(FILES ${_header_file} DESTINATION ${TP_INSTALL_INCLUDEDIR}/tensorpipe/${_TP_HEADER_SUBDIR}) endforeach() install(FILES ${CMAKE_CURRENT_BINARY_DIR}/config.h DESTINATION ${TP_INSTALL_INCLUDEDIR}/tensorpipe) ## CUDA if(TP_USE_CUDA) # TP_SRCS is the list of source files that we need to build libtensorpipe. set(TP_CUDA_SRCS) # TP_PUBLIC_HDRS is the list of public header files that we need to install. 
set(TP_CUDA_PUBLIC_HDRS) # TP_LINK_LIBRARIES is list of dependent libraries to be linked set(TP_CUDA_LINK_LIBRARIES) # TP_INCLUDE_DIRS is list of include path to be used set(TP_CUDA_INCLUDE_DIRS) find_package(CUDA REQUIRED) list(APPEND TP_CUDA_LINK_LIBRARIES ${CUDA_LIBRARIES}) list(APPEND TP_CUDA_INCLUDE_DIRS ${CUDA_INCLUDE_DIRS}) list(APPEND TP_CUDA_SRCS common/cuda_buffer.cc) list(APPEND TP_CUDA_PUBLIC_HDRS tensorpipe_cuda.h common/cuda_buffer.h) ### cuda_xth list(APPEND TP_CUDA_SRCS channel/cuda_xth/channel_impl.cc channel/cuda_xth/context_impl.cc channel/cuda_xth/factory.cc) list(APPEND TP_CUDA_PUBLIC_HDRS channel/cuda_xth/factory.h) ### cuda_basic list(APPEND TP_CUDA_SRCS channel/cuda_basic/channel_impl.cc channel/cuda_basic/context_impl.cc channel/cuda_basic/factory.cc common/cuda_loop.cc) list(APPEND TP_CUDA_PUBLIC_HDRS channel/cuda_basic/factory.h) ### cuda_ipc tp_conditional_backend( TP_ENABLE_CUDA_IPC "Enable CUDA inter-process communication channel" "TP_USE_CUDA") if(TP_ENABLE_CUDA_IPC) list(APPEND TP_CUDA_SRCS channel/cuda_ipc/channel_impl.cc channel/cuda_ipc/context_impl.cc channel/cuda_ipc/factory.cc) list(APPEND TP_CUDA_PUBLIC_HDRS channel/cuda_ipc/factory.h) set(TENSORPIPE_HAS_CUDA_IPC_CHANNEL 1) endif() ### cuda_gdr tp_conditional_backend( TP_ENABLE_CUDA_GDR "Enable CUDA GpuDirect (InfiniBand) channel" "LINUX") if(TP_ENABLE_CUDA_GDR) list(APPEND TP_CUDA_SRCS common/ibv.cc channel/cuda_gdr/channel_impl.cc channel/cuda_gdr/context_impl.cc channel/cuda_gdr/factory.cc) list(APPEND TP_CUDA_PUBLIC_HDRS channel/cuda_gdr/error.h channel/cuda_gdr/factory.h) set(TENSORPIPE_HAS_CUDA_GDR_CHANNEL 1) endif() configure_file(config_cuda.h.in config_cuda.h) add_library(tensorpipe_cuda ${TP_STATIC_OR_SHARED} ${TP_CUDA_SRCS}) if(BUILD_SHARED_LIBS) set_target_properties(tensorpipe_cuda PROPERTIES POSITION_INDEPENDENT_CODE 1) endif() target_link_libraries(tensorpipe_cuda PUBLIC tensorpipe) target_link_libraries(tensorpipe_cuda PRIVATE ${TP_CUDA_LINK_LIBRARIES}) 
target_include_directories(tensorpipe_cuda PUBLIC ${TP_CUDA_INCLUDE_DIRS}) install(TARGETS tensorpipe_cuda EXPORT TensorpipeTargets LIBRARY DESTINATION ${TP_INSTALL_LIBDIR} ARCHIVE DESTINATION ${TP_INSTALL_LIBDIR}) foreach(_header_file ${TP_CUDA_PUBLIC_HDRS}) get_filename_component(_TP_HEADER_SUBDIR "${_header_file}" DIRECTORY) install(FILES ${_header_file} DESTINATION ${TP_INSTALL_INCLUDEDIR}/tensorpipe/${_TP_HEADER_SUBDIR}) endforeach() install(FILES ${CMAKE_CURRENT_BINARY_DIR}/config_cuda.h DESTINATION ${TP_INSTALL_INCLUDEDIR}/tensorpipe) endif() ## Python bindings if(TP_BUILD_PYTHON) add_subdirectory(python) endif() ## Benchmarks if (TP_BUILD_BENCHMARK) add_subdirectory(benchmark) endif() ## Misc tools if (TP_BUILD_MISC) add_subdirectory(misc) endif() ## Tests if(TP_BUILD_TESTING) add_subdirectory(test) endif() ================================================ FILE: tensorpipe/benchmark/CMakeLists.txt ================================================ # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. # TODO: Make those separate CMake projects. add_executable(benchmark_transport benchmark_transport.cc options.cc transport_registry.cc) target_link_libraries(benchmark_transport PRIVATE tensorpipe) add_executable(benchmark_pipe benchmark_pipe.cc options.cc transport_registry.cc channel_registry.cc) target_link_libraries(benchmark_pipe PRIVATE tensorpipe tensorpipe_cuda) ================================================ FILE: tensorpipe/benchmark/benchmark_pipe.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include // We might sometimes want to run this benchmark using NCCL instead of // TensorPipe. We don't want to include NCCL as a submodule and deal with the // build issues. So we've prepared the code and left it around, but disabled it. #if USE_NCCL #include #define TP_NCCL_CHECK(op) \ { \ ncclResult_t res = (op); \ TP_THROW_ASSERT_IF(res != ncclSuccess); \ } struct NcclCommDeleter { void operator()(ncclComm_t comm) { TP_NCCL_CHECK(ncclCommDestroy(comm)); } }; using NcclComm = std::unique_ptr, NcclCommDeleter>; static NcclComm createNcclComm(int rank, int worldSize, ncclUniqueId uniqueId) { ncclComm_t comm; TP_NCCL_CHECK(ncclCommInitRank(&comm, worldSize, uniqueId, rank)); return NcclComm(comm, NcclCommDeleter{}); } #endif // USE_NCCL using namespace tensorpipe; using namespace tensorpipe::benchmark; static constexpr int kNumWarmUpRounds = 5; using Payload = std::unique_ptr; using CpuTensor = std::unique_ptr; struct CudaMemoryDeleter { void operator()(void* ptr) { TP_CUDA_CHECK(cudaFree(ptr)); } }; struct CudaStreamDeleter { void operator()(cudaStream_t stream) { TP_CUDA_CHECK(cudaStreamDestroy(stream)); } }; using CudaTensor = std::unique_ptr; using CudaStream = std::unique_ptr, CudaStreamDeleter>; struct Data { size_t numPayloads; size_t payloadSize; std::vector expectedPayload; std::vector expectedPayloadMetadata; std::vector temporaryPayload; size_t numTensors; size_t tensorSize; TensorType tensorType; std::vector expectedCpuTensor; std::vector expectedCudaTensor; std::vector expectedTensorMetadata; std::vector temporaryCpuTensor; std::vector temporaryCudaTensor; CudaStream cudaStream; size_t cudaSyncPeriod; std::string expectedMetadata; #if USE_NCCL NcclComm ncclComm; #endif // USE_NCCL }; struct MultiDeviceMeasurements { // The CPU time to do each ping-pong. Measurements cpu; // The CPU time of N iterations, including a final CUDA stream sync. 
Measurements cuda; }; static void printMeasurements(Measurements& measurements, size_t dataLen) { measurements.sort(); fprintf( stderr, "%-15s %-15s %-12s %-7s %-7s %-7s %-7s\n", "chunk-size", "# ping-pong", "avg (usec)", "p50", "p75", "p90", "p95"); fprintf( stderr, "%-15lu %-15lu %-12.3f %-7.3f %-7.3f %-7.3f %-7.3f\n", dataLen, measurements.size(), measurements.sum().count() / (float)measurements.size() / 1000.0, measurements.percentile(0.50).count() / 1000.0, measurements.percentile(0.75).count() / 1000.0, measurements.percentile(0.90).count() / 1000.0, measurements.percentile(0.95).count() / 1000.0); } static void printMultiDeviceMeasurements( MultiDeviceMeasurements& measurements, size_t dataLen) { printMeasurements(measurements.cpu, dataLen); printMeasurements(measurements.cuda, dataLen); } static std::unique_ptr createEmptyCpuData(size_t size) { return std::make_unique(size); } static std::unique_ptr createFullCpuData(size_t size) { std::unique_ptr data = createEmptyCpuData(size); // Generate fixed data for validation between peers for (size_t i = 0; i < size; i++) { data[i] = (i >> 8) ^ (i & 0xff); } return data; } static CudaTensor createEmptyCudaData(size_t size) { uint8_t* ptr; TP_CUDA_CHECK(cudaMalloc(&ptr, size)); return CudaTensor(ptr); } static CudaTensor createFullCudaData(size_t size) { uint8_t* ptr; TP_CUDA_CHECK(cudaMalloc(&ptr, size)); CpuTensor data = createFullCpuData(size); TP_CUDA_CHECK(cudaMemcpy(ptr, data.get(), size, cudaMemcpyHostToDevice)); return CudaTensor(ptr); } static CudaStream createCudaStream() { cudaStream_t stream; TP_CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); return CudaStream(stream); } static void serverPongPingNonBlock( std::shared_ptr pipe, int& numWarmUps, int& numRoundTrips, std::promise& doneProm, Data& data, Measurements& measurements) { #if USE_NCCL for (int iterIdx = 0; iterIdx < numWarmUps + numRoundTrips; iterIdx++) { // TODO Handle multiple tensors. 
TP_NCCL_CHECK(ncclRecv( data.temporaryCudaTensor[0].get(), data.tensorSize, ncclInt8, 1, data.ncclComm.get(), data.cudaStream.get())); TP_NCCL_CHECK(ncclSend( data.temporaryCudaTensor[0].get(), data.tensorSize, ncclInt8, 1, data.ncclComm.get(), data.cudaStream.get())); } doneProm.set_value(); return; #endif // USE_NCCL pipe->readDescriptor( [pipe, &numWarmUps, &numRoundTrips, &doneProm, &data, &measurements]( const Error& error, Descriptor descriptor) { TP_THROW_ASSERT_IF(error) << error.what(); Allocation allocation; TP_DCHECK_EQ(descriptor.metadata, data.expectedMetadata); if (data.payloadSize > 0) { TP_DCHECK_EQ(descriptor.payloads.size(), data.numPayloads); allocation.payloads.resize(data.numPayloads); for (size_t payloadIdx = 0; payloadIdx < data.numPayloads; payloadIdx++) { TP_DCHECK_EQ( descriptor.payloads[payloadIdx].metadata, data.expectedPayloadMetadata[payloadIdx]); TP_DCHECK_EQ( descriptor.payloads[payloadIdx].length, data.payloadSize); allocation.payloads[payloadIdx].data = data.temporaryPayload[payloadIdx].get(); } } else { TP_DCHECK_EQ(descriptor.payloads.size(), 0); } if (data.tensorSize > 0) { TP_DCHECK_EQ(descriptor.tensors.size(), data.numTensors); allocation.tensors.resize(data.numTensors); for (size_t tensorIdx = 0; tensorIdx < data.numTensors; tensorIdx++) { TP_DCHECK_EQ( descriptor.tensors[tensorIdx].metadata, data.expectedTensorMetadata[tensorIdx]); TP_DCHECK_EQ(descriptor.tensors[tensorIdx].length, data.tensorSize); if (data.tensorType == TensorType::kCpu) { allocation.tensors[tensorIdx].buffer = CpuBuffer{ .ptr = data.temporaryCpuTensor[tensorIdx].get(), }; } else if (data.tensorType == TensorType::kCuda) { allocation.tensors[tensorIdx].buffer = CudaBuffer{ .ptr = data.temporaryCudaTensor[tensorIdx].get(), .stream = data.cudaStream.get(), }; } else { TP_THROW_ASSERT() << "Unknown tensor type"; } } } else { TP_DCHECK_EQ(descriptor.tensors.size(), 0); } pipe->read( allocation, [pipe, &numWarmUps, &numRoundTrips, &doneProm, &data, 
&measurements, descriptor{std::move(descriptor)}, allocation](const Error& error) { TP_THROW_ASSERT_IF(error) << error.what(); Message message; if (data.payloadSize > 0) { TP_DCHECK_EQ(allocation.payloads.size(), data.numPayloads); message.payloads.resize(data.numPayloads); for (size_t payloadIdx = 0; payloadIdx < data.numPayloads; payloadIdx++) { TP_DCHECK_EQ( descriptor.payloads[payloadIdx].length, data.payloadSize); TP_DCHECK_EQ( memcmp( allocation.payloads[payloadIdx].data, data.expectedPayload[payloadIdx].get(), descriptor.payloads[payloadIdx].length), 0); message.payloads[payloadIdx] = { .data = data.expectedPayload[payloadIdx].get(), .length = descriptor.payloads[payloadIdx].length, }; } } else { TP_DCHECK_EQ(allocation.payloads.size(), 0); } if (data.tensorSize > 0) { TP_DCHECK_EQ(allocation.tensors.size(), data.numTensors); message.tensors.resize(data.numTensors); for (size_t tensorIdx = 0; tensorIdx < data.numTensors; tensorIdx++) { TP_DCHECK_EQ( descriptor.tensors[tensorIdx].length, data.tensorSize); if (data.tensorType == TensorType::kCpu) { TP_DCHECK_EQ( memcmp( allocation.tensors[tensorIdx] .buffer.unwrap() .ptr, data.expectedCpuTensor[tensorIdx].get(), descriptor.tensors[tensorIdx].length), 0); } else if (data.tensorType == TensorType::kCuda) { // No (easy) way to do a memcmp with CUDA, I believe... 
} else { TP_THROW_ASSERT() << "Unknown tensor type"; } message.tensors[tensorIdx] = { .buffer = allocation.tensors[tensorIdx].buffer, .length = descriptor.tensors[tensorIdx].length, .targetDevice = descriptor.tensors[tensorIdx].sourceDevice, }; } } else { TP_DCHECK_EQ(allocation.tensors.size(), 0); } pipe->write( std::move(message), [pipe, &numWarmUps, &numRoundTrips, &doneProm, &data, &measurements](const Error& error) { TP_THROW_ASSERT_IF(error) << error.what(); if (numWarmUps > 0) { numWarmUps -= 1; } else { numRoundTrips -= 1; } if (numRoundTrips > 0) { serverPongPingNonBlock( pipe, numWarmUps, numRoundTrips, doneProm, data, measurements); } else { doneProm.set_value(); } }); }); }); } // Start with receiving ping static void runServer(const Options& options) { std::string addr = options.address; int numWarmUps = kNumWarmUpRounds; int numRoundTrips = options.numRoundTrips; Data data; data.numPayloads = options.numPayloads; data.payloadSize = options.payloadSize; for (size_t payloadIdx = 0; payloadIdx < options.numPayloads; payloadIdx++) { data.expectedPayload.push_back(createFullCpuData(options.payloadSize)); data.expectedPayloadMetadata.push_back( std::string(options.metadataSize, 0x42)); data.temporaryPayload.push_back(createEmptyCpuData(options.payloadSize)); } data.numTensors = options.numTensors; data.tensorSize = options.tensorSize; data.tensorType = options.tensorType; for (size_t tensorIdx = 0; tensorIdx < options.numTensors; tensorIdx++) { data.expectedTensorMetadata.push_back( std::string(options.metadataSize, 0x42)); if (options.tensorType == TensorType::kCpu) { data.expectedCpuTensor.push_back(createFullCpuData(options.tensorSize)); data.temporaryCpuTensor.push_back(createEmptyCpuData(options.tensorSize)); } else if (options.tensorType == TensorType::kCuda) { data.expectedCudaTensor.push_back(createFullCudaData(options.tensorSize)); data.temporaryCudaTensor.push_back( createEmptyCudaData(options.tensorSize)); data.cudaStream = createCudaStream(); } 
else { TP_THROW_ASSERT() << "Unknown tensor type"; } } data.cudaSyncPeriod = options.cudaSyncPeriod; data.expectedMetadata = std::string(options.metadataSize, 0x42); Measurements measurements; measurements.reserve(options.numRoundTrips); std::shared_ptr context = std::make_shared(); auto transportContext = TensorpipeTransportRegistry().create(options.transport); validateTransportContext(transportContext); context->registerTransport(0, options.transport, transportContext); auto channelContext = TensorpipeChannelRegistry().create(options.channel); validateChannelContext(channelContext); context->registerChannel(0, options.channel, channelContext); std::promise> pipeProm; std::shared_ptr listener = context->listen({addr}); listener->accept([&](const Error& error, std::shared_ptr pipe) { TP_THROW_ASSERT_IF(error) << error.what(); pipeProm.set_value(std::move(pipe)); }); std::shared_ptr pipe = pipeProm.get_future().get(); #if USE_NCCL std::promise uniqueIdProm; pipe->readDescriptor([&](const Error& error, Descriptor descriptor) { TP_THROW_ASSERT_IF(error) << error.what(); uniqueIdProm.set_value( *reinterpret_cast(descriptor.metadata.c_str())); }); ncclUniqueId uniqueId = uniqueIdProm.get_future().get(); data.ncclComm = createNcclComm(/*rank=*/0, /*worldSize=*/2, uniqueId); #endif std::promise doneProm; serverPongPingNonBlock( std::move(pipe), numWarmUps, numRoundTrips, doneProm, data, measurements); doneProm.get_future().get(); listener.reset(); context->join(); } static void clientPingPongNonBlock( std::shared_ptr pipe, int& numWarmUps, int& numRoundTrips, std::promise& doneProm, Data& data, MultiDeviceMeasurements& measurements) { #if USE_NCCL for (int iterIdx = 0; iterIdx < numWarmUps + numRoundTrips; iterIdx++) { if (iterIdx >= numWarmUps) { measurements.cpu.markStart(); if ((iterIdx - numWarmUps) % data.cudaSyncPeriod == 0) { measurements.cuda.markStart(); } } TP_NCCL_CHECK(ncclSend( data.expectedCudaTensor[0].get(), data.tensorSize, ncclInt8, 0, 
data.ncclComm.get(), data.cudaStream.get())); TP_NCCL_CHECK(ncclRecv( data.temporaryCudaTensor[0].get(), data.tensorSize, ncclInt8, 0, data.ncclComm.get(), data.cudaStream.get())); if (iterIdx >= numWarmUps) { measurements.cpu.markStop(); if ((iterIdx - numWarmUps + 1) % data.cudaSyncPeriod == 0) { TP_CUDA_CHECK(cudaStreamSynchronize(data.cudaStream.get())); measurements.cuda.markStop(data.cudaSyncPeriod); } } } printMultiDeviceMeasurements(measurements, data.payloadSize); doneProm.set_value(); return; #endif // USE_NCCL if (numWarmUps == 0) { measurements.cpu.markStart(); if (numRoundTrips % data.cudaSyncPeriod == 0) { measurements.cuda.markStart(); } } Message message; message.metadata = data.expectedMetadata; if (data.payloadSize > 0) { for (size_t payloadIdx = 0; payloadIdx < data.numPayloads; payloadIdx++) { Message::Payload payload; payload.data = data.expectedPayload[payloadIdx].get(); payload.length = data.payloadSize; message.payloads.push_back(std::move(payload)); } } else { TP_DCHECK_EQ(message.payloads.size(), 0); } if (data.tensorSize > 0) { for (size_t tensorIdx = 0; tensorIdx < data.numTensors; tensorIdx++) { Message::Tensor tensor; tensor.length = data.tensorSize; if (data.tensorType == TensorType::kCpu) { tensor.buffer = CpuBuffer{.ptr = data.expectedCpuTensor[tensorIdx].get()}; tensor.targetDevice = Device(kCpuDeviceType, 0); } else if (data.tensorType == TensorType::kCuda) { tensor.buffer = CudaBuffer{ .ptr = data.expectedCudaTensor[tensorIdx].get(), .stream = data.cudaStream.get(), }; tensor.targetDevice = Device(kCudaDeviceType, 0); } else { TP_THROW_ASSERT() << "Unknown tensor type"; } message.tensors.push_back(std::move(tensor)); } } else { TP_DCHECK_EQ(message.tensors.size(), 0); } pipe->write( std::move(message), [pipe, &numWarmUps, &numRoundTrips, &doneProm, &data, &measurements]( const Error& error) { TP_THROW_ASSERT_IF(error) << error.what(); pipe->readDescriptor([pipe, &numWarmUps, &numRoundTrips, &doneProm, &data, &measurements]( const 
Error& error, Descriptor descriptor) { TP_THROW_ASSERT_IF(error) << error.what(); Allocation allocation; TP_DCHECK_EQ(descriptor.metadata, data.expectedMetadata); if (data.payloadSize > 0) { TP_DCHECK_EQ(descriptor.payloads.size(), data.numPayloads); allocation.payloads.resize(data.numPayloads); for (size_t payloadIdx = 0; payloadIdx < data.numPayloads; payloadIdx++) { TP_DCHECK_EQ( descriptor.payloads[payloadIdx].metadata, data.expectedPayloadMetadata[payloadIdx]); TP_DCHECK_EQ( descriptor.payloads[payloadIdx].length, data.payloadSize); allocation.payloads[payloadIdx].data = data.temporaryPayload[payloadIdx].get(); } } else { TP_DCHECK_EQ(descriptor.payloads.size(), 0); } if (data.tensorSize > 0) { TP_DCHECK_EQ(descriptor.tensors.size(), data.numTensors); allocation.tensors.resize(data.numTensors); for (size_t tensorIdx = 0; tensorIdx < data.numTensors; tensorIdx++) { TP_DCHECK_EQ( descriptor.tensors[tensorIdx].metadata, data.expectedTensorMetadata[tensorIdx]); TP_DCHECK_EQ( descriptor.tensors[tensorIdx].length, data.tensorSize); if (data.tensorType == TensorType::kCpu) { allocation.tensors[tensorIdx].buffer = CpuBuffer{ .ptr = data.temporaryCpuTensor[tensorIdx].get(), }; } else if (data.tensorType == TensorType::kCuda) { allocation.tensors[tensorIdx].buffer = CudaBuffer{ .ptr = data.temporaryCudaTensor[tensorIdx].get(), .stream = data.cudaStream.get(), }; } else { TP_THROW_ASSERT() << "Unknown tensor type"; } } } else { TP_DCHECK_EQ(descriptor.tensors.size(), 0); } pipe->read( allocation, [pipe, &numWarmUps, &numRoundTrips, &doneProm, &data, &measurements, descriptor{std::move(descriptor)}, allocation](const Error& error) { if (numWarmUps == 0) { measurements.cpu.markStop(); if ((numRoundTrips - 1) % data.cudaSyncPeriod == 0) { TP_CUDA_CHECK(cudaStreamSynchronize(data.cudaStream.get())); measurements.cuda.markStop(data.cudaSyncPeriod); } } TP_THROW_ASSERT_IF(error) << error.what(); if (data.payloadSize > 0) { TP_DCHECK_EQ(allocation.payloads.size(), 
data.numPayloads); for (size_t payloadIdx = 0; payloadIdx < data.numPayloads; payloadIdx++) { TP_DCHECK_EQ( memcmp( allocation.payloads[payloadIdx].data, data.expectedPayload[payloadIdx].get(), descriptor.payloads[payloadIdx].length), 0); } } else { TP_DCHECK_EQ(allocation.payloads.size(), 0); } if (data.tensorSize > 0) { TP_DCHECK_EQ(allocation.tensors.size(), data.numTensors); for (size_t tensorIdx = 0; tensorIdx < data.numTensors; tensorIdx++) { if (data.tensorType == TensorType::kCpu) { TP_DCHECK_EQ( memcmp( allocation.tensors[tensorIdx] .buffer.unwrap() .ptr, data.expectedCpuTensor[tensorIdx].get(), descriptor.tensors[tensorIdx].length), 0); } else if (data.tensorType == TensorType::kCuda) { // No (easy) way to do a memcmp with CUDA, I // believe... } else { TP_THROW_ASSERT() << "Unknown tensor type"; } } } else { TP_DCHECK_EQ(allocation.tensors.size(), 0); } if (numWarmUps > 0) { numWarmUps -= 1; } else { numRoundTrips -= 1; } if (numRoundTrips > 0) { clientPingPongNonBlock( pipe, numWarmUps, numRoundTrips, doneProm, data, measurements); } else { printMultiDeviceMeasurements(measurements, data.payloadSize); doneProm.set_value(); } }); }); }); } // Start with sending ping static void runClient(const Options& options) { std::string addr = options.address; int numWarmUps = kNumWarmUpRounds; int numRoundTrips = options.numRoundTrips; Data data; data.numPayloads = options.numPayloads; data.payloadSize = options.payloadSize; for (size_t payloadIdx = 0; payloadIdx < options.numPayloads; payloadIdx++) { data.expectedPayload.push_back(createFullCpuData(options.payloadSize)); data.expectedPayloadMetadata.push_back( std::string(options.metadataSize, 0x42)); data.temporaryPayload.push_back(createEmptyCpuData(options.payloadSize)); } data.numTensors = options.numTensors; data.tensorSize = options.tensorSize; data.tensorType = options.tensorType; for (size_t tensorIdx = 0; tensorIdx < options.numTensors; tensorIdx++) { data.expectedTensorMetadata.push_back( 
std::string(options.metadataSize, 0x42)); if (data.tensorType == TensorType::kCpu) { data.expectedCpuTensor.push_back(createFullCpuData(options.tensorSize)); data.temporaryCpuTensor.push_back(createEmptyCpuData(options.tensorSize)); } else if (data.tensorType == TensorType::kCuda) { data.expectedCudaTensor.push_back(createFullCudaData(options.tensorSize)); data.temporaryCudaTensor.push_back( createEmptyCudaData(options.tensorSize)); data.cudaStream = createCudaStream(); } else { TP_THROW_ASSERT() << "Unknown tensor type"; } } data.cudaSyncPeriod = options.cudaSyncPeriod; data.expectedMetadata = std::string(options.metadataSize, 0x42); MultiDeviceMeasurements measurements; measurements.cpu.reserve(options.numRoundTrips); measurements.cuda.reserve(options.numRoundTrips / data.cudaSyncPeriod); std::shared_ptr context = std::make_shared(); auto transportContext = TensorpipeTransportRegistry().create(options.transport); validateTransportContext(transportContext); context->registerTransport(0, options.transport, transportContext); auto channelContext = TensorpipeChannelRegistry().create(options.channel); validateChannelContext(channelContext); context->registerChannel(0, options.channel, channelContext); std::shared_ptr pipe = context->connect(addr); #if USE_NCCL ncclUniqueId uniqueId; TP_NCCL_CHECK(ncclGetUniqueId(&uniqueId)); Message message; message.metadata = std::string( reinterpret_cast(&uniqueId), reinterpret_cast(&uniqueId) + sizeof(ncclUniqueId)); std::promise promise; pipe->write(std::move(message), [&](const Error& error) { TP_THROW_ASSERT_IF(error) << error.what(); promise.set_value(); }); promise.get_future().get(); data.ncclComm = createNcclComm(/*rank=*/1, /*worldSize=*/2, uniqueId); #endif // USE_NCCL std::promise doneProm; clientPingPongNonBlock( std::move(pipe), numWarmUps, numRoundTrips, doneProm, data, measurements); doneProm.get_future().get(); context->join(); } int main(int argc, char** argv) { struct Options x = parseOptions(argc, argv); std::cout 
<< "mode = " << x.mode << "\n"; std::cout << "transport = " << x.transport << "\n"; std::cout << "channel = " << x.channel << "\n"; std::cout << "address = " << x.address << "\n"; std::cout << "num_round_trips = " << x.numRoundTrips << "\n"; std::cout << "num_payloads = " << x.numPayloads << "\n"; std::cout << "payload_size = " << x.payloadSize << "\n"; std::cout << "num_tensors = " << x.numTensors << "\n"; std::cout << "tensor_size = " << x.tensorSize << "\n"; std::cout << "tensor_type = " << (x.tensorType == TensorType::kCpu ? "cpu" : "cuda") << "\n"; std::cout << "metadata_size = " << x.metadataSize << "\n"; if (x.mode == "listen") { runServer(x); } else if (x.mode == "connect") { runClient(x); } else { // Should never be here TP_THROW_ASSERT() << "unknown mode: " << x.mode; } return 0; } ================================================ FILE: tensorpipe/benchmark/benchmark_transport.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #include #include #include #include #include using namespace tensorpipe; using namespace tensorpipe::benchmark; using namespace tensorpipe::transport; struct Data { std::unique_ptr expected; std::unique_ptr temporary; size_t size; }; static void printMeasurements(Measurements& measurements, size_t dataLen) { measurements.sort(); fprintf( stderr, "%-15s %-15s %-12s %-7s %-7s %-7s %-7s\n", "chunk-size", "# ping-pong", "avg (usec)", "p50", "p75", "p90", "p95"); fprintf( stderr, "%-15lu %-15lu %-12.3f %-7.3f %-7.3f %-7.3f %-7.3f\n", dataLen, measurements.size(), measurements.sum().count() / (float)measurements.size() / 1000.0, measurements.percentile(0.50).count() / 1000.0, measurements.percentile(0.75).count() / 1000.0, measurements.percentile(0.90).count() / 1000.0, measurements.percentile(0.95).count() / 1000.0); } static std::unique_ptr createData(const int size) { auto data = std::make_unique(size); // Generate fixed data for validation between peers for (int i = 0; i < size; i++) { data[i] = (i >> 8) ^ (i & 0xff); } return data; } static void serverPongPingNonBlock( std::shared_ptr conn, int& numRoundTrips, std::promise& doneProm, Data& data, Measurements& measurements) { conn->read( data.temporary.get(), data.size, [conn, &numRoundTrips, &doneProm, &data, &measurements]( const Error& error, const void* ptr, size_t len) { TP_THROW_ASSERT_IF(error) << error.what(); TP_DCHECK_EQ(len, data.size); TP_DCHECK_EQ(memcmp(ptr, data.expected.get(), len), 0); conn->write( data.temporary.get(), data.size, [conn, &numRoundTrips, &doneProm, &data, &measurements]( const Error& error) { TP_THROW_ASSERT_IF(error) << error.what(); if (--numRoundTrips > 0) { serverPongPingNonBlock( conn, numRoundTrips, doneProm, data, measurements); } else { doneProm.set_value(); } }); }); } // Start with receiving ping static void runServer(const Options& options) { std::string addr = options.address; int numRoundTrips = options.numRoundTrips; Data data = { 
createData(options.payloadSize), std::make_unique(options.payloadSize), options.payloadSize}; Measurements measurements; measurements.reserve(options.numRoundTrips); std::shared_ptr context; context = TensorpipeTransportRegistry().create(options.transport); validateTransportContext(context); std::promise> connProm; std::shared_ptr listener = context->listen(addr); listener->accept([&](const Error& error, std::shared_ptr conn) { TP_THROW_ASSERT_IF(error) << error.what(); connProm.set_value(std::move(conn)); }); std::shared_ptr conn = connProm.get_future().get(); std::promise doneProm; serverPongPingNonBlock( std::move(conn), numRoundTrips, doneProm, data, measurements); doneProm.get_future().get(); context->join(); } static void clientPingPongNonBlock( std::shared_ptr conn, int& numRoundTrips, std::promise& doneProm, Data& data, Measurements& measurements) { measurements.markStart(); conn->write( data.expected.get(), data.size, [conn, &numRoundTrips, &doneProm, &data, &measurements]( const Error& error) { TP_THROW_ASSERT_IF(error) << error.what(); conn->read( data.temporary.get(), data.size, [conn, &numRoundTrips, &doneProm, &data, &measurements]( const Error& error, const void* ptr, size_t len) { measurements.markStop(); TP_THROW_ASSERT_IF(error) << error.what(); TP_DCHECK_EQ(len, data.size); TP_DCHECK_EQ(memcmp(ptr, data.expected.get(), len), 0); if (--numRoundTrips > 0) { clientPingPongNonBlock( conn, numRoundTrips, doneProm, data, measurements); } else { printMeasurements(measurements, data.size); doneProm.set_value(); } }); }); } // Start with sending ping static void runClient(const Options& options) { std::string addr = options.address; int numRoundTrips = options.numRoundTrips; Data data = { createData(options.payloadSize), std::make_unique(options.payloadSize), options.payloadSize}; Measurements measurements; measurements.reserve(options.numRoundTrips); std::shared_ptr context; context = TensorpipeTransportRegistry().create(options.transport); 
validateTransportContext(context); std::shared_ptr conn = context->connect(addr); std::promise doneProm; clientPingPongNonBlock( std::move(conn), numRoundTrips, doneProm, data, measurements); doneProm.get_future().get(); context->join(); } int main(int argc, char** argv) { struct Options x = parseOptions(argc, argv); std::cout << "mode = " << x.mode << "\n"; std::cout << "transport = " << x.transport << "\n"; std::cout << "address = " << x.address << "\n"; std::cout << "num_round_trips = " << x.numRoundTrips << "\n"; std::cout << "payload_size = " << x.payloadSize << "\n"; if (x.mode == "listen") { runServer(x); } else if (x.mode == "connect") { runClient(x); } else { // Should never be here TP_THROW_ASSERT() << "unknown mode: " << x.mode; } return 0; } ================================================ FILE: tensorpipe/benchmark/channel_registry.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include TP_DEFINE_SHARED_REGISTRY( TensorpipeChannelRegistry, tensorpipe::channel::Context); // BASIC std::shared_ptr makeBasicChannel() { return tensorpipe::channel::basic::create(); } TP_REGISTER_CREATOR(TensorpipeChannelRegistry, basic, makeBasicChannel); // CMA #if TENSORPIPE_HAS_CMA_CHANNEL std::shared_ptr makeCmaChannel() { return tensorpipe::channel::cma::create(); } TP_REGISTER_CREATOR(TensorpipeChannelRegistry, cma, makeCmaChannel); #endif // TENSORPIPE_HAS_CMA_CHANNEL // MPT std::shared_ptr makeMptChannel() { throw std::runtime_error("mtp channel requires arguments"); } TP_REGISTER_CREATOR(TensorpipeChannelRegistry, mpt, makeMptChannel); // XTH std::shared_ptr makeXthChannel() { return tensorpipe::channel::xth::create(); } TP_REGISTER_CREATOR(TensorpipeChannelRegistry, xth, makeXthChannel); // CUDA XTH std::shared_ptr makeCudaXthChannel() { return tensorpipe::channel::cuda_xth::create(); } TP_REGISTER_CREATOR(TensorpipeChannelRegistry, cuda_xth, makeCudaXthChannel); // CUDA BASIC std::shared_ptr makeCudaBasicChannel() { return tensorpipe::channel::cuda_basic::create( tensorpipe::channel::basic::create()); } TP_REGISTER_CREATOR( TensorpipeChannelRegistry, cuda_basic, makeCudaBasicChannel); // CUDA IPC #if TENSORPIPE_HAS_CUDA_IPC_CHANNEL std::shared_ptr makeCudaIpcChannel() { return tensorpipe::channel::cuda_ipc::create(); } TP_REGISTER_CREATOR(TensorpipeChannelRegistry, cuda_ipc, makeCudaIpcChannel); #endif // TENSORPIPE_HAS_CUDA_IPC_CHANNEL // CUDA GDR #if TENSORPIPE_HAS_CUDA_GDR_CHANNEL std::shared_ptr makeCudaGdrChannel() { return tensorpipe::channel::cuda_gdr::create(); } TP_REGISTER_CREATOR(TensorpipeChannelRegistry, cuda_gdr, makeCudaGdrChannel); #endif // TENSORPIPE_HAS_CUDA_GDR_CHANNEL void validateChannelContext( std::shared_ptr context) { if (!context) { auto keys = TensorpipeChannelRegistry().keys(); std::cout << "The channel you passed in is not supported. 
The following channels are valid: "; for (const auto& key : keys) { std::cout << key << ", "; } std::cout << "\n"; exit(EXIT_FAILURE); } } ================================================ FILE: tensorpipe/benchmark/channel_registry.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include TP_DECLARE_SHARED_REGISTRY( TensorpipeChannelRegistry, tensorpipe::channel::Context); void validateChannelContext( std::shared_ptr context); ================================================ FILE: tensorpipe/benchmark/measurements.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include #include namespace tensorpipe { namespace benchmark { class Measurements { using clock = std::chrono::high_resolution_clock; using nanoseconds = std::chrono::nanoseconds; public: void markStart() { start_ = clock::now(); } void markStop(size_t count = 1) { samples_.push_back((clock::now() - start_) / count); } void sort() { std::sort(samples_.begin(), samples_.end()); } void reserve(size_t capacity) { samples_.reserve(capacity); } size_t size() const { return samples_.size(); } nanoseconds sum() const { nanoseconds sum{0}; for (const auto& sample : samples_) { sum += sample; } return sum; } nanoseconds percentile(float f) const { return samples_[static_cast(f * samples_.size())]; } private: clock::time_point start_; std::vector samples_; }; } // namespace benchmark } // namespace tensorpipe ================================================ FILE: tensorpipe/benchmark/options.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #include namespace tensorpipe { namespace benchmark { static void usage(int status, const char* argv0) { if (status != EXIT_SUCCESS) { fprintf(stderr, "`%s --help' for more information.\n", argv0); exit(status); } fprintf(stderr, "Usage: %s [OPTIONS]\n", argv0); #define X(x) fputs(x "\n", stderr); X(""); X("--mode=MODE Running mode [listen|connect]"); X("--transport=TRANSPORT Transport backend [shm|uv]"); X("--channel=CHANNEL Channel backend [basic]"); X("--address=ADDRESS Address to listen or connect to"); X("--num-round-trips=NUM Number of write/read pairs to perform"); X("--num-payloads=NUM [optional] Number of payloads of each write/read pair"); X("--payload-size=SIZE [optional] Size of payload of each write/read pair"); X("--num-tensors=NUM [optional] Number of tensors of each write/read pair"); X("--tensor-size=SIZE [optional] Size of tensor of each write/read pair"); X("--tensor-type=TYPE [optional] Type of tensor (cpu or cuda)"); X("--metadata-size=SIZE [optional] Size of metadata of each write/read pair"); X("--cuda-sync-period=NUM [optiona] Number of round-trips between two stream syncs"); exit(status); } static void validateOptions(Options options, const char* argv0) { int status = EXIT_SUCCESS; if (options.mode.empty()) { fprintf(stderr, "Missing argument: --mode must be set\n"); status = EXIT_FAILURE; } if (options.transport.empty()) { fprintf(stderr, "Missing argument: --transport must be set\n"); status = EXIT_FAILURE; } if (options.address.empty()) { fprintf(stderr, "Missing argument: --address must be set\n"); status = EXIT_FAILURE; } if (options.numRoundTrips <= 0) { fprintf(stderr, "Missing argument: --num-round-trips must be set\n"); status = EXIT_FAILURE; } if (status != EXIT_SUCCESS) { usage(status, argv0); } } struct Options parseOptions(int argc, char** argv) { struct Options options; int opt; int flag = -1; enum Flags : int { MODE, TRANSPORT, CHANNEL, ADDRESS, NUM_ROUND_TRIPS, NUM_PAYLOADS, PAYLOAD_SIZE, 
NUM_TENSORS, TENSOR_SIZE, TENSOR_TYPE, METADATA_SIZE, CUDA_SYNC_PERIOD, HELP, }; static struct option longOptions[] = { {"mode", required_argument, &flag, MODE}, {"transport", required_argument, &flag, TRANSPORT}, {"channel", required_argument, &flag, CHANNEL}, {"address", required_argument, &flag, ADDRESS}, {"num-round-trips", required_argument, &flag, NUM_ROUND_TRIPS}, {"num-payloads", required_argument, &flag, NUM_PAYLOADS}, {"payload-size", required_argument, &flag, PAYLOAD_SIZE}, {"num-tensors", required_argument, &flag, NUM_TENSORS}, {"tensor-size", required_argument, &flag, TENSOR_SIZE}, {"tensor-type", required_argument, &flag, TENSOR_TYPE}, {"metadata-size", required_argument, &flag, METADATA_SIZE}, {"cuda-sync-period", required_argument, &flag, CUDA_SYNC_PERIOD}, {"help", no_argument, &flag, HELP}, {nullptr, 0, nullptr, 0}}; while (1) { opt = getopt_long(argc, argv, "", longOptions, nullptr); if (opt == -1) { break; } if (opt != 0) { usage(EXIT_FAILURE, argv[0]); break; } switch (flag) { case MODE: options.mode = std::string(optarg); if (options.mode != "listen" && options.mode != "connect") { fprintf(stderr, "Error:\n"); fprintf(stderr, " --mode must be [listen|connect]\n"); exit(EXIT_FAILURE); } break; case TRANSPORT: options.transport = std::string(optarg); break; case CHANNEL: options.channel = std::string(optarg); break; case ADDRESS: options.address = std::string(optarg); break; case NUM_ROUND_TRIPS: options.numRoundTrips = std::strtol(optarg, nullptr, 10); break; case NUM_PAYLOADS: options.numPayloads = std::strtoull(optarg, nullptr, 10); break; case PAYLOAD_SIZE: options.payloadSize = std::strtoull(optarg, nullptr, 10); break; case NUM_TENSORS: options.numTensors = std::strtoull(optarg, nullptr, 10); break; case TENSOR_SIZE: options.tensorSize = std::strtoull(optarg, nullptr, 10); break; case TENSOR_TYPE: if (std::string(optarg) == "cpu") { options.tensorType = TensorType::kCpu; } else if (std::string(optarg) == "cuda") { options.tensorType = 
TensorType::kCuda; } else { fprintf(stderr, "Error:\n"); fprintf(stderr, " --tensor-type must be [cpu|cuda]\n"); exit(EXIT_FAILURE); } break; case METADATA_SIZE: options.metadataSize = std::strtoull(optarg, nullptr, 10); break; case CUDA_SYNC_PERIOD: options.cudaSyncPeriod = std::strtoull(optarg, nullptr, 10); break; case HELP: usage(EXIT_SUCCESS, argv[0]); break; default: usage(EXIT_FAILURE, argv[0]); break; } } validateOptions(options, argv[0]); return options; } } // namespace benchmark } // namespace tensorpipe ================================================ FILE: tensorpipe/benchmark/options.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include namespace tensorpipe { namespace benchmark { enum class TensorType { kCpu, kCuda, }; struct Options { std::string mode; // server or client std::string transport; // shm or uv std::string channel; // basic std::string address; // address for listen or connect int numRoundTrips{0}; // number of write/read pairs size_t numPayloads{0}; size_t payloadSize{0}; size_t numTensors{0}; size_t tensorSize{0}; TensorType tensorType{TensorType::kCpu}; size_t metadataSize{0}; size_t cudaSyncPeriod{1}; }; struct Options parseOptions(int argc, char** argv); } // namespace benchmark } // namespace tensorpipe ================================================ FILE: tensorpipe/benchmark/registry.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ // NB: This Registry works poorly when you have other namespaces. 
/** * Simple registry implementation that uses static variables to * register object creators during program initialization time. This registry * implementation is largely borrowed from the PyTorch registry utility in file * pytorch/c10/util/Registry.h. */ #pragma once #include #include #include #include #include #include #include #include #include namespace tensorpipe { /** * @brief A template class that allows one to register classes by keys. * * The keys are usually a std::string specifying the name, but can be anything * that can be used in a std::map. * * You should most likely not use the Registry class explicitly, but use the * helper macros below to declare specific registries as well as registering * objects. */ template class Registry { public: typedef std::function Creator; Registry() : registry_() {} // Adds a key and its associated creator to the desired registry. If the key // already exists in the registry, we simply replace the old creator // with the new args for the key. void registerCreator(std::string key, Creator creator) { registry_[key] = creator; } // Allows you to register and key/Creator pair and provide a help_messge for // the key as well. void registerCreator( std::string key, Creator creator, const std::string& helpMsg) { registerCreator(key, creator); helpMessage_[key] = helpMsg; } // Returns whether a particular key exists in the given registry. inline bool has(std::string key) { return (registry_.count(key) != 0); } // Given the key, create() invokes the creator with the provided args and // returns the object that the creator function constructs. ObjectPtrType create(std::string key, Args... args) { if (registry_.count(key) == 0) { // Returns nullptr if the key is not registered. return nullptr; } return registry_[key](args...); } // Returns the registered keys as a std::vector. 
std::vector keys() const { std::vector keys; for (const auto& it : registry_) { keys.push_back(it.first); } return keys; } // Returns the help_message for the key if one is provided. inline const std::unordered_map& helpMessage() const { return helpMessage_; } const char* helpMessage(std::string key) const { auto it = helpMessage_.find(key); if (it == helpMessage_.end()) { return nullptr; } return it->second.c_str(); } private: std::unordered_map registry_; std::unordered_map helpMessage_; }; // Registerer is a class template that simplifies Register-ing keys for a given // registry. template class Registerer { public: explicit Registerer( std::string key, Registry& registry, typename Registry::Creator creator, const std::string& helpMsg = "") { registry.registerCreator(key, creator, helpMsg); } }; // The following macros should be used to create/add to registries. Avoid // invoking the Registry class template functions directly. #define TP_CONCATENATE_IMPL(s1, s2) s1##s2 #define TP_CONCATENATE(s1, s2) TP_CONCATENATE_IMPL(s1, s2) #define TP_ANONYMOUS_VARIABLE(str) TP_CONCATENATE(str, __LINE__) // Using the construct on first use idiom to avoid static order initialization // issue. Refer to this link for reference: // https://isocpp.org/wiki/faq/ctors#static-init-order-on-first-use #define TP_DEFINE_TYPED_REGISTRY(RegistryName, ObjectType, PtrType, ...) \ tensorpipe::Registry, ##__VA_ARGS__>& RegistryName() { \ static tensorpipe::Registry, ##__VA_ARGS__>* \ registry = \ new tensorpipe::Registry, ##__VA_ARGS__>(); \ return *registry; \ } #define TP_DECLARE_TYPED_REGISTRY(RegistryName, ObjectType, PtrType, ...) \ tensorpipe::Registry, ##__VA_ARGS__>& RegistryName(); \ typedef tensorpipe::Registerer, ##__VA_ARGS__> \ Registerer##RegistryName #define TP_DEFINE_SHARED_REGISTRY(RegistryName, ObjectType, ...) \ TP_DEFINE_TYPED_REGISTRY( \ RegistryName, ObjectType, std::shared_ptr, ##__VA_ARGS__) #define TP_DECLARE_SHARED_REGISTRY(RegistryName, ObjectType, ...) 
\ TP_DECLARE_TYPED_REGISTRY( \ RegistryName, ObjectType, std::shared_ptr, ##__VA_ARGS__) #define TP_REGISTER_TYPED_CREATOR(RegistryName, key, ...) \ static Registerer##RegistryName TP_ANONYMOUS_VARIABLE(g_##RegistryName)( \ key, RegistryName(), ##__VA_ARGS__); #define TP_REGISTER_CREATOR(RegistryName, key, ...) \ TP_REGISTER_TYPED_CREATOR(RegistryName, #key, __VA_ARGS__) } // namespace tensorpipe ================================================ FILE: tensorpipe/benchmark/transport_registry.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include TP_DEFINE_SHARED_REGISTRY( TensorpipeTransportRegistry, tensorpipe::transport::Context); // IBV #if TENSORPIPE_HAS_IBV_TRANSPORT std::shared_ptr makeIbvContext() { return tensorpipe::transport::ibv::create(); } TP_REGISTER_CREATOR(TensorpipeTransportRegistry, ibv, makeIbvContext); #endif // TENSORPIPE_HAS_IBV_TRANSPORT // SHM #if TENSORPIPE_HAS_SHM_TRANSPORT std::shared_ptr makeShmContext() { return tensorpipe::transport::shm::create(); } TP_REGISTER_CREATOR(TensorpipeTransportRegistry, shm, makeShmContext); #endif // TENSORPIPE_HAS_SHM_TRANSPORT // UV std::shared_ptr makeUvContext() { return tensorpipe::transport::uv::create(); } TP_REGISTER_CREATOR(TensorpipeTransportRegistry, uv, makeUvContext); void validateTransportContext( std::shared_ptr context) { if (!context) { auto keys = TensorpipeTransportRegistry().keys(); std::cout << "The transport you passed in is not supported. The following transports are valid: "; for (const auto& key : keys) { std::cout << key << ", "; } std::cout << "\n"; exit(EXIT_FAILURE); } } ================================================ FILE: tensorpipe/benchmark/transport_registry.h ================================================ /* * Copyright (c) Meta Platforms, Inc. 
and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include TP_DECLARE_SHARED_REGISTRY( TensorpipeTransportRegistry, tensorpipe::transport::Context); void validateTransportContext( std::shared_ptr context); ================================================ FILE: tensorpipe/channel/basic/channel_impl.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { namespace basic { ChannelImpl::ChannelImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::shared_ptr connection) : ChannelImplBoilerplate( token, std::move(context), std::move(id)), connection_(std::move(connection)) {} void ChannelImpl::initImplFromLoop() { context_->enroll(*this); } void ChannelImpl::sendImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TSendCallback callback) { SendOpIter opIter = sendOps_.emplaceBack(sequenceNumber); SendOperation& op = *opIter; op.ptr = buffer.unwrap().ptr; op.length = length; op.callback = std::move(callback); sendOps_.advanceOperation(opIter); } void ChannelImpl::advanceSendOperation( SendOpIter opIter, SendOperation::State prevOpState) { TP_DCHECK(context_->inLoop()); SendOperation& op = *opIter; sendOps_.attemptTransition( opIter, /*from=*/SendOperation::UNINITIALIZED, /*to=*/SendOperation::FINISHED, /*cond=*/error_ || op.length == 0, /*actions=*/{&ChannelImpl::callSendCallback}); // Needs to go after previous op to ensure predictable and consistent ordering // of write calls on the connection. 
sendOps_.attemptTransition( opIter, /*from=*/SendOperation::UNINITIALIZED, /*to=*/SendOperation::WRITING, /*cond=*/!error_ && prevOpState >= SendOperation::WRITING, /*actions=*/{&ChannelImpl::write}); sendOps_.attemptTransition( opIter, /*from=*/SendOperation::WRITING, /*to=*/SendOperation::FINISHED, /*cond=*/op.doneWriting, /*actions=*/{&ChannelImpl::callSendCallback}); } void ChannelImpl::write(SendOpIter opIter) { SendOperation& op = *opIter; TP_VLOG(6) << "Channel " << id_ << " is writing payload (#" << op.sequenceNumber << ")"; connection_->write( op.ptr, op.length, callbackWrapper_([opIter](ChannelImpl& impl) { TP_VLOG(6) << "Channel " << impl.id_ << " done writing payload (#" << opIter->sequenceNumber << ")"; opIter->doneWriting = true; impl.sendOps_.advanceOperation(opIter); })); } void ChannelImpl::callSendCallback(SendOpIter opIter) { SendOperation& op = *opIter; op.callback(error_); // Reset callback to release the resources it was holding. op.callback = nullptr; } void ChannelImpl::recvImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TRecvCallback callback) { RecvOpIter opIter = recvOps_.emplaceBack(sequenceNumber); RecvOperation& op = *opIter; op.ptr = buffer.unwrap().ptr; op.length = length; op.callback = std::move(callback); recvOps_.advanceOperation(opIter); } void ChannelImpl::advanceRecvOperation( RecvOpIter opIter, RecvOperation::State prevOpState) { TP_DCHECK(context_->inLoop()); RecvOperation& op = *opIter; recvOps_.attemptTransition( opIter, /*from=*/RecvOperation::UNINITIALIZED, /*to=*/RecvOperation::FINISHED, /*cond=*/error_ || op.length == 0, /*actions=*/{&ChannelImpl::callRecvCallback}); // Needs to go after previous op to ensure predictable and consistent ordering // of read calls on the connection. 
recvOps_.attemptTransition( opIter, /*from=*/RecvOperation::UNINITIALIZED, /*to=*/RecvOperation::READING, /*cond=*/!error_ && prevOpState >= RecvOperation::READING, /*actions=*/{&ChannelImpl::read}); recvOps_.attemptTransition( opIter, /*from=*/RecvOperation::READING, /*to=*/RecvOperation::FINISHED, /*cond=*/op.doneReading, /*actions=*/{&ChannelImpl::callRecvCallback}); } void ChannelImpl::read(RecvOpIter opIter) { RecvOperation& op = *opIter; TP_VLOG(6) << "Channel " << id_ << " is reading payload (#" << op.sequenceNumber << ")"; connection_->read( op.ptr, op.length, callbackWrapper_([opIter]( ChannelImpl& impl, const void* /* unused */, size_t /* unused */) { TP_VLOG(6) << "Channel " << impl.id_ << " done reading payload (#" << opIter->sequenceNumber << ")"; opIter->doneReading = true; impl.recvOps_.advanceOperation(opIter); })); } void ChannelImpl::callRecvCallback(RecvOpIter opIter) { RecvOperation& op = *opIter; op.callback(error_); // Reset callback to release the resources it was holding. op.callback = nullptr; } void ChannelImpl::handleErrorImpl() { sendOps_.advanceAllOperations(); recvOps_.advanceAllOperations(); // Close the connection so that all current operations will be aborted. This // will cause their callbacks to be invoked, and only then we'll invoke ours. connection_->close(); context_->unenroll(*this); } } // namespace basic } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/basic/channel_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include #include #include #include namespace tensorpipe { namespace channel { namespace basic { class ContextImpl; struct SendOperation { enum State { UNINITIALIZED, WRITING, FINISHED }; // Fields used by the state machine uint64_t sequenceNumber{0}; State state{UNINITIALIZED}; // Progress flags bool doneWriting{false}; // Arguments at creation const void* ptr; size_t length; TSendCallback callback; }; // State capturing a single recv operation. struct RecvOperation { enum State { UNINITIALIZED, READING, FINISHED }; // Fields used by the state machine uint64_t sequenceNumber{0}; State state{UNINITIALIZED}; // Progress flags bool doneReading{false}; // Arguments at creation void* ptr; size_t length; TRecvCallback callback; }; class ChannelImpl final : public ChannelImplBoilerplate { public: ChannelImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::shared_ptr connection); protected: // Implement the entry points called by ChannelImplBoilerplate. void initImplFromLoop() override; void sendImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TSendCallback callback) override; void recvImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TRecvCallback callback) override; void handleErrorImpl() override; private: const std::shared_ptr connection_; OpsStateMachine sendOps_{ *this, &ChannelImpl::advanceSendOperation}; using SendOpIter = decltype(sendOps_)::Iter; OpsStateMachine recvOps_{ *this, &ChannelImpl::advanceRecvOperation}; using RecvOpIter = decltype(recvOps_)::Iter; // State machines for send and recv ops. void advanceSendOperation( SendOpIter opIter, SendOperation::State prevOpState); void advanceRecvOperation( RecvOpIter opIter, RecvOperation::State prevOpState); // Actions (i.e., methods that begin a state transition). 
// For send operations: void write(SendOpIter opIter); void callSendCallback(SendOpIter opIter); // For recv operations: void read(RecvOpIter opIter); void callRecvCallback(RecvOpIter opIter); }; } // namespace basic } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/basic/context_impl.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include namespace tensorpipe { namespace channel { namespace basic { std::shared_ptr ContextImpl::create() { std::unordered_map deviceDescriptors = { {Device{kCpuDeviceType, 0}, "any"}}; return std::make_shared(std::move(deviceDescriptors)); } ContextImpl::ContextImpl( std::unordered_map deviceDescriptors) : ContextImplBoilerplate( std::move(deviceDescriptors)) {} std::shared_ptr ContextImpl::createChannel( std::vector> connections, Endpoint /* unused */) { TP_DCHECK_EQ(numConnectionsNeeded(), connections.size()); return createChannelInternal(std::move(connections[0])); } void ContextImpl::handleErrorImpl() {} void ContextImpl::joinImpl() {} bool ContextImpl::inLoop() const { return loop_.inLoop(); }; void ContextImpl::deferToLoop(std::function fn) { loop_.deferToLoop(std::move(fn)); }; } // namespace basic } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/basic/context_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include #include #include namespace tensorpipe { namespace channel { namespace basic { class ChannelImpl; class ContextImpl final : public ContextImplBoilerplate { public: static std::shared_ptr create(); explicit ContextImpl( std::unordered_map deviceDescriptors); std::shared_ptr createChannel( std::vector> connections, Endpoint endpoint); // Implement the DeferredExecutor interface. bool inLoop() const override; void deferToLoop(std::function fn) override; protected: // Implement the entry points called by ContextImplBoilerplate. void handleErrorImpl() override; void joinImpl() override; private: OnDemandDeferredExecutor loop_; }; } // namespace basic } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/basic/factory.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include namespace tensorpipe { namespace channel { namespace basic { std::shared_ptr create() { return std::make_shared>(); } } // namespace basic } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/basic/factory.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include namespace tensorpipe { namespace channel { namespace basic { std::shared_ptr create(); } // namespace basic } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/channel.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include // Channels are an out of band mechanism to transfer data between // processes. Examples include a direct address space to address space // memory copy on the same machine, or a GPU-to-GPU memory copy. // // Construction of a channel happens as follows. // // 1) During initialization of a pipe, the connecting peer sends its // list of channel contexts and their device descriptors. The // device descriptor is used to determine whether or not a // channel can be used by a pair of peers. // 2) The listening side of the pipe compares the list it received // its own list to determine the list of channels that should be used // for the peers. // 3) For every channel that should be constructed, the listening // side registers a slot with its low level listener. These slots // uniquely identify inbound connections on this listener (by // sending a word-sized indentifier immediately after connecting) // and can be used to construct new connections. These slots are // sent to the connecting side of the pipe, which then attempts // to establish a new connection for every token. // 4) At this time, we have a new control connection for every // channel that is about to be constructed. Both sides of the // pipe can now create the channel instance using the newly // created connection. Further initialization that needs to // happen is defered to the channel implementation. 
We assume the // channel is usable from the moment it is constructed. // namespace tensorpipe { namespace channel { using TSendCallback = std::function; using TRecvCallback = std::function; // Abstract base class for channel classes. class Channel { public: // Send memory region to peer. virtual void send(Buffer buffer, size_t length, TSendCallback callback) = 0; // Receive memory region from peer. virtual void recv(Buffer buffer, size_t length, TRecvCallback callback) = 0; // Tell the channel what its identifier is. // // This is only supposed to be called from the high-level pipe. It will only // used for logging and debugging purposes. virtual void setId(std::string id) = 0; // Put the channel in a terminal state, aborting pending operations and // rejecting future ones, and release its resources. This may be carried out // asynchronously, in background. virtual void close() = 0; virtual ~Channel() = default; }; } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/channel_boilerplate.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { template class ChannelBoilerplate : public Channel { public: template ChannelBoilerplate( typename ChannelImplBoilerplate::ConstructorToken token, std::shared_ptr context, std::string id, Args&&... args); explicit ChannelBoilerplate(std::shared_ptr channel); ChannelBoilerplate(const ChannelBoilerplate&) = delete; ChannelBoilerplate(ChannelBoilerplate&&) = delete; ChannelBoilerplate& operator=(const ChannelBoilerplate&) = delete; ChannelBoilerplate& operator=(ChannelBoilerplate&&) = delete; // Perform a send operation. 
void send(Buffer buffer, size_t length, TSendCallback callback) override; // Queue a recv operation. void recv(Buffer buffer, size_t length, TRecvCallback callback) override; // Tell the connection what its identifier is. void setId(std::string id) override; // Shut down the connection and its resources. void close() override; ~ChannelBoilerplate() override; protected: // Using a shared_ptr allows us to detach the lifetime of the implementation // from the public object's one and perform the destruction asynchronously. const std::shared_ptr impl_; }; template template ChannelBoilerplate::ChannelBoilerplate( typename ChannelImplBoilerplate::ConstructorToken token, std::shared_ptr context, std::string id, Args&&... args) : impl_(std::make_shared( token, std::move(context), std::move(id), std::forward(args)...)) { static_assert( std::is_base_of, TChan>::value, ""); impl_->init(); } template ChannelBoilerplate::ChannelBoilerplate( std::shared_ptr channel) : impl_(std::move(channel)) { static_assert( std::is_base_of, TChan>::value, ""); } template void ChannelBoilerplate::send( Buffer buffer, size_t length, TSendCallback callback) { if (unlikely(!impl_)) { // FIXME In C++-17 perhaps a global static inline variable would be better? static Error error = TP_CREATE_ERROR(ContextNotViableError); callback(error); return; } impl_->send(buffer, length, std::move(callback)); } template void ChannelBoilerplate::recv( Buffer buffer, size_t length, TRecvCallback callback) { if (unlikely(!impl_)) { // FIXME In C++-17 perhaps a global static inline variable would be better? 
static Error error = TP_CREATE_ERROR(ContextNotViableError); callback(error); return; } impl_->recv(buffer, length, std::move(callback)); } template void ChannelBoilerplate::setId(std::string id) { if (unlikely(!impl_)) { return; } impl_->setId(std::move(id)); } template void ChannelBoilerplate::close() { if (unlikely(!impl_)) { return; } impl_->close(); } template ChannelBoilerplate::~ChannelBoilerplate() { close(); } } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/channel_impl_boilerplate.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { template class ContextImplBoilerplate; template class ChannelImplBoilerplate : public std::enable_shared_from_this { public: class ConstructorToken { public: ConstructorToken(const ConstructorToken&) = default; private: explicit ConstructorToken() {} friend ContextImplBoilerplate; }; ChannelImplBoilerplate( ConstructorToken token, std::shared_ptr context, std::string id); ChannelImplBoilerplate(const ChannelImplBoilerplate&) = delete; ChannelImplBoilerplate(ChannelImplBoilerplate&&) = delete; ChannelImplBoilerplate& operator=(const ChannelImplBoilerplate&) = delete; ChannelImplBoilerplate& operator=(ChannelImplBoilerplate&&) = delete; // Initialize member fields that need `shared_from_this`. void init(); // Perform a send operation. void send(Buffer buffer, size_t length, TSendCallback callback); // Queue a recv operation. void recv(Buffer buffer, size_t length, TRecvCallback callback); // Tell the connection what its identifier is. void setId(std::string id); // Shut down the connection and its resources. 
void close(); virtual ~ChannelImplBoilerplate() = default; protected: virtual void initImplFromLoop() = 0; virtual void sendImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TSendCallback callback) = 0; virtual void recvImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TRecvCallback callback) = 0; virtual void handleErrorImpl() = 0; virtual void setIdImpl() {} void setError(Error error); const std::shared_ptr context_; Error error_{Error::kSuccess}; // An identifier for the connection, composed of the identifier for the // context or listener, combined with an increasing sequence number. It will // only be used for logging and debugging purposes. std::string id_; CallbackWrapper callbackWrapper_{*this, *this->context_}; private: // Initialize member fields that need `shared_from_this`. void initFromLoop(); // Perform a send operation. void sendFromLoop(Buffer buffer, size_t length, TSendCallback callback); // Queue a recv operation. void recvFromLoop(Buffer buffer, size_t length, TRecvCallback callback); void setIdFromLoop(std::string id); // Shut down the connection and its resources. void closeFromLoop(); // Deal with an error. void handleError(); // A sequence number for the calls to send and recv. uint64_t nextTensorBeingSent_{0}; uint64_t nextTensorBeingReceived_{0}; // For some odd reason it seems we need to use a qualified name here... template friend class tensorpipe::CallbackWrapper; // Contexts do sometimes need to call directly into closeFromLoop, in order to // make sure that some of their operations can happen "atomically" on the // connection, without possibly other operations occurring in between (e.g., // an error). 
friend ContextImplBoilerplate; }; template ChannelImplBoilerplate::ChannelImplBoilerplate( ConstructorToken /* unused */, std::shared_ptr context, std::string id) : context_(std::move(context)), id_(std::move(id)) {} template void ChannelImplBoilerplate::init() { context_->deferToLoop( [impl{this->shared_from_this()}]() { impl->initFromLoop(); }); } template void ChannelImplBoilerplate::initFromLoop() { if (context_->closed()) { // Set the error without calling setError because we do not want to invoke // the subclass's handleErrorImpl as it would find itself in a weird state // (since initFromLoop wouldn't have been called). error_ = TP_CREATE_ERROR(ChannelClosedError); TP_VLOG(4) << "Channel " << id_ << " is closing (without initing)"; return; } initImplFromLoop(); } template void ChannelImplBoilerplate::send( Buffer buffer, size_t length, TSendCallback callback) { context_->deferToLoop([impl{this->shared_from_this()}, buffer, length, callback{std::move(callback)}]() mutable { impl->sendFromLoop(buffer, length, std::move(callback)); }); } template void ChannelImplBoilerplate::sendFromLoop( Buffer buffer, size_t length, TSendCallback callback) { TP_DCHECK(context_->inLoop()); const uint64_t sequenceNumber = nextTensorBeingSent_++; TP_VLOG(4) << "Channel " << id_ << " received a send request (#" << sequenceNumber << ")"; callback = [this, sequenceNumber, callback{std::move(callback)}]( const Error& error) { // There is no requirement for the channel to invoke callbacks in order. 
TP_VLOG(4) << "Channel " << id_ << " is calling a send callback (#" << sequenceNumber << ")"; callback(error); TP_VLOG(4) << "Channel " << id_ << " done calling a send callback (#" << sequenceNumber << ")"; }; if (error_) { callback(error_); return; } sendImplFromLoop(sequenceNumber, buffer, length, std::move(callback)); } template void ChannelImplBoilerplate::recv( Buffer buffer, size_t length, TRecvCallback callback) { context_->deferToLoop([impl{this->shared_from_this()}, buffer, length, callback{std::move(callback)}]() mutable { impl->recvFromLoop(buffer, length, std::move(callback)); }); } template void ChannelImplBoilerplate::recvFromLoop( Buffer buffer, size_t length, TRecvCallback callback) { TP_DCHECK(context_->inLoop()); const uint64_t sequenceNumber = nextTensorBeingReceived_++; TP_VLOG(4) << "Channel " << id_ << " received a recv request (#" << sequenceNumber << ")"; callback = [this, sequenceNumber, callback{std::move(callback)}]( const Error& error) { // There is no requirement for the channel to invoke callbacks in order. 
TP_VLOG(4) << "Channel " << id_ << " is calling a recv callback (#" << sequenceNumber << ")"; callback(error); TP_VLOG(4) << "Channel " << id_ << " done calling a recv callback (#" << sequenceNumber << ")"; }; if (error_) { callback(error_); return; } recvImplFromLoop(sequenceNumber, buffer, length, std::move(callback)); } template void ChannelImplBoilerplate::setId(std::string id) { context_->deferToLoop( [impl{this->shared_from_this()}, id{std::move(id)}]() mutable { impl->setIdFromLoop(std::move(id)); }); } template void ChannelImplBoilerplate::setIdFromLoop(std::string id) { TP_DCHECK(context_->inLoop()); TP_VLOG(4) << "Channel " << id_ << " was renamed to " << id; id_ = std::move(id); setIdImpl(); } template void ChannelImplBoilerplate::close() { context_->deferToLoop( [impl{this->shared_from_this()}]() { impl->closeFromLoop(); }); } template void ChannelImplBoilerplate::closeFromLoop() { TP_DCHECK(context_->inLoop()); TP_VLOG(4) << "Channel " << id_ << " is closing"; setError(TP_CREATE_ERROR(ChannelClosedError)); } template void ChannelImplBoilerplate::setError(Error error) { // Don't overwrite an error that's already set. if (error_ || !error) { return; } error_ = std::move(error); handleError(); } template void ChannelImplBoilerplate::handleError() { TP_DCHECK(context_->inLoop()); TP_VLOG(5) << "Channel " << id_ << " is handling error " << error_.what(); handleErrorImpl(); } } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cma/channel_impl.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { namespace cma { namespace { struct Descriptor { uint32_t pid; uint64_t ptr; NOP_STRUCTURE(Descriptor, pid, ptr); }; } // namespace ChannelImpl::ChannelImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::shared_ptr descriptorConnection, std::shared_ptr completionConnection) : ChannelImplBoilerplate( token, std::move(context), std::move(id)), descriptorConnection_(std::move(descriptorConnection)), completionConnection_(std::move(completionConnection)) {} void ChannelImpl::initImplFromLoop() { context_->enroll(*this); } void ChannelImpl::sendImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TSendCallback callback) { SendOpIter opIter = sendOps_.emplaceBack(sequenceNumber); SendOperation& op = *opIter; op.callback = std::move(callback); op.ptr = buffer.unwrap().ptr; op.length = length; sendOps_.advanceOperation(opIter); } void ChannelImpl::advanceSendOperation( SendOpIter opIter, SendOperation::State prevOpState) { TP_DCHECK(context_->inLoop()); SendOperation& op = *opIter; sendOps_.attemptTransition( opIter, /*from=*/SendOperation::UNINITIALIZED, /*to=*/SendOperation::FINISHED, /*cond=*/error_ || op.length == 0, /*actions=*/{&ChannelImpl::callSendCallback}); // Needs to go after previous op to ensure predictable and consistent ordering // of write calls on the descriptor control connection and read calls on the // completion control connection. 
sendOps_.attemptTransition( opIter, /*from=*/SendOperation::UNINITIALIZED, /*to=*/SendOperation::READING_COMPLETION, /*cond=*/!error_ && prevOpState >= SendOperation::READING_COMPLETION, /*actions=*/ {&ChannelImpl::writeDescriptor, &ChannelImpl::readCompletion}); sendOps_.attemptTransition( opIter, /*from=*/SendOperation::READING_COMPLETION, /*to=*/SendOperation::FINISHED, /*cond=*/op.doneReadingCompletion, /*actions=*/{&ChannelImpl::callSendCallback}); } void ChannelImpl::writeDescriptor(SendOpIter opIter) { SendOperation& op = *opIter; auto nopHolder = std::make_shared>(); Descriptor& nopDescriptor = nopHolder->getObject(); // TODO: Store the PID upon channel/context instantiation. nopDescriptor.pid = ::getpid(); nopDescriptor.ptr = reinterpret_cast(op.ptr); TP_VLOG(6) << "Channel " << id_ << " is writing descriptor (#" << op.sequenceNumber << ")"; descriptorConnection_->write( *nopHolder, callbackWrapper_([sequenceNumber{op.sequenceNumber}, nopHolder](ChannelImpl& impl) { TP_VLOG(6) << "Channel " << impl.id_ << " done writing descriptor (#" << sequenceNumber << ")"; })); } void ChannelImpl::readCompletion(SendOpIter opIter) { SendOperation& op = *opIter; TP_VLOG(6) << "Channel " << id_ << " is reading completion (#" << op.sequenceNumber << ")"; completionConnection_->read( nullptr, 0, callbackWrapper_([opIter]( ChannelImpl& impl, const void* /* unused */, size_t /* unused */) { TP_VLOG(6) << "Channel " << impl.id_ << " done reading completion (#" << opIter->sequenceNumber << ")"; opIter->doneReadingCompletion = true; impl.sendOps_.advanceOperation(opIter); })); } void ChannelImpl::callSendCallback(SendOpIter opIter) { SendOperation& op = *opIter; op.callback(error_); // Reset callback to release the resources it was holding. 
op.callback = nullptr; } void ChannelImpl::recvImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TRecvCallback callback) { RecvOpIter opIter = recvOps_.emplaceBack(sequenceNumber); RecvOperation& op = *opIter; op.ptr = buffer.unwrap().ptr; op.length = length; op.callback = std::move(callback); recvOps_.advanceOperation(opIter); } void ChannelImpl::advanceRecvOperation( RecvOpIter opIter, RecvOperation::State prevOpState) { TP_DCHECK(context_->inLoop()); RecvOperation& op = *opIter; recvOps_.attemptTransition( opIter, /*from=*/RecvOperation::UNINITIALIZED, /*to=*/RecvOperation::FINISHED, /*cond=*/error_ || op.length == 0, /*actions=*/{&ChannelImpl::callRecvCallback}); // Needs to go after previous op to ensure predictable and consistent ordering // of read calls on the descriptor control connection. recvOps_.attemptTransition( opIter, /*from=*/RecvOperation::UNINITIALIZED, /*to=*/RecvOperation::READING_DESCRIPTOR, /*cond=*/!error_ && prevOpState >= RecvOperation::READING_DESCRIPTOR, /*actions=*/{&ChannelImpl::readDescriptor}); recvOps_.attemptTransition( opIter, /*from=*/RecvOperation::READING_DESCRIPTOR, /*to=*/RecvOperation::FINISHED, /*cond=*/error_ && op.doneReadingDescriptor, /*actions=*/{&ChannelImpl::callRecvCallback}); recvOps_.attemptTransition( opIter, /*from=*/RecvOperation::READING_DESCRIPTOR, /*to=*/RecvOperation::COPYING, /*cond=*/!error_ && op.doneReadingDescriptor, /*actions=*/{&ChannelImpl::copy}); recvOps_.attemptTransition( opIter, /*from=*/RecvOperation::COPYING, /*to=*/RecvOperation::FINISHED, /*cond=*/error_ && op.doneCopying, /*actions=*/{&ChannelImpl::callRecvCallback}); // Needs to go after previous op to ensure predictable and consistent ordering // of write calls on the completion control connection. 
recvOps_.attemptTransition( opIter, /*from=*/RecvOperation::COPYING, /*to=*/RecvOperation::FINISHED, /*cond=*/!error_ && op.doneCopying && prevOpState >= RecvOperation::FINISHED, /*actions=*/ {&ChannelImpl::callRecvCallback, &ChannelImpl::writeCompletion}); } void ChannelImpl::readDescriptor(RecvOpIter opIter) { RecvOperation& op = *opIter; TP_VLOG(6) << "Channel " << id_ << " is reading descriptor (#" << op.sequenceNumber << ")"; auto nopHolderIn = std::make_shared>(); descriptorConnection_->read( *nopHolderIn, callbackWrapper_([opIter, nopHolderIn](ChannelImpl& impl) { TP_VLOG(6) << "Channel " << impl.id_ << " done reading descriptor (#" << opIter->sequenceNumber << ")"; opIter->doneReadingDescriptor = true; if (!impl.error_) { Descriptor& nopDescriptor = nopHolderIn->getObject(); opIter->remotePid = nopDescriptor.pid; opIter->remotePtr = reinterpret_cast(nopDescriptor.ptr); } impl.recvOps_.advanceOperation(opIter); })); } void ChannelImpl::copy(RecvOpIter opIter) { RecvOperation& op = *opIter; TP_VLOG(6) << "Channel " << id_ << " is copying payload (#" << op.sequenceNumber << ")"; context_->requestCopy( op.remotePid, op.remotePtr, op.ptr, op.length, callbackWrapper_([opIter](ChannelImpl& impl) { TP_VLOG(6) << "Channel " << impl.id_ << " done copying payload (#" << opIter->sequenceNumber << ")"; opIter->doneCopying = true; impl.recvOps_.advanceOperation(opIter); })); } void ChannelImpl::callRecvCallback(RecvOpIter opIter) { RecvOperation& op = *opIter; op.callback(error_); // Reset callback to release the resources it was holding. 
op.callback = nullptr; } void ChannelImpl::writeCompletion(RecvOpIter opIter) { RecvOperation& op = *opIter; TP_VLOG(6) << "Channel " << id_ << " is writing completion (#" << op.sequenceNumber << ")"; completionConnection_->write( nullptr, 0, callbackWrapper_([sequenceNumber{op.sequenceNumber}](ChannelImpl& impl) { TP_VLOG(6) << "Channel " << impl.id_ << " done writing completion (#" << sequenceNumber << ")"; })); } void ChannelImpl::handleErrorImpl() { sendOps_.advanceAllOperations(); recvOps_.advanceAllOperations(); descriptorConnection_->close(); completionConnection_->close(); context_->unenroll(*this); } } // namespace cma } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cma/channel_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include #include #include #include namespace tensorpipe { namespace channel { namespace cma { class ContextImpl; struct SendOperation { enum State { UNINITIALIZED, READING_COMPLETION, FINISHED }; // Fields used by the state machine uint64_t sequenceNumber{0}; State state{UNINITIALIZED}; // Progress flags bool doneReadingCompletion{false}; // Arguments at creation void* ptr; size_t length; TSendCallback callback; }; struct RecvOperation { enum State { UNINITIALIZED, READING_DESCRIPTOR, COPYING, FINISHED }; // Fields used by the state machine uint64_t sequenceNumber{0}; State state{UNINITIALIZED}; // Progress flags bool doneReadingDescriptor{false}; bool doneCopying{false}; // Arguments at creation void* ptr; size_t length; TRecvCallback callback; // Other data pid_t remotePid; void* remotePtr; }; class ChannelImpl final : public ChannelImplBoilerplate { public: ChannelImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::shared_ptr descriptorConnection, std::shared_ptr completionConnection); protected: // Implement the entry points called by ChannelImplBoilerplate. void initImplFromLoop() override; void sendImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TSendCallback callback) override; void recvImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TRecvCallback callback) override; void handleErrorImpl() override; private: const std::shared_ptr descriptorConnection_; const std::shared_ptr completionConnection_; OpsStateMachine sendOps_{ *this, &ChannelImpl::advanceSendOperation}; using SendOpIter = decltype(sendOps_)::Iter; OpsStateMachine recvOps_{ *this, &ChannelImpl::advanceRecvOperation}; using RecvOpIter = decltype(recvOps_)::Iter; // State machines for send and recv ops. 
void advanceSendOperation( SendOpIter opIter, SendOperation::State prevOpState); void advanceRecvOperation( RecvOpIter opIter, RecvOperation::State prevOpState); // Actions (i.e., methods that begin a state transition). // For send operations: void writeDescriptor(SendOpIter opIter); void readCompletion(SendOpIter opIter); void callSendCallback(SendOpIter opIter); // For recv operations: void readDescriptor(RecvOpIter opIter); void copy(RecvOpIter opIter); void callRecvCallback(RecvOpIter opIter); void writeCompletion(RecvOpIter opIter); }; } // namespace cma } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cma/context_impl.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { namespace cma { namespace { // Prepend descriptor with transport name so it's easy to // disambiguate descriptors when debugging. 
const std::string kDomainDescriptorPrefix{"cma:"}; Error callProcessVmReadv( void* localPtr, void* remotePtr, size_t length, pid_t pid) { #ifdef SYS_process_vm_readv struct iovec localIov { .iov_base = localPtr, .iov_len = length }; struct iovec remoteIov { .iov_base = remotePtr, .iov_len = length }; ssize_t nread = static_cast(::syscall( SYS_process_vm_readv, pid, &localIov, /*liovcnt=*/static_cast(1), &remoteIov, /*riovcnt=*/static_cast(1), /*flags=*/static_cast(0))); if (nread < 0) { return TP_CREATE_ERROR(SystemError, "process_vm_readv", errno); } else if (nread != length) { return TP_CREATE_ERROR(ShortReadError, length, nread); } return Error::kSuccess; #else return TP_CREATE_ERROR(SystemError, "process_vm_readv", ENOSYS); #endif } class BadReadError final : public BaseError { public: BadReadError(uint64_t expected, uint64_t actual) : expected_(expected), actual_(actual) {} std::string what() const override { std::ostringstream oss; oss << "Expected to read " << expected_ << ", got " << actual_; return oss.str(); } private: const uint64_t expected_; const uint64_t actual_; }; // Old versions of Docker use a default seccomp-bpf rule that blocks some // ptrace-related syscalls. To find this out, we attempt such a call against // ourselves, which is always allowed (it shortcuts all checks, including LSMs), // hence a failure can only come from a "filter" on the syscall. // Or, in fact, it could also happen if the kernel doesn't support the syscall. 
Error attemptProcessVmReadvSyscallOnSelf() { uint64_t someSourceValue = 0x0123456789abcdef; uint64_t someTargetValue = 0; Error error = callProcessVmReadv( &someTargetValue, &someSourceValue, sizeof(uint64_t), ::getpid()); if (error) { return error; } if (someTargetValue != someSourceValue) { return TP_CREATE_ERROR(BadReadError, someSourceValue, someTargetValue); } return Error::kSuccess; } // According to read(2): // > On Linux, read() (and similar system calls) will transfer at most // > 0x7ffff000 (2,147,479,552) bytes, returning the number of bytes actually // > transferred. (This is true on both 32-bit and 64-bit systems.) constexpr size_t kMaxBytesReadableAtOnce = 0x7ffff000; Error performCopy( void* localPtr, void* remotePtr, size_t length, pid_t remotePid) { for (size_t offset = 0; offset < length; offset += kMaxBytesReadableAtOnce) { Error error = callProcessVmReadv( reinterpret_cast(localPtr) + offset, reinterpret_cast(remotePtr) + offset, std::min(length - offset, kMaxBytesReadableAtOnce), remotePid); if (error) { return error; } } return Error::kSuccess; } } // namespace std::shared_ptr ContextImpl::create() { int rv; std::ostringstream oss; oss << kDomainDescriptorPrefix; // This transport only works across processes on the same machine, and we // detect that by computing the boot ID. optional bootID = getBootID(); TP_THROW_ASSERT_IF(!bootID.has_value()) << "Unable to read boot_id"; oss << bootID.value(); // An endpoint can see the other through its PID if the latter is in a child // PID namespace of the former. Since the channel is bidirectional this must // be symmetric and thus the PID namespaces must be the same. optional pidNsID = getLinuxNamespaceId(LinuxNamespace::kPid); if (!pidNsID.has_value()) { TP_VLOG(5) << "Unable to read pid namespace ID"; return nullptr; } oss << '_' << pidNsID.value(); // The ability to call process_vm_readv on a target is controlled by the // PTRACE_MODE_ATTACH_REALCREDS check (see process_vm_readv(2)). 
We'll go // through its checklist, step by step (which is found in ptrace(2)). We will // ignore the CAP_SYS_PTRACE conditions (i.e., we'll assume we don't have that // capability) because they are hard to check, and typically not needed. // We'll skip the check on whether the endpoints are two threads of the same // process (in which case ptrace is always allowed) because it's hard to fit // it in the descriptor and because we have some other more specialized // channels for that case. // The next step involves comparing user and group IDs. If the processes are // in user namespaces the kernel first maps these IDs back to the top-level // ("initial") ones and compares those. We can't do such mapping, thus we // compare the IDs as integers as we see them and thus for this to work // properly we require that the two endpoints are in the same user namespace. // This does not in fact constitute an extra restriction since the later // commoncap/capability LSM check will need to enforce this too. optional userNsID = getLinuxNamespaceId(LinuxNamespace::kUser); if (!userNsID.has_value()) { TP_VLOG(5) << "Unable to read user namespace ID"; return nullptr; } oss << '_' << userNsID.value(); // It is required that our *real* user ID matches the real, effective and // saved-set user IDs of the target. And the same must hold for group IDs. // As the channel is bidirectional, the reverse must also hold, which means // our real, effective and saved-set IDs must all be equal and must match the // other endpoint's ones. 
uid_t realUserId, effectiveUserId, savedSetUserId; gid_t realGroupId, effectiveGroupId, savedSetGroupId; rv = ::getresuid(&realUserId, &effectiveUserId, &savedSetUserId); TP_THROW_SYSTEM_IF(rv < 0, errno); rv = ::getresgid(&realGroupId, &effectiveGroupId, &savedSetGroupId); TP_THROW_SYSTEM_IF(rv < 0, errno); if (realUserId != effectiveUserId || realUserId != savedSetUserId || realGroupId != effectiveGroupId || realGroupId != savedSetGroupId) { TP_VLOG(5) << "User IDs or group IDs aren't all equal. User IDs are " << realUserId << " (real), " << effectiveUserId << " (effective) and " << savedSetUserId << " (saved-set). Group IDs are " << realGroupId << " (real), " << effectiveGroupId << " (effective) and " << savedSetGroupId << " (saved-set)."; return nullptr; } oss << '_' << realUserId << '_' << realGroupId; // The target must be dumpable. Which, due to symmetry, means we must be // dumpable too. rv = ::prctl(PR_GET_DUMPABLE, 0, 0, 0, 0); TP_THROW_SYSTEM_IF(rv < 0, errno); // SUID_DUMP_USER has a value of 1. if (rv != 1) { TP_VLOG(5) << "Process isn't dumpable"; return nullptr; } // Next the Linux Security Modules (LSMs) kick in. Since users could register // third-party LSMs we'll need to draw a line in what we support. We have two // options with unsupported LSMs: play it safe and assume the LSM will reject // the check, or "trust" the user and make them responsible to deal with the // LSMs they added. We're leaning for the latter, as often some LSMs like // AppArmor or SELinux are enabled without actually restricting anything. For // now we'll support the LSMs that are found by default on common distros, // but we can include support for more of them if that becomes necessary. optional> lsms = getLinuxSecurityModules(); bool yamaOptional = false; if (!lsms.has_value()) { // This could happen if /sys/kernel/security/lsm cannot be opened. 
Although // that file looks like it resides on sysfs, it's actually on the securityfs // VFS, which is sometimes not bind-mounted inside containers. In such cases // rather than failing hard we'll check a couple of reasonable LSMs. TP_VLOG(5) << "Couldn't detect the active Linux Security Modules"; lsms.emplace(); *lsms = {"capability", "yama"}; // We don't know whether YAMA is really there, hence we'll remember to // tolerate any failures later on. yamaOptional = true; } else { TP_VLOG(5) << "Detected these Linux Security Modules: " << joinStrs(*lsms); } // FIXME Can we assume that the two endpoints will see the same list of LSMs, // or should we incorporate that into the domain descriptor? for (const std::string& lsm : lsms.value()) { if (lsm == "capability") { // We already checked that the endpoints are in the same user namespace. // We must check they have the same permitted capabilities in it. optional caps = getPermittedCapabilitiesID(); TP_THROW_ASSERT_IF(!caps.has_value()) << "Unable to obtain permitted capabilities"; oss << '_' << caps.value(); } else if (lsm == "yama") { optional yamaScope = getYamaPtraceScope(); if (!yamaScope.has_value()) { TP_THROW_ASSERT_IF(!yamaOptional) << "Unable to retrieve YAMA ptrace scope"; continue; } switch (yamaScope.value()) { case YamaPtraceScope::kClassicPtracePermissions: TP_VLOG(5) << "YAMA ptrace scope set to classic ptrace permissions"; break; case YamaPtraceScope::kRestrictedPtrace: TP_VLOG(5) << "YAMA ptrace scope set to restricted ptrace"; // FIXME It's not really great to change a global property of the // process, especially a security-related one. 
An "excuse" for doing // so is that UCT does the same: // https://github.com/openucx/ucx/blob/4d9976b6b8f8faae609c078c72aad8e5b842c43f/src/uct/sm/scopy/cma/cma_md.c#L61 #ifndef PR_SET_PTRACER // https://github.com/torvalds/linux/blob/master/include/uapi/linux/prctl.h #define PR_SET_PTRACER 0x59616d61 #endif #ifndef PR_SET_PTRACER_ANY // https://github.com/torvalds/linux/blob/master/include/uapi/linux/prctl.h #define PR_SET_PTRACER_ANY ((unsigned long)-1) #endif rv = ::prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY, 0, 0, 0); TP_THROW_SYSTEM_IF(rv < 0, errno); break; case YamaPtraceScope::kAdminOnlyAttach: TP_VLOG(5) << "YAMA ptrace scope set to admin-only attach"; return nullptr; case YamaPtraceScope::kNoAttach: TP_VLOG(5) << "YAMA ptrace scope set to no attach"; return nullptr; default: TP_THROW_ASSERT() << "Unknown YAMA ptrace scope"; } } } // In addition to the ptrace check, in some cases (I'm looking at you Docker) // the process_vm_readv syscall is outright blocked by seccomp-bpf. Or just // unsupported by the kernel. 
Error error = attemptProcessVmReadvSyscallOnSelf(); if (error) { TP_VLOG(5) << "The process_vm_readv syscall appears to be unavailable or blocked: " << error.what(); return nullptr; } std::string domainDescriptor = oss.str(); TP_VLOG(5) << "The domain descriptor for CMA is " << domainDescriptor; std::unordered_map deviceDescriptors = { {Device{kCpuDeviceType, 0}, std::move(domainDescriptor)}}; return std::make_shared(std::move(deviceDescriptors)); } ContextImpl::ContextImpl( std::unordered_map deviceDescriptors) : ContextImplBoilerplate( std::move(deviceDescriptors)) { thread_ = std::thread(&ContextImpl::handleCopyRequests, this); } std::shared_ptr ContextImpl::createChannel( std::vector> connections, Endpoint /* unused */) { TP_DCHECK_EQ(numConnectionsNeeded(), connections.size()); return createChannelInternal( std::move(connections[0]), std::move(connections[1])); } size_t ContextImpl::numConnectionsNeeded() const { return 2; } void ContextImpl::handleErrorImpl() { requests_.push(nullopt); } void ContextImpl::joinImpl() { thread_.join(); // TP_DCHECK(requests_.empty()); } bool ContextImpl::inLoop() const { return loop_.inLoop(); }; void ContextImpl::deferToLoop(std::function fn) { loop_.deferToLoop(std::move(fn)); }; void ContextImpl::requestCopy( pid_t remotePid, void* remotePtr, void* localPtr, size_t length, std::function fn) { uint64_t requestId = nextRequestId_++; TP_VLOG(4) << "Channel context " << id_ << " received a copy request (#" << requestId << ")"; fn = [this, requestId, fn{std::move(fn)}](const Error& error) { TP_VLOG(4) << "Channel context " << id_ << " is calling a copy request callback (#" << requestId << ")"; fn(error); TP_VLOG(4) << "Channel context " << id_ << " done calling a copy request callback (#" << requestId << ")"; }; requests_.push( CopyRequest{remotePid, remotePtr, localPtr, length, std::move(fn)}); } void ContextImpl::handleCopyRequests() { setThreadName("TP_CMA_loop"); while (true) { auto maybeRequest = requests_.pop(); if 
(!maybeRequest.has_value()) { break; } CopyRequest request = std::move(maybeRequest).value(); request.callback(performCopy( request.localPtr, request.remotePtr, request.length, request.remotePid)); } } } // namespace cma } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cma/context_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { namespace cma { class ChannelImpl; class ContextImpl final : public ContextImplBoilerplate { public: static std::shared_ptr create(); explicit ContextImpl( std::unordered_map deviceDescriptors); std::shared_ptr createChannel( std::vector> connections, Endpoint endpoint); size_t numConnectionsNeeded() const override; // Implement the DeferredExecutor interface. bool inLoop() const override; void deferToLoop(std::function fn) override; using copy_request_callback_fn = std::function; void requestCopy( pid_t remotePid, void* remotePtr, void* localPtr, size_t length, copy_request_callback_fn fn); protected: // Implement the entry points called by ContextImplBoilerplate. void handleErrorImpl() override; void joinImpl() override; private: OnDemandDeferredExecutor loop_; struct CopyRequest { pid_t remotePid; void* remotePtr; void* localPtr; size_t length; copy_request_callback_fn callback; }; std::thread thread_; Queue> requests_{std::numeric_limits::max()}; // This is atomic because it may be accessed from outside the loop. 
std::atomic nextRequestId_{0}; void handleCopyRequests(); }; } // namespace cma } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cma/factory.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include namespace tensorpipe { namespace channel { namespace cma { std::shared_ptr create() { return std::make_shared>(); } } // namespace cma } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cma/factory.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include namespace tensorpipe { namespace channel { namespace cma { std::shared_ptr create(); } // namespace cma } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/context.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include namespace tensorpipe { namespace channel { enum class Endpoint : bool { kConnect, kListen }; class Channel; // Abstract base class for channel context classes. // // Instances of these classes are expected to be registered with a // context. All registered instances are assumed to be eligible // channels for all pairs. 
// class Context { public: // Return whether the context is able to operate correctly. // // Some channel types may be unable to perform as intended under some // circumstances (e.g., specialized hardware unavailable, lack of // permissions). They can report it through this method in order for // the core context to avoid registering them in the first place. // virtual bool isViable() const = 0; // Return the number of control connections needed to create an instance of // this channel. // // Most channels require only one, but some require more (cuda_basic), and // some might require none. // virtual size_t numConnectionsNeeded() const = 0; // Return a map from supported devices to strings describing the device from // the channel's perspective. // // Two processes with a channel context of the same type can leverage this // channel to make two devices communicate if one side's device descriptor is // "accepted" by the other one, using the canCommunicateWithRemote method // below. That method must be symmetric, and unless overridden defaults to // string comparison. // virtual const std::unordered_map& deviceDescriptors() const = 0; // Compare local and remote device descriptors for compatibility. // // Determine whether a channel can be opened between a local device and // a remote one that has the given device descriptor. This function // needs to be symmetric: if we called this method on the remote // context with the local descriptor we should get the same answer. // Unless overridden it defaults to string comparison. // virtual bool canCommunicateWithRemote( const std::string& localDeviceDescriptor, const std::string& remoteDeviceDescriptor) const = 0; // Return newly created channel using the specified connections. // // It is up to the channel to either use these connections for further // initialization, or use them directly. Either way, the returned // channel should be immediately usable. 
If the channel isn't fully // initialized yet, take care to queue these operations to execute // as soon as initialization has completed. // virtual std::shared_ptr createChannel( std::vector>, Endpoint) = 0; // Tell the context what its identifier is. // // This is only supposed to be called from the high-level context. It will // only used for logging and debugging purposes. virtual void setId(std::string id) = 0; // Put the channel context in a terminal state, in turn closing all of its // channels, and release its resources. This may be done asynchronously, in // background. virtual void close() = 0; // Wait for all resources to be released and all background activity to stop. virtual void join() = 0; virtual ~Context() = default; private: std::string name_; }; } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/context_boilerplate.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { template class ContextBoilerplate : public Context { public: template explicit ContextBoilerplate(Args&&... 
args); ContextBoilerplate(const ContextBoilerplate&) = delete; ContextBoilerplate(ContextBoilerplate&&) = delete; ContextBoilerplate& operator=(const ContextBoilerplate&) = delete; ContextBoilerplate& operator=(ContextBoilerplate&&) = delete; std::shared_ptr createChannel( std::vector> connections, Endpoint endpoint) override; size_t numConnectionsNeeded() const override; bool isViable() const override; const std::unordered_map& deviceDescriptors() const override; bool canCommunicateWithRemote( const std::string& localDeviceDescriptor, const std::string& remoteDeviceDescriptor) const override; void setId(std::string id) override; void close() override; void join() override; ~ContextBoilerplate() override; protected: // The implementation is managed by a shared_ptr because each child object // will also hold a shared_ptr to it. However, its lifetime is tied to the one // of this public object since when the latter is destroyed the implementation // is closed and joined. const std::shared_ptr impl_; }; template template ContextBoilerplate::ContextBoilerplate(Args&&... args) : impl_(TCtx::create(std::forward(args)...)) { static_assert( std::is_base_of, TChan>::value, ""); if (unlikely(!impl_)) { return; } impl_->init(); } template std::shared_ptr ContextBoilerplate::createChannel( std::vector> connections, Endpoint endpoint) { if (unlikely(!impl_)) { return std::make_shared>(nullptr); } return impl_->createChannel(std::move(connections), endpoint); } template size_t ContextBoilerplate::numConnectionsNeeded() const { if (unlikely(!impl_)) { return 0; } return impl_->numConnectionsNeeded(); } template bool ContextBoilerplate::isViable() const { return impl_ != nullptr; } template const std::unordered_map& ContextBoilerplate:: deviceDescriptors() const { if (unlikely(!impl_)) { // FIXME In C++-17 perhaps a global static inline variable would be better? 
static std::unordered_map empty = {}; return empty; } return impl_->deviceDescriptors(); } template bool ContextBoilerplate::canCommunicateWithRemote( const std::string& localDeviceDescriptor, const std::string& remoteDeviceDescriptor) const { if (unlikely(!impl_)) { return false; } return impl_->canCommunicateWithRemote( localDeviceDescriptor, remoteDeviceDescriptor); } template void ContextBoilerplate::setId(std::string id) { if (unlikely(!impl_)) { return; } impl_->setId(std::move(id)); } template void ContextBoilerplate::close() { if (unlikely(!impl_)) { return; } impl_->close(); } template void ContextBoilerplate::join() { if (unlikely(!impl_)) { return; } impl_->join(); } template ContextBoilerplate::~ContextBoilerplate() { join(); } } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/context_impl_boilerplate.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { template class ContextImplBoilerplate : public virtual DeferredExecutor, public std::enable_shared_from_this { public: explicit ContextImplBoilerplate( std::unordered_map deviceDescriptors); ContextImplBoilerplate(const ContextImplBoilerplate&) = delete; ContextImplBoilerplate(ContextImplBoilerplate&&) = delete; ContextImplBoilerplate& operator=(const ContextImplBoilerplate&) = delete; ContextImplBoilerplate& operator=(ContextImplBoilerplate&&) = delete; void init(); virtual size_t numConnectionsNeeded() const; const std::unordered_map& deviceDescriptors() const; virtual bool canCommunicateWithRemote( const std::string& localDeviceDescriptor, const std::string& remoteDeviceDescriptor) const; // Enrolling dependent objects (channels) causes them to be kept alive for as // long as the context exists. These objects should enroll themselves as soon // as they're created (in their initImplFromLoop method) and unenroll // themselves after they've completed handling an error (either right in the // handleErrorImpl method or in a subsequent callback). The context, on the // other hand, should avoid terminating (i.e., complete joining) until all // objects have unenrolled themselves. void enroll(TChan& channel); void unenroll(TChan& channel); // Return whether the context is in a closed state. To avoid race conditions, // this must be called from within the loop. bool closed(); void setId(std::string id); void close(); void join(); virtual ~ContextImplBoilerplate() = default; protected: virtual void initImplFromLoop() {} virtual void handleErrorImpl() = 0; virtual void joinImpl() = 0; virtual void setIdImpl() {} void setError(Error error); template std::shared_ptr createChannelInternal(Args&&... 
args); Error error_{Error::kSuccess}; // An identifier for the context, composed of the identifier for the context, // combined with the channel's name. It will only be used for logging and // debugging purposes. std::string id_{"N/A"}; CallbackWrapper callbackWrapper_{*this, *this}; private: void initFromLoop(); void closeFromLoop(); void handleError(); std::atomic joined_{false}; const std::unordered_map deviceDescriptors_; // Sequence numbers for the channels created by this context, used to create // their identifiers based off this context's identifier. They will only be // used for logging and debugging. std::atomic channelCounter_{0}; // Store shared_ptrs to dependent objects that have enrolled themselves to // keep them alive. We use a map, indexed by raw pointers, rather than a set // of shared_ptrs so that we can erase objects without them having to create // a fresh shared_ptr just for that. std::unordered_map> channels_; // For some odd reason it seems we need to use a qualified name here... template friend class tensorpipe::CallbackWrapper; }; template ContextImplBoilerplate::ContextImplBoilerplate( std::unordered_map deviceDescriptors) : deviceDescriptors_(std::move(deviceDescriptors)) {} template template std::shared_ptr ContextImplBoilerplate:: createChannelInternal(Args&&... 
args) { std::string channelId = id_ + ".c" + std::to_string(channelCounter_++); TP_VLOG(4) << "Channel context " << id_ << " is opening channel " << channelId; return std::make_shared>( typename ChannelImplBoilerplate::ConstructorToken(), this->shared_from_this(), std::move(channelId), std::forward(args)...); } template void ContextImplBoilerplate::init() { deferToLoop([this]() { initFromLoop(); }); } template void ContextImplBoilerplate::initFromLoop() { TP_DCHECK(inLoop()); TP_DCHECK(!error_); initImplFromLoop(); } template size_t ContextImplBoilerplate::numConnectionsNeeded() const { return 1; } template const std::unordered_map& ContextImplBoilerplate< TCtx, TChan>::deviceDescriptors() const { return deviceDescriptors_; } template bool ContextImplBoilerplate::canCommunicateWithRemote( const std::string& localDeviceDescriptor, const std::string& remoteDeviceDescriptor) const { return localDeviceDescriptor == remoteDeviceDescriptor; } template void ContextImplBoilerplate::enroll(TChan& channel) { TP_DCHECK(inLoop()); bool wasInserted; std::tie(std::ignore, wasInserted) = channels_.emplace(&channel, channel.shared_from_this()); TP_DCHECK(wasInserted); } template void ContextImplBoilerplate::unenroll(TChan& channel) { TP_DCHECK(inLoop()); auto numRemoved = channels_.erase(&channel); TP_DCHECK_EQ(numRemoved, 1); } template bool ContextImplBoilerplate::closed() { TP_DCHECK(inLoop()); return error_; }; template void ContextImplBoilerplate::setId(std::string id) { TP_VLOG(4) << "Channel context " << id_ << " was renamed to " << id; id_ = std::move(id); setIdImpl(); } template void ContextImplBoilerplate::close() { deferToLoop([this]() { closeFromLoop(); }); } template void ContextImplBoilerplate::closeFromLoop() { TP_DCHECK(inLoop()); TP_VLOG(4) << "Channel context " << id_ << " is closing"; setError(TP_CREATE_ERROR(ContextClosedError)); TP_VLOG(4) << "Channel context " << id_ << " done closing"; } template void ContextImplBoilerplate::setError(Error error) { // Don't 
overwrite an error that's already set. if (error_ || !error) { return; } error_ = std::move(error); handleError(); } template void ContextImplBoilerplate::handleError() { TP_DCHECK(inLoop()); TP_VLOG(5) << "Channel context " << id_ << " is handling error " << error_.what(); // Make a copy as they could unenroll themselves inline. auto channelsCopy = channels_; // We call closeFromLoop, rather than just close, because we need these // objects to transition _immediately_ to error, "atomically". If we just // deferred closing to later, this could come after some already-enqueued // operations that could try to access the context, which would be closed, // and this could fail. for (auto& iter : channelsCopy) { iter.second->closeFromLoop(); } handleErrorImpl(); } template void ContextImplBoilerplate::join() { close(); if (!joined_.exchange(true)) { TP_VLOG(4) << "Channel context " << id_ << " is joining"; // As closing is deferred to the loop, we must wait for closeImpl to be // actually called before we call joinImpl, to avoid race conditions. For // this, we defer another task to the loop, which we know will run after the // closing, and then we wait for that task to be run. std::promise hasClosed; deferToLoop([&]() { hasClosed.set_value(); }); hasClosed.get_future().wait(); joinImpl(); TP_VLOG(4) << "Channel context " << id_ << " done joining"; // FIXME This may actually not be true, as channels could for example be // kept alive by the underlying transport, and thus outlive their context. // TP_DCHECK(channels_.empty()); } } } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cuda_basic/channel_impl.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { namespace cuda_basic { namespace { size_t ceilOfRatio(size_t n, size_t d) { return (n + d - 1) / d; } } // namespace ChannelImpl::ChannelImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::shared_ptr connection, std::shared_ptr cpuChannel, CudaLoop& cudaLoop) : ChannelImplBoilerplate( token, std::move(context), std::move(id)), connection_(std::move(connection)), cpuChannel_(std::move(cpuChannel)), cudaLoop_(cudaLoop) {} void ChannelImpl::initImplFromLoop() { context_->enroll(*this); } void ChannelImpl::cudaCopy( void* dst, const void* src, size_t length, int deviceIdx, cudaStream_t stream, std::function callback) { { CudaDeviceGuard guard(deviceIdx); TP_CUDA_CHECK(cudaMemcpyAsync(dst, src, length, cudaMemcpyDefault, stream)); } cudaLoop_.addCallback(deviceIdx, stream, std::move(callback)); } void ChannelImpl::sendImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TSendCallback callback) { if (length == 0) { callback(error_); return; } const Device device = buffer.device(); const size_t chunkLength = kSlotSize; const size_t numChunks = ceilOfRatio(length, chunkLength); for (size_t offset = 0; offset < length; offset += chunkLength) { ChunkSendOpIter opIter = chunkSendOps_.emplaceBack(nextChunkBeingSent_++); ChunkSendOperation& op = *opIter; op.bufferSequenceNumber = sequenceNumber; op.chunkId = offset / chunkLength; op.numChunks = numChunks; op.length = std::min(length - offset, chunkLength); // Operations are processed in order, so we can afford to trigger the // callback once the last operation is done. 
    // Tail of ChannelImpl::sendImplFromLoop (its head lies above this chunk):
    // the user callback is attached only to the last chunk of the buffer,
    // since operations are processed in order.
    if (op.chunkId == numChunks - 1) {
      op.callback = std::move(callback);
    }
    // NOTE(review): the template arguments of static_cast<...> and
    // buffer.unwrap<...>() appear to have been stripped from this dump
    // (presumably CpuBuffer/CudaBuffer and a byte-pointer cast) — confirm
    // against the upstream sources.
    if (device.type == kCpuDeviceType) {
      op.isCpuBuffer = true;
      op.devicePtr = static_cast(buffer.unwrap().ptr) + offset;
    } else if (device.type == kCudaDeviceType) {
      op.isCpuBuffer = false;
      op.devicePtr = static_cast(buffer.unwrap().ptr) + offset;
      op.stream = buffer.unwrap().stream;
      op.deviceIdx = device.index;
    } else {
      TP_THROW_ASSERT() << "Unexpected device type: " << device.type;
    }
    chunkSendOps_.advanceOperation(opIter);
  }
}

// Drives the send-side state machine for one chunk operation. Each
// attemptTransition below fires at most once per invocation; conditions
// consult both this op's progress flags and the previous op's state
// (prevOpState) to preserve cross-operation ordering guarantees.
void ChannelImpl::advanceChunkSendOperation(
    ChunkSendOpIter opIter,
    ChunkSendOperation::State prevOpState) {
  TP_DCHECK(context_->inLoop());
  ChunkSendOperation& op = *opIter;

  // Needs to go after previous op invoked its callback because the last chunk
  // in a series (that corresponds to one operation) must invoke its callback
  // only when all chunks in the series are done.
  chunkSendOps_.attemptTransition(
      opIter,
      /*from=*/ChunkSendOperation::UNINITIALIZED,
      /*to=*/ChunkSendOperation::FINISHED,
      /*cond=*/error_ && prevOpState >= ChunkSendOperation::INVOKED_CALLBACK,
      /*actions=*/{&ChannelImpl::callSendCallback});

  // Needs to go after previous op to ensure predictable and consistent
  // ordering of send calls on CPU channel.
  // This transition shortcuts the allocation of/copy to staging memory when
  // the buffer is already on CPU.
  chunkSendOps_.attemptTransition(
      opIter,
      /*from=*/ChunkSendOperation::UNINITIALIZED,
      /*to=*/ChunkSendOperation::SENDING_CPU_BUFFER,
      /*cond=*/!error_ && op.isCpuBuffer &&
          prevOpState >= ChunkSendOperation::SENDING_CPU_BUFFER,
      /*actions=*/
      {&ChannelImpl::writeReadyToSend, &ChannelImpl::sendCpuBuffer});

  // Needs to go after previous op to ensure later operations are not holding
  // staging buffers while earlier ones are still blocked waiting for them,
  // because the staging buffer will only be returned to the allocator once the
  // operation is destroyed, but this won't happen until earlier operations
  // have completed, and if they are blocked waiting for buffers we may
  // deadlock.
  chunkSendOps_.attemptTransition(
      opIter,
      /*from=*/ChunkSendOperation::UNINITIALIZED,
      /*to=*/ChunkSendOperation::ALLOCATING_CPU_BUFFER,
      /*cond=*/!error_ && !op.isCpuBuffer &&
          prevOpState >= ChunkSendOperation::ALLOCATING_CPU_BUFFER,
      /*actions=*/{&ChannelImpl::allocateSendCpuBuffer});

  // See above for why this needs to go after previous op.
  chunkSendOps_.attemptTransition(
      opIter,
      /*from=*/ChunkSendOperation::ALLOCATING_CPU_BUFFER,
      /*to=*/ChunkSendOperation::FINISHED,
      /*cond=*/error_ && op.doneAllocatingCpuStagingBuffer &&
          prevOpState >= ChunkSendOperation::INVOKED_CALLBACK,
      /*actions=*/
      {&ChannelImpl::callSendCallback, &ChannelImpl::returnSendCpuBuffer});

  // Needs to go after previous op to ensure predictable and consistent
  // ordering of write calls on the control connection.
  chunkSendOps_.attemptTransition(
      opIter,
      /*from=*/ChunkSendOperation::ALLOCATING_CPU_BUFFER,
      /*to=*/ChunkSendOperation::COPYING_FROM_GPU_TO_CPU,
      /*cond=*/!error_ && op.doneAllocatingCpuStagingBuffer &&
          prevOpState >= ChunkSendOperation::COPYING_FROM_GPU_TO_CPU,
      /*actions=*/
      {&ChannelImpl::writeReadyToSend, &ChannelImpl::copyFromGpuToCpu});

  // See above for why this needs to go after previous op.
  chunkSendOps_.attemptTransition(
      opIter,
      /*from=*/ChunkSendOperation::COPYING_FROM_GPU_TO_CPU,
      /*to=*/ChunkSendOperation::FINISHED,
      /*cond=*/error_ && op.doneCopyingFromGpuToCpu &&
          prevOpState >= ChunkSendOperation::INVOKED_CALLBACK,
      /*actions=*/
      {&ChannelImpl::callSendCallback, &ChannelImpl::returnSendCpuBuffer});

  // See above for why this needs to go after previous op.
  chunkSendOps_.attemptTransition(
      opIter,
      /*from=*/ChunkSendOperation::COPYING_FROM_GPU_TO_CPU,
      /*to=*/ChunkSendOperation::INVOKED_CALLBACK,
      /*cond=*/!error_ && op.doneCopyingFromGpuToCpu &&
          prevOpState >= ChunkSendOperation::INVOKED_CALLBACK,
      /*actions=*/{&ChannelImpl::callSendCallback});

  chunkSendOps_.attemptTransition(
      opIter,
      /*from=*/ChunkSendOperation::INVOKED_CALLBACK,
      /*to=*/ChunkSendOperation::FINISHED,
      /*cond=*/error_,
      /*actions=*/{&ChannelImpl::returnSendCpuBuffer});

  // Needs to go after previous op to ensure predictable and consistent
  // ordering of send calls on CPU channel.
  chunkSendOps_.attemptTransition(
      opIter,
      /*from=*/ChunkSendOperation::INVOKED_CALLBACK,
      /*to=*/ChunkSendOperation::SENDING_CPU_BUFFER,
      /*cond=*/!error_ &&
          prevOpState >= ChunkSendOperation::SENDING_CPU_BUFFER,
      /*actions=*/{&ChannelImpl::sendCpuBuffer});

  // A CPU source buffer has no staging memory to return: finishing means
  // invoking the callback.
  chunkSendOps_.attemptTransition(
      opIter,
      /*from=*/ChunkSendOperation::SENDING_CPU_BUFFER,
      /*to=*/ChunkSendOperation::FINISHED,
      /*cond=*/op.doneSendingCpuBuffer && op.isCpuBuffer,
      /*actions=*/{&ChannelImpl::callSendCallback});

  // A GPU source buffer had its callback invoked earlier (after the D2H
  // copy); finishing only returns the staging buffer.
  chunkSendOps_.attemptTransition(
      opIter,
      /*from=*/ChunkSendOperation::SENDING_CPU_BUFFER,
      /*to=*/ChunkSendOperation::FINISHED,
      /*cond=*/op.doneSendingCpuBuffer && !op.isCpuBuffer,
      /*actions=*/{&ChannelImpl::returnSendCpuBuffer});
}

// Asks the context's pinned-host send allocator for a staging slot for this
// chunk (GPU-to-CPU path only).
void ChannelImpl::allocateSendCpuBuffer(ChunkSendOpIter opIter) {
  ChunkSendOperation& op = *opIter;
  TP_VLOG(5) << "Channel " << id_
             << " is allocating temporary memory for chunk #" << op.chunkId
             << " of " << op.numChunks << " for buffer #"
             << op.bufferSequenceNumber;
  Allocator& cudaHostAllocator =
      context_->getCudaHostSendAllocator(op.deviceIdx);
  cudaHostAllocator.alloc(
      op.length,
      callbackWrapper_(
          // NOTE(review): the template argument of std::shared_ptr appears
          // stripped in this dump (the allocator hands back a byte buffer) —
          // confirm against the upstream sources.
          [opIter](ChannelImpl& impl, std::shared_ptr tmpBuffer) {
            TP_VLOG(5) << "Channel " << impl.id_
                       << " is done allocating temporary memory for chunk #"
                       << opIter->chunkId << " of " << opIter->numChunks
                       << " for buffer #" << opIter->bufferSequenceNumber;
            opIter->doneAllocatingCpuStagingBuffer = true;
            // On error the staging buffer is dropped immediately so it goes
            // straight back to the allocator.
            if (!impl.error_) {
              opIter->tmpBuffer = std::move(tmpBuffer);
            }
            impl.chunkSendOps_.advanceOperation(opIter);
          }));
}

// Writes a zero-byte control message on the connection telling the receiver
// that this chunk is about to be sent on the CPU channel.
void ChannelImpl::writeReadyToSend(ChunkSendOpIter opIter) {
  ChunkSendOperation& op = *opIter;
  TP_VLOG(6) << "Channel " << id_
             << " is sending ready-to-send notification for chunk #"
             << op.chunkId << " of " << op.numChunks << " for buffer #"
             << op.bufferSequenceNumber;
  connection_->write(
      nullptr,
      0,
      // The op may be destroyed before this write completes, hence the
      // logging fields are captured by value instead of holding opIter.
      callbackWrapper_([bufferSequenceNumber{op.bufferSequenceNumber},
                        chunkId{op.chunkId},
                        numChunks{op.numChunks}](ChannelImpl& impl) {
        TP_VLOG(6) << "Channel " << impl.id_
                   << " is done sending ready-to-send notification for chunk #"
                   << chunkId << " of " << numChunks << " for buffer #"
                   << bufferSequenceNumber;
      }));
}

// Issues the device-to-host copy of this chunk into its pinned staging
// buffer, on the operation's stream/device.
void ChannelImpl::copyFromGpuToCpu(ChunkSendOpIter opIter) {
  ChunkSendOperation& op = *opIter;
  TP_VLOG(5) << "Channel " << id_ << " is copying chunk #" << op.chunkId
             << " of " << op.numChunks << " for buffer #"
             << op.bufferSequenceNumber << " from CUDA device to CPU";
  cudaCopy(
      op.tmpBuffer.get(),
      op.devicePtr,
      op.length,
      op.deviceIdx,
      op.stream,
      callbackWrapper_([opIter](ChannelImpl& impl) {
        TP_VLOG(5) << "Channel " << impl.id_ << " is done copying chunk #"
                   << opIter->chunkId << " of " << opIter->numChunks
                   << " for buffer #" << opIter->bufferSequenceNumber
                   << " from CUDA device to CPU";
        opIter->doneCopyingFromGpuToCpu = true;
        impl.chunkSendOps_.advanceOperation(opIter);
      }));
}

// Hands the chunk (either the user's CPU buffer or the staging copy) to the
// wrapped CPU channel.
void ChannelImpl::sendCpuBuffer(ChunkSendOpIter opIter) {
  ChunkSendOperation& op = *opIter;
  TP_VLOG(6) << "Channel " << id_ << " is sending chunk #" << op.chunkId
             << " of " << op.numChunks << " for buffer #"
             << op.bufferSequenceNumber << " through CPU channel";
  cpuChannel_->send(
      CpuBuffer{.ptr = op.isCpuBuffer ? op.devicePtr : op.tmpBuffer.get()},
      op.length,
      callbackWrapper_([opIter](ChannelImpl& impl) {
        TP_VLOG(6) << "Channel " << impl.id_ << " is done sending chunk #"
                   << opIter->chunkId << " of " << opIter->numChunks
                   << " for buffer #" << opIter->bufferSequenceNumber
                   << " through CPU channel";
        opIter->doneSendingCpuBuffer = true;
        impl.chunkSendOps_.advanceOperation(opIter);
      }));
}

// Invokes (at most once) the user callback that was attached to the last
// chunk of the buffer.
void ChannelImpl::callSendCallback(ChunkSendOpIter opIter) {
  ChunkSendOperation& op = *opIter;
  if (op.callback) {
    op.callback(error_);
    // Reset callback to release the resources it was holding.
    op.callback = nullptr;
  }
}

// Releases this op's staging buffer back to the allocator.
void ChannelImpl::returnSendCpuBuffer(ChunkSendOpIter opIter) {
  ChunkSendOperation& op = *opIter;
  // The pointer's deleter will return the buffer to the allocator.
  op.tmpBuffer = nullptr;
}

// Splits an incoming buffer into kSlotSize chunks and enqueues one recv-side
// state-machine operation per chunk.
void ChannelImpl::recvImplFromLoop(
    uint64_t sequenceNumber,
    Buffer buffer,
    size_t length,
    TRecvCallback callback) {
  // Zero-length transfers complete immediately; no chunks are created.
  if (length == 0) {
    callback(error_);
    return;
  }
  const Device device = buffer.device();
  const size_t chunkLength = kSlotSize;
  const size_t numChunks = ceilOfRatio(length, chunkLength);
  for (size_t offset = 0; offset < length; offset += chunkLength) {
    ChunkRecvOpIter opIter =
        chunkRecvOps_.emplaceBack(nextChunkBeingReceived_++);
    ChunkRecvOperation& op = *opIter;
    op.bufferSequenceNumber = sequenceNumber;
    op.chunkId = offset / chunkLength;
    op.numChunks = numChunks;
    // The final chunk may be shorter than a full slot.
    op.length = std::min(length - offset, chunkLength);
    // Operations are processed in order, so we can afford to trigger the
    // callback once the last operation is done.
    // Only the last chunk of the buffer carries the user callback.
    if (op.chunkId == numChunks - 1) {
      op.callback = std::move(callback);
    }
    // NOTE(review): the template arguments of static_cast<...> and
    // buffer.unwrap<...>() appear to have been stripped from this dump —
    // confirm against the upstream sources.
    if (device.type == kCpuDeviceType) {
      op.isCpuBuffer = true;
      op.devicePtr = static_cast(buffer.unwrap().ptr) + offset;
    } else if (device.type == kCudaDeviceType) {
      op.isCpuBuffer = false;
      op.devicePtr = static_cast(buffer.unwrap().ptr) + offset;
      op.stream = buffer.unwrap().stream;
      op.deviceIdx = device.index;
    } else {
      TP_THROW_ASSERT() << "Unexpected device type: " << device.type;
    }
    chunkRecvOps_.advanceOperation(opIter);
  }
}

// Drives the recv-side state machine for one chunk operation; mirrors
// advanceChunkSendOperation, with prevOpState enforcing cross-operation
// ordering.
void ChannelImpl::advanceChunkRecvOperation(
    ChunkRecvOpIter opIter,
    ChunkRecvOperation::State prevOpState) {
  TP_DCHECK(context_->inLoop());
  ChunkRecvOperation& op = *opIter;

  // Needs to go after previous op invoked its callback because the last chunk
  // in a series (that corresponds to one operation) must invoke its callback
  // only when all chunks in the series are done.
  chunkRecvOps_.attemptTransition(
      opIter,
      /*from=*/ChunkRecvOperation::UNINITIALIZED,
      /*to=*/ChunkRecvOperation::FINISHED,
      /*cond=*/error_ &&
          prevOpState >=
              ChunkRecvOperation::COPYING_FROM_CPU_TO_GPU_AND_INVOKED_CALLBACK,
      /*actions=*/{&ChannelImpl::callRecvCallback});

  // Needs to go after previous op to ensure predictable and consistent
  // ordering of read calls on control connection.
  chunkRecvOps_.attemptTransition(
      opIter,
      /*from=*/ChunkRecvOperation::UNINITIALIZED,
      /*to=*/ChunkRecvOperation::READING_READY_TO_SEND,
      /*cond=*/!error_ &&
          prevOpState >= ChunkRecvOperation::READING_READY_TO_SEND,
      /*actions=*/{&ChannelImpl::readReadyToSend});

  // See above for why this needs to go after previous op.
  chunkRecvOps_.attemptTransition(
      opIter,
      /*from=*/ChunkRecvOperation::READING_READY_TO_SEND,
      /*to=*/ChunkRecvOperation::FINISHED,
      /*cond=*/error_ && op.doneReadingReadyToSend &&
          prevOpState >=
              ChunkRecvOperation::COPYING_FROM_CPU_TO_GPU_AND_INVOKED_CALLBACK,
      /*actions=*/{&ChannelImpl::callRecvCallback});

  // Needs to go after previous op to ensure predictable and consistent
  // ordering of recv calls on CPU channel.
  // This operation shortcuts allocating staging memory when receiving
  // directly on CPU.
  chunkRecvOps_.attemptTransition(
      opIter,
      /*from=*/ChunkRecvOperation::READING_READY_TO_SEND,
      /*to=*/ChunkRecvOperation::RECEIVING_CPU_BUFFER,
      /*cond=*/!error_ && op.doneReadingReadyToSend && op.isCpuBuffer &&
          prevOpState >= ChunkRecvOperation::RECEIVING_CPU_BUFFER,
      /*actions=*/{&ChannelImpl::receiveCpuBuffer});

  // Needs to go after previous op to ensure later operations are not holding
  // staging buffers while earlier ones are still blocked waiting for them,
  // because the staging buffer will only be returned to the allocator once the
  // operation is destroyed, but this won't happen until earlier operations
  // have completed, and if they are blocked waiting for buffers we may
  // deadlock.
  chunkRecvOps_.attemptTransition(
      opIter,
      /*from=*/ChunkRecvOperation::READING_READY_TO_SEND,
      /*to=*/ChunkRecvOperation::ALLOCATING_CPU_BUFFER,
      /*cond=*/!error_ && op.doneReadingReadyToSend && !op.isCpuBuffer &&
          prevOpState >= ChunkRecvOperation::ALLOCATING_CPU_BUFFER,
      /*actions=*/{&ChannelImpl::allocateRecvCpuBuffer});

  // See above for why this needs to go after previous op.
  chunkRecvOps_.attemptTransition(
      opIter,
      /*from=*/ChunkRecvOperation::ALLOCATING_CPU_BUFFER,
      /*to=*/ChunkRecvOperation::FINISHED,
      /*cond=*/error_ && op.doneAllocatingCpuStagingBuffer &&
          prevOpState >=
              ChunkRecvOperation::COPYING_FROM_CPU_TO_GPU_AND_INVOKED_CALLBACK,
      /*actions=*/
      {&ChannelImpl::callRecvCallback, &ChannelImpl::returnRecvCpuBuffer});

  // Needs to go after previous op to ensure predictable and consistent
  // ordering of recv calls on CPU channel.
  chunkRecvOps_.attemptTransition(
      opIter,
      /*from=*/ChunkRecvOperation::ALLOCATING_CPU_BUFFER,
      /*to=*/ChunkRecvOperation::RECEIVING_CPU_BUFFER,
      /*cond=*/!error_ && op.doneAllocatingCpuStagingBuffer &&
          prevOpState >= ChunkRecvOperation::RECEIVING_CPU_BUFFER,
      /*actions=*/{&ChannelImpl::receiveCpuBuffer});

  // See above for why this needs to go after previous op.
chunkRecvOps_.attemptTransition( opIter, /*from=*/ChunkRecvOperation::RECEIVING_CPU_BUFFER, /*to=*/ChunkRecvOperation::FINISHED, /*cond=*/error_ && op.doneReceivingCpuBuffer && !op.isCpuBuffer && prevOpState >= ChunkRecvOperation::COPYING_FROM_CPU_TO_GPU_AND_INVOKED_CALLBACK, /*actions=*/ {&ChannelImpl::callRecvCallback, &ChannelImpl::returnRecvCpuBuffer}); // This transition shortcuts the copy to GPU when receiving on CPU memory. chunkRecvOps_.attemptTransition( opIter, /*from=*/ChunkRecvOperation::RECEIVING_CPU_BUFFER, /*to=*/ChunkRecvOperation::FINISHED, /*cond=*/op.doneReceivingCpuBuffer && op.isCpuBuffer, /*actions=*/{&ChannelImpl::callRecvCallback}); chunkRecvOps_.attemptTransition( opIter, /*from=*/ChunkRecvOperation::RECEIVING_CPU_BUFFER, /*to=*/ChunkRecvOperation::COPYING_FROM_CPU_TO_GPU, /*cond=*/!error_ && op.doneReceivingCpuBuffer && !op.isCpuBuffer, /*actions=*/{&ChannelImpl::copyFromCpuToGpu}); // See above for why this needs to go after previous op. chunkRecvOps_.attemptTransition( opIter, /*from=*/ChunkRecvOperation::COPYING_FROM_CPU_TO_GPU, /*to=*/ChunkRecvOperation::COPYING_FROM_CPU_TO_GPU_AND_INVOKED_CALLBACK, /*cond=*/prevOpState >= ChunkRecvOperation::COPYING_FROM_CPU_TO_GPU_AND_INVOKED_CALLBACK, /*actions=*/{&ChannelImpl::callRecvCallback}); chunkRecvOps_.attemptTransition( opIter, /*from=*/ChunkRecvOperation::COPYING_FROM_CPU_TO_GPU_AND_INVOKED_CALLBACK, /*to=*/ChunkRecvOperation::FINISHED, /*cond=*/op.doneCopyingFromCpuToGpu, /*actions=*/{&ChannelImpl::returnRecvCpuBuffer}); } void ChannelImpl::readReadyToSend(ChunkRecvOpIter opIter) { ChunkRecvOperation& op = *opIter; TP_VLOG(6) << "Channel " << id_ << " is reading ready-to-send notification for chunk #" << op.chunkId << " of " << op.numChunks << " for buffer #" << op.bufferSequenceNumber; connection_->read(callbackWrapper_( [opIter]( ChannelImpl& impl, const void* /* unused */, size_t /* unused */) { TP_VLOG(6) << "Channel " << impl.id_ << " is done reading ready-to-send notification for 
chunk #" << opIter->chunkId << " of " << opIter->numChunks << " for buffer #" << opIter->bufferSequenceNumber; opIter->doneReadingReadyToSend = true; impl.chunkRecvOps_.advanceOperation(opIter); })); } void ChannelImpl::allocateRecvCpuBuffer(ChunkRecvOpIter opIter) { ChunkRecvOperation& op = *opIter; TP_VLOG(5) << "Channel " << id_ << " is allocating temporary memory for chunk #" << op.chunkId << " of " << op.numChunks << " for buffer #" << op.bufferSequenceNumber; Allocator& cudaHostAllocator = context_->getCudaHostRecvAllocator(op.deviceIdx); cudaHostAllocator.alloc( op.length, callbackWrapper_( [opIter]( ChannelImpl& impl, std::shared_ptr tmpBuffer) mutable { TP_VLOG(5) << "Channel " << impl.id_ << " is done allocating temporary memory for chunk #" << opIter->chunkId << " of " << opIter->numChunks << " for buffer #" << opIter->bufferSequenceNumber; opIter->doneAllocatingCpuStagingBuffer = true; if (!impl.error_) { opIter->tmpBuffer = std::move(tmpBuffer); } impl.chunkRecvOps_.advanceOperation(opIter); })); } void ChannelImpl::receiveCpuBuffer(ChunkRecvOpIter opIter) { ChunkRecvOperation& op = *opIter; TP_VLOG(6) << "Channel " << id_ << " is sending chunk #" << op.chunkId << " of " << op.numChunks << " for buffer #" << op.bufferSequenceNumber << " through CPU channel"; cpuChannel_->recv( CpuBuffer{.ptr = op.isCpuBuffer ? 
op.devicePtr : op.tmpBuffer.get()}, op.length, callbackWrapper_([opIter](ChannelImpl& impl) { TP_VLOG(6) << "Channel " << impl.id_ << " is done sending chunk #" << opIter->chunkId << " of " << opIter->numChunks << " for buffer #" << opIter->bufferSequenceNumber << " through CPU channel"; opIter->doneReceivingCpuBuffer = true; impl.chunkRecvOps_.advanceOperation(opIter); })); } void ChannelImpl::copyFromCpuToGpu(ChunkRecvOpIter opIter) { ChunkRecvOperation& op = *opIter; TP_VLOG(5) << "Channel " << id_ << " is copying chunk #" << op.chunkId << " of " << op.numChunks << " for buffer #" << op.bufferSequenceNumber << " from CPU to CUDA device"; cudaCopy( op.devicePtr, op.tmpBuffer.get(), op.length, op.deviceIdx, op.stream, callbackWrapper_([opIter](ChannelImpl& impl) { TP_VLOG(5) << "Channel " << impl.id_ << " is done copying chunk #" << opIter->chunkId << " of " << opIter->numChunks << " for buffer #" << opIter->bufferSequenceNumber << " from CPU to CUDA device"; opIter->doneCopyingFromCpuToGpu = true; impl.chunkRecvOps_.advanceOperation(opIter); })); } void ChannelImpl::callRecvCallback(ChunkRecvOpIter opIter) { ChunkRecvOperation& op = *opIter; if (op.callback) { op.callback(error_); // Reset callback to release the resources it was holding. op.callback = nullptr; } } void ChannelImpl::returnRecvCpuBuffer(ChunkRecvOpIter opIter) { ChunkRecvOperation& op = *opIter; // The pointer's deleter will return the buffer to the allocator. op.tmpBuffer = nullptr; } void ChannelImpl::setIdImpl() { cpuChannel_->setId(id_ + ".cpu"); } void ChannelImpl::handleErrorImpl() { chunkSendOps_.advanceAllOperations(); chunkRecvOps_.advanceAllOperations(); connection_->close(); cpuChannel_->close(); context_->unenroll(*this); } } // namespace cuda_basic } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cuda_basic/channel_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. 
and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { namespace cuda_basic { class ContextImpl; struct ChunkSendOperation { enum State { UNINITIALIZED, ALLOCATING_CPU_BUFFER, COPYING_FROM_GPU_TO_CPU, INVOKED_CALLBACK, SENDING_CPU_BUFFER, FINISHED }; // Fields used by the state machine uint64_t sequenceNumber{0}; State state{UNINITIALIZED}; // Arguments at creation uint64_t bufferSequenceNumber{0}; bool isCpuBuffer{false}; void* devicePtr{nullptr}; size_t chunkId{0}; size_t numChunks{0}; size_t length{0}; std::function callback; // For CUDA buffers cudaStream_t stream{cudaStreamDefault}; int deviceIdx{0}; // Data collected during processing std::shared_ptr tmpBuffer; // Progress flags bool doneAllocatingCpuStagingBuffer{false}; bool doneCopyingFromGpuToCpu{false}; bool doneSendingCpuBuffer{false}; }; struct ChunkRecvOperation { enum State { UNINITIALIZED, READING_READY_TO_SEND, ALLOCATING_CPU_BUFFER, RECEIVING_CPU_BUFFER, COPYING_FROM_CPU_TO_GPU, COPYING_FROM_CPU_TO_GPU_AND_INVOKED_CALLBACK, FINISHED }; // Fields used by the state machine uint64_t sequenceNumber{0}; State state{UNINITIALIZED}; // Arguments at creation uint64_t bufferSequenceNumber{0}; bool isCpuBuffer{false}; void* devicePtr{nullptr}; size_t chunkId{0}; size_t numChunks{0}; size_t length{0}; std::function callback; // For CUDA buffers cudaStream_t stream{cudaStreamDefault}; int deviceIdx{0}; // Data collected during processing std::shared_ptr tmpBuffer; // Progress flags bool doneReadingReadyToSend{false}; bool doneAllocatingCpuStagingBuffer{false}; bool doneReceivingCpuBuffer{false}; bool doneCopyingFromCpuToGpu{false}; }; class ChannelImpl final : public ChannelImplBoilerplate { public: ChannelImpl( ConstructorToken token, std::shared_ptr 
context, std::string id, std::shared_ptr connection, std::shared_ptr cpuChannel, CudaLoop& cudaLoop); protected: // Implement the entry points called by ChannelImplBoilerplate. void initImplFromLoop() override; void sendImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TSendCallback callback) override; void recvImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TRecvCallback callback) override; void handleErrorImpl() override; void setIdImpl() override; private: const std::shared_ptr connection_; const std::shared_ptr cpuChannel_; CudaLoop& cudaLoop_; // A sequence number for the chunks. uint64_t nextChunkBeingSent_{0}; uint64_t nextChunkBeingReceived_{0}; OpsStateMachine chunkSendOps_{ *this, &ChannelImpl::advanceChunkSendOperation}; using ChunkSendOpIter = decltype(chunkSendOps_)::Iter; OpsStateMachine chunkRecvOps_{ *this, &ChannelImpl::advanceChunkRecvOperation}; using ChunkRecvOpIter = decltype(chunkRecvOps_)::Iter; // State machines for send and recv ops. void advanceChunkSendOperation( ChunkSendOpIter opIter, ChunkSendOperation::State prevOpState); void advanceChunkRecvOperation( ChunkRecvOpIter opIter, ChunkRecvOperation::State prevOpState); // Actions (i.e., methods that begin a state transition). 
// For send operations: void allocateSendCpuBuffer(ChunkSendOpIter opIter); void copyFromGpuToCpu(ChunkSendOpIter opIter); void callSendCallback(ChunkSendOpIter opIter); void sendCpuBuffer(ChunkSendOpIter opIter); void writeReadyToSend(ChunkSendOpIter opIter); void returnSendCpuBuffer(ChunkSendOpIter opIter); // For recv operations: void readReadyToSend(ChunkRecvOpIter opIter); void allocateRecvCpuBuffer(ChunkRecvOpIter opIter); void receiveCpuBuffer(ChunkRecvOpIter opIter); void copyFromCpuToGpu(ChunkRecvOpIter opIter); void callRecvCallback(ChunkRecvOpIter opIter); void returnRecvCpuBuffer(ChunkRecvOpIter opIter); void cudaCopy( void* dst, const void* src, size_t length, int deviceIdx, cudaStream_t stream, std::function callback); }; } // namespace cuda_basic } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cuda_basic/constants.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include namespace tensorpipe { namespace channel { namespace cuda_basic { // FIXME Avoid this anonymous namespace and use inline variables in C++-17. namespace { // Define all three (redundant) values to make them explicit and avoid // misunderstandings due to miscalculations. static constexpr size_t kStagingAreaSize = 16 * 1024 * 1024; static constexpr size_t kSlotSize = 1024 * 1024; static constexpr size_t kNumSlots = 16; static_assert(kStagingAreaSize == kSlotSize * kNumSlots, ""); } // namespace } // namespace cuda_basic } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cuda_basic/context_impl.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. 
* All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { namespace cuda_basic { namespace { struct DeviceDescriptor { std::string deviceType; std::string descriptor; NOP_STRUCTURE(DeviceDescriptor, deviceType, descriptor); }; DeviceDescriptor deserializeDeviceDescriptor( const std::string& deviceDescriptor) { NopHolder nopHolder; loadDescriptor(nopHolder, deviceDescriptor); return std::move(nopHolder.getObject()); } } // namespace std::shared_ptr ContextImpl::create( std::shared_ptr cpuContext) { Error error; CudaLib cudaLib; std::tie(error, cudaLib) = CudaLib::create(); if (error) { TP_VLOG(5) << "CUDA basic channel is not viable because libcuda could not be loaded: " << error.what(); return nullptr; } if (cpuContext->deviceDescriptors().count(Device{kCpuDeviceType, 0}) == 0) { TP_THROW_ASSERT() << "CUDA basic channel needs a CPU channel"; return nullptr; } if (!cpuContext->isViable()) { return nullptr; } std::unordered_map deviceDescriptors; // NOTE: Assume there is only one CPU. 
TP_DCHECK_EQ( cpuContext->deviceDescriptors().count(Device{kCpuDeviceType, 0}), 1); const auto cpuDeviceDescriptor = cpuContext->deviceDescriptors().begin()->second; NopHolder nopHolder; DeviceDescriptor& deviceDescriptor = nopHolder.getObject(); deviceDescriptor.descriptor = cpuDeviceDescriptor; deviceDescriptor.deviceType = kCpuDeviceType; deviceDescriptors[Device{kCpuDeviceType, 0}] = saveDescriptor(nopHolder); for (const auto& device : getCudaDevices(cudaLib)) { deviceDescriptor.deviceType = kCudaDeviceType; deviceDescriptors[device] = saveDescriptor(nopHolder); } return std::make_shared( std::move(cudaLib), std::move(cpuContext), std::move(deviceDescriptors)); } ContextImpl::ContextImpl( CudaLib cudaLib, std::shared_ptr cpuContext, std::unordered_map deviceDescriptors) : ContextImplBoilerplate( std::move(deviceDescriptors)), cudaLib_(std::move(cudaLib)), cpuContext_(std::move(cpuContext)) {} std::shared_ptr ContextImpl::createChannel( std::vector> connections, Endpoint endpoint) { TP_DCHECK_EQ(numConnectionsNeeded(), connections.size()); auto conn = std::move(connections.back()); connections.pop_back(); auto cpuChannel = cpuContext_->createChannel(std::move(connections), endpoint); return createChannelInternal( std::move(conn), std::move(cpuChannel), cudaLoop_); } size_t ContextImpl::numConnectionsNeeded() const { return 1 + cpuContext_->numConnectionsNeeded(); } bool ContextImpl::canCommunicateWithRemote( const std::string& localDeviceDescriptor, const std::string& remoteDeviceDescriptor) const { DeviceDescriptor nopLocalDeviceDescriptor = deserializeDeviceDescriptor(localDeviceDescriptor); DeviceDescriptor nopRemoteDeviceDescriptor = deserializeDeviceDescriptor(remoteDeviceDescriptor); // Prevent CudaBasic from being mistakenly used for CPU to CPU transfers, as // there are always better options. 
  // Tail of canCommunicateWithRemote: refuse CPU-to-CPU pairs (there are
  // always better channels for those), otherwise require matching
  // descriptors.
  if (nopLocalDeviceDescriptor.deviceType == kCpuDeviceType &&
      nopRemoteDeviceDescriptor.deviceType == kCpuDeviceType) {
    return false;
  }
  return nopLocalDeviceDescriptor.descriptor ==
      nopRemoteDeviceDescriptor.descriptor;
}

const CudaLib& ContextImpl::getCudaLib() {
  return cudaLib_;
}

// Lazily creates the pinned-host staging allocator used by the send path.
// NOTE(review): deviceIdx is only honored on the very first call — once the
// allocator exists, subsequent calls with a *different* deviceIdx silently
// get the allocator pinned for the first device. Confirm this is intended
// (or that all callers pass the same index).
Allocator& ContextImpl::getCudaHostSendAllocator(int deviceIdx) {
  if (!cudaHostSendAllocator_.has_value()) {
    CudaPinnedBuffer buffer = makeCudaPinnedBuffer(kStagingAreaSize, deviceIdx);
    uint8_t* ptr = buffer.get();
    cudaHostSendAllocator_.emplace(CudaHostAllocator{
        std::move(buffer), Allocator(ptr, kNumSlots, kSlotSize)});
  }
  return cudaHostSendAllocator_->allocator;
}

// Lazily creates the pinned-host staging allocator used by the recv path.
// NOTE(review): same deviceIdx caveat as getCudaHostSendAllocator above.
Allocator& ContextImpl::getCudaHostRecvAllocator(int deviceIdx) {
  if (!cudaHostRecvAllocator_.has_value()) {
    CudaPinnedBuffer buffer = makeCudaPinnedBuffer(kStagingAreaSize, deviceIdx);
    uint8_t* ptr = buffer.get();
    cudaHostRecvAllocator_.emplace(CudaHostAllocator{
        std::move(buffer), Allocator(ptr, kNumSlots, kSlotSize)});
  }
  return cudaHostRecvAllocator_->allocator;
}

// Closes the wrapped CPU context, the CUDA loop, and (if created) both
// staging allocators.
void ContextImpl::handleErrorImpl() {
  if (cpuContext_ != nullptr) {
    cpuContext_->close();
  }
  cudaLoop_.close();
  if (cudaHostSendAllocator_.has_value()) {
    cudaHostSendAllocator_->allocator.close();
  }
  if (cudaHostRecvAllocator_.has_value()) {
    cudaHostRecvAllocator_->allocator.close();
  }
}

void ContextImpl::joinImpl() {
  if (cpuContext_ != nullptr) {
    cpuContext_->join();
  }
  cudaLoop_.join();
}

// DeferredExecutor interface: delegate to the on-demand loop.
bool ContextImpl::inLoop() const {
  return loop_.inLoop();
};

void ContextImpl::deferToLoop(std::function fn) {
  loop_.deferToLoop(std::move(fn));
};

void ContextImpl::setIdImpl() {
  cpuContext_->setId(id_ + ".cpu");
}

} // namespace cuda_basic
} // namespace channel
} // namespace tensorpipe

================================================
FILE: tensorpipe/channel/cuda_basic/context_impl.h
================================================
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
* * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { namespace cuda_basic { class ChannelImpl; class ContextImpl final : public ContextImplBoilerplate { public: static std::shared_ptr create( std::shared_ptr cpuContext); ContextImpl( CudaLib cudaLib, std::shared_ptr cpuContext, std::unordered_map deviceDescriptors); std::shared_ptr createChannel( std::vector> connections, Endpoint endpoint); size_t numConnectionsNeeded() const override; bool canCommunicateWithRemote( const std::string& localDeviceDescriptor, const std::string& remoteDeviceDescriptor) const override; const CudaLib& getCudaLib(); Allocator& getCudaHostSendAllocator(int deviceIdx); Allocator& getCudaHostRecvAllocator(int deviceIdx); // Implement the DeferredExecutor interface. bool inLoop() const override; void deferToLoop(std::function fn) override; protected: // Implement the entry points called by ContextImplBoilerplate. void handleErrorImpl() override; void joinImpl() override; void setIdImpl() override; private: OnDemandDeferredExecutor loop_; const CudaLib cudaLib_; const std::shared_ptr cpuContext_; // TODO: Lazy initialization of cuda loop. CudaLoop cudaLoop_; struct CudaHostAllocator { CudaPinnedBuffer buffer; Allocator allocator; }; optional cudaHostSendAllocator_; optional cudaHostRecvAllocator_; }; } // namespace cuda_basic } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cuda_basic/factory.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #include namespace tensorpipe { namespace channel { namespace cuda_basic { std::shared_ptr create(std::shared_ptr cpuContext) { return std::make_shared>( std::move(cpuContext)); } } // namespace cuda_basic } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cuda_basic/factory.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include namespace tensorpipe { namespace channel { namespace cuda_basic { std::shared_ptr create(std::shared_ptr cpuContext); } // namespace cuda_basic } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cuda_gdr/channel_impl.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { namespace cuda_gdr { namespace { size_t ceilOfRatio(size_t n, size_t d) { return (n + d - 1) / d; } } // namespace ChannelImpl::ChannelImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::shared_ptr descriptorConnection, std::shared_ptr readyToReceiveConnection) : ChannelImplBoilerplate( token, std::move(context), std::move(id)), descriptorConnection_(std::move(descriptorConnection)), readyToReceiveConnection_(std::move(readyToReceiveConnection)) {} void ChannelImpl::initImplFromLoop() { TP_DCHECK(context_->inLoop()); TP_DCHECK_EQ(state_, INITIALIZING); TP_DCHECK(!error_); context_->enroll(*this); localGpuToNic_ = context_->getGpuToNicMapping(); numLocalNics_ = *std::max_element(localGpuToNic_.begin(), localGpuToNic_.end()) + 1; auto nopHolderOut = std::make_shared>(); HandshakeNumNics& nopHandshakeNumNics = nopHolderOut->getObject(); nopHandshakeNumNics.numNics = numLocalNics_; TP_VLOG(6) << "Channel " << id_ << " is writing nop object (handshake num NICs)"; readyToReceiveConnection_->write( *nopHolderOut, callbackWrapper_([nopHolderOut](ChannelImpl& impl) { TP_VLOG(6) << "Channel " << impl.id_ << " done writing nop object (handshake num NICs)"; })); auto nopHolderIn = std::make_shared>(); TP_VLOG(6) << "Channel " << id_ << " is reading nop object (handshake num NICs)"; readyToReceiveConnection_->read( *nopHolderIn, callbackWrapper_([nopHolderIn](ChannelImpl& impl) { TP_VLOG(6) << "Channel " << impl.id_ << " done reading nop object (handshake num NICs)"; if (!impl.error_) { impl.onReadHandshakeNumNics(nopHolderIn->getObject()); } })); state_ = WAITING_FOR_HANDSHAKE_NUM_NICS; } void ChannelImpl::onReadHandshakeNumNics( const HandshakeNumNics& nopHandshakeNumNics) { TP_DCHECK(context_->inLoop()); TP_DCHECK_EQ(state_, WAITING_FOR_HANDSHAKE_NUM_NICS); TP_DCHECK(!error_); numRemoteNics_ = 
// NOTE(review): this chunk was re-formatted from a mechanically extracted
// dump; several template argument lists (e.g. "std::vector>",
// "std::make_shared>", "buffer.unwrap()") were stripped by the extraction
// and must be restored before this compiles. No non-comment tokens were
// altered here.

// (Tail of onReadHandshakeNumNics, whose head lies above this chunk: the
// remote NIC count advertised in handshake round one is being read off.)
nopHandshakeNumNics.numNics;

// Setup info for every local queue pair, indexed first by local NIC and then
// by remote NIC, to be shipped to the remote side in handshake round two.
std::vector> allSetupInfo;

// One queue pair for each (local NIC, remote NIC) combination.
queuePairs_.resize(numLocalNics_);
allSetupInfo.resize(numLocalNics_);
for (size_t localNicIdx = 0; localNicIdx < numLocalNics_; localNicIdx++) {
  queuePairs_[localNicIdx].resize(numRemoteNics_);
  allSetupInfo[localNicIdx].resize(numRemoteNics_);
  IbvNic& localNic = context_->getIbvNic(localNicIdx);
  for (size_t remoteNicIdx = 0; remoteNicIdx < numRemoteNics_;
       remoteNicIdx++) {
    // Reliable-connected QP sharing the NIC's completion queue for both
    // directions; capacities match the channel-wide send/recv slot counts.
    IbvLib::qp_init_attr initAttr;
    std::memset(&initAttr, 0, sizeof(initAttr));
    initAttr.qp_type = IbvLib::QPT_RC;
    initAttr.send_cq = localNic.getIbvCq().get();
    initAttr.recv_cq = localNic.getIbvCq().get();
    initAttr.cap.max_send_wr = kNumSends;
    initAttr.cap.max_send_sge = 1;
    initAttr.cap.max_recv_wr = kNumRecvs;
    initAttr.cap.max_recv_sge = 1;
    // Request a completion for every posted send.
    initAttr.sq_sig_all = 1;
    IbvQueuePair qp = createIbvQueuePair(
        context_->getIbvLib(), localNic.getIbvPd(), initAttr);

    transitionIbvQueuePairToInit(
        context_->getIbvLib(), qp, localNic.getIbvAddress());

    IbvSetupInformation setupInfo =
        makeIbvSetupInformation(localNic.getIbvAddress(), qp);

    // The maximum message size will be filled in later.
    queuePairs_[localNicIdx][remoteNicIdx] =
        QueuePair{std::move(qp), /*maximumMessageSize=*/0};
    allSetupInfo[localNicIdx][remoteNicIdx].fromIbvSetupInformation(
        setupInfo);
  }
}

// Handshake round two: exchange per-queue-pair setup info with the remote.
auto nopHolderOut = std::make_shared>();
HandshakeSetupInfo& nopHandshakeSetupInfo = nopHolderOut->getObject();
nopHandshakeSetupInfo.setupInfo = std::move(allSetupInfo);
TP_VLOG(6) << "Channel " << id_ << " is writing nop object (handshake two)";
readyToReceiveConnection_->write(
    *nopHolderOut, callbackWrapper_([nopHolderOut](ChannelImpl& impl) {
      TP_VLOG(6) << "Channel " << impl.id_
                 << " done writing nop object (handshake two)";
    }));

auto nopHolderIn = std::make_shared>();
TP_VLOG(6) << "Channel " << id_ << " is reading nop object (handshake two)";
readyToReceiveConnection_->read(
    *nopHolderIn, callbackWrapper_([nopHolderIn](ChannelImpl& impl) {
      TP_VLOG(6) << "Channel " << impl.id_
                 << " done reading nop object (handshake two)";
      if (!impl.error_) {
        impl.onReadHandshakeSetupInfo(nopHolderIn->getObject());
      }
    }));

state_ = WAITING_FOR_HANDSHAKE_SETUP_INFO;
}

// Handshake round two reply: bring every queue pair to a fully connected
// (ready-to-send) state using the remote's setup info, then unblock any
// send/recv operations that were queued while the handshake was in flight.
void ChannelImpl::onReadHandshakeSetupInfo(
    const HandshakeSetupInfo& nopHandshakeSetupInfo) {
  TP_DCHECK(context_->inLoop());
  TP_DCHECK_EQ(state_, WAITING_FOR_HANDSHAKE_SETUP_INFO);
  TP_DCHECK(!error_);

  // Note the transposed indexing below: the remote's outer index is *its*
  // local NIC, which from our perspective is the remote NIC.
  const std::vector>& remoteSetupInfo = nopHandshakeSetupInfo.setupInfo;

  TP_DCHECK_EQ(remoteSetupInfo.size(), numRemoteNics_);
  for (size_t remoteNicIdx = 0; remoteNicIdx < numRemoteNics_;
       remoteNicIdx++) {
    TP_DCHECK_EQ(remoteSetupInfo[remoteNicIdx].size(), numLocalNics_);
    for (size_t localNicIdx = 0; localNicIdx < numLocalNics_; localNicIdx++) {
      IbvNic& localNic = context_->getIbvNic(localNicIdx);
      IbvSetupInformation setupInfo =
          remoteSetupInfo[remoteNicIdx][localNicIdx].toIbvSetupInformation();

      const IbvAddress& localAddress = localNic.getIbvAddress();
      transitionIbvQueuePairToReadyToReceive(
          context_->getIbvLib(),
          queuePairs_[localNicIdx][remoteNicIdx].queuePair,
          localAddress,
          setupInfo);
      transitionIbvQueuePairToReadyToSend(
          context_->getIbvLib(),
          queuePairs_[localNicIdx][remoteNicIdx].queuePair);

      // Chunking limit for this QP: the stricter of the two sides' limits.
      queuePairs_[localNicIdx][remoteNicIdx].maximumMessageSize = std::min(
          localAddress.maximumMessageSize, setupInfo.maximumMessageSize);
    }
  }

  state_ = ESTABLISHED;
  sendOps_.advanceAllOperations();
  recvOps_.advanceAllOperations();
}

// Entry point for a send: resolve which GPU the buffer lives on and which NIC
// serves that GPU, enqueue the operation, and record a CUDA event on the
// producer stream so the transfer only starts once the data is ready.
void ChannelImpl::sendImplFromLoop(
    uint64_t sequenceNumber,
    Buffer buffer,
    size_t length,
    TSendCallback callback) {
  size_t localGpuIdx = cudaDeviceForPointer(
      context_->getCudaLib(), buffer.unwrap().ptr);
  size_t localNicIdx = context_->getGpuToNicMapping()[localGpuIdx];

  SendOpIter opIter = sendOps_.emplaceBack(
      sequenceNumber,
      buffer.unwrap(),
      length,
      std::move(callback),
      localGpuIdx,
      localNicIdx);
  opIter->event.record(buffer.unwrap().stream);

  sendOps_.advanceOperation(opIter);
}

// State machine driver for one send operation. Each attemptTransition either
// aborts the op early on error or moves it one step forward; the prevOpState
// guards keep operations strictly ordered where needed.
void ChannelImpl::advanceSendOperation(
    SendOpIter opIter,
    SendOperation::State prevOpState) {
  TP_DCHECK(context_->inLoop());
  SendOperation& op = *opIter;

  // Zero-length sends (or sends arriving after an error) complete at once.
  sendOps_.attemptTransition(
      opIter,
      /*from=*/SendOperation::UNINITIALIZED,
      /*to=*/SendOperation::FINISHED,
      /*cond=*/error_ || op.length == 0,
      /*actions=*/{&ChannelImpl::callSendCallback});

  // Needs to go after previous op to ensure predictable and consistent ordering
  // of write calls on the descriptor control connection and read calls on the
  // completion control connection.
// (Continuation of ChannelImpl::advanceSendOperation.)
  sendOps_.attemptTransition(
      opIter,
      /*from=*/SendOperation::UNINITIALIZED,
      /*to=*/SendOperation::READING_READY_TO_RECEIVE,
      /*cond=*/!error_ && state_ == ESTABLISHED &&
          prevOpState >= SendOperation::READING_READY_TO_RECEIVE,
      /*actions=*/
      {&ChannelImpl::writeDescriptor, &ChannelImpl::readReadyToReceive});

  sendOps_.attemptTransition(
      opIter,
      /*from=*/SendOperation::READING_READY_TO_RECEIVE,
      /*to=*/SendOperation::FINISHED,
      /*cond=*/error_ && op.doneReadingReadyToReceive,
      /*actions=*/{&ChannelImpl::callSendCallback});

  // This doesn't strictly need to go after the previous op, but it doesn't make
  // sense to busy poll multiple events if only one of them is actually able to
  // then make progress.
  sendOps_.attemptTransition(
      opIter,
      /*from=*/SendOperation::READING_READY_TO_RECEIVE,
      /*to=*/SendOperation::WAITING_FOR_CUDA_EVENT,
      /*cond=*/!error_ && op.doneReadingReadyToReceive &&
          prevOpState >= SendOperation::SENDING_OVER_IB,
      /*actions=*/{&ChannelImpl::waitForSendCudaEvent});

  sendOps_.attemptTransition(
      opIter,
      /*from=*/SendOperation::WAITING_FOR_CUDA_EVENT,
      /*to=*/SendOperation::FINISHED,
      /*cond=*/error_ && op.doneWaitingForCudaEvent,
      /*actions=*/{&ChannelImpl::callSendCallback});

  // Needs to go after previous op to ensure predictable and consistent ordering
  // of send calls on InfiniBand queue pair.
  sendOps_.attemptTransition(
      opIter,
      /*from=*/SendOperation::WAITING_FOR_CUDA_EVENT,
      /*to=*/SendOperation::SENDING_OVER_IB,
      /*cond=*/!error_ && op.doneWaitingForCudaEvent &&
          prevOpState >= SendOperation::SENDING_OVER_IB,
      /*actions=*/{&ChannelImpl::sendOverIb});

  sendOps_.attemptTransition(
      opIter,
      /*from=*/SendOperation::SENDING_OVER_IB,
      /*to=*/SendOperation::FINISHED,
      /*cond=*/op.numChunksBeingSent == 0,
      /*actions=*/{&ChannelImpl::callSendCallback});
}

// Tell the receiver which local NIC this send originates from, so it can pick
// the matching queue pair on its side.
// NOTE(review): "std::make_shared>" lost its template arguments in
// extraction (a NopHolder of the Descriptor nop struct).
void ChannelImpl::writeDescriptor(SendOpIter opIter) {
  TP_DCHECK(context_->inLoop());
  SendOperation& op = *opIter;

  auto nopHolder = std::make_shared>();
  Descriptor& nopDescriptor = nopHolder->getObject();
  nopDescriptor.originNicIdx = op.localNicIdx;

  TP_VLOG(6) << "Channel " << id_ << " is writing descriptor (#"
             << op.sequenceNumber << ")";
  descriptorConnection_->write(
      *nopHolder,
      callbackWrapper_([sequenceNumber{op.sequenceNumber},
                        nopHolder](ChannelImpl& impl) {
        TP_VLOG(6) << "Channel " << impl.id_
                   << " done writing descriptor (# " << sequenceNumber << ")";
      }));
}

// Wait for the receiver to announce that it has posted its recv work requests;
// its reply also tells us which remote NIC (and hence which QP) to send on.
void ChannelImpl::readReadyToReceive(SendOpIter opIter) {
  TP_DCHECK(context_->inLoop());
  SendOperation& op = *opIter;

  auto nopHolderIn = std::make_shared>();
  TP_VLOG(6) << "Channel " << id_ << " is reading ready-to-receive (#"
             << op.sequenceNumber << ")";
  readyToReceiveConnection_->read(
      *nopHolderIn, callbackWrapper_([opIter, nopHolderIn](ChannelImpl& impl) {
        TP_VLOG(6) << "Channel " << impl.id_
                   << " done reading ready-to-receive (# "
                   << opIter->sequenceNumber << ")";
        opIter->doneReadingReadyToReceive = true;
        if (!impl.error_) {
          const auto& readyToReceive = nopHolderIn->getObject();
          opIter->remoteNicIdx = readyToReceive.destinationNicIdx;
        }
        impl.sendOps_.advanceOperation(opIter);
      }));
}

// Busy-poll the CUDA event recorded on the producer stream so the IB transfer
// only starts after the data to send has actually been produced.
void ChannelImpl::waitForSendCudaEvent(SendOpIter opIter) {
  TP_DCHECK(context_->inLoop());
  SendOperation& op = *opIter;

  TP_VLOG(6) << "Channel " << id_ << " is waiting for CUDA event to send (#"
             << op.sequenceNumber << ")";
  context_->waitForCudaEvent(
      op.event,
      callbackWrapper_([opIter](ChannelImpl& impl) {
        TP_VLOG(6) << "Channel " << impl.id_
                   << " done waiting for CUDA event to send (# "
                   << opIter->sequenceNumber << ")";
        opIter->doneWaitingForCudaEvent = true;
        impl.sendOps_.advanceOperation(opIter);
      }));
}

// Split the tensor into chunks no larger than this QP's maximum message size
// and post one IB send per chunk, registering the buffer on first use.
// NOTE(review): "reinterpret_cast(op.buffer.ptr)" lost its target type in
// extraction; it must cast to a byte pointer for the chunk arithmetic.
void ChannelImpl::sendOverIb(SendOpIter opIter) {
  TP_DCHECK(context_->inLoop());
  SendOperation& op = *opIter;

  IbvNic& localNic = context_->getIbvNic(op.localNicIdx);
  IbvQueuePair& qp = queuePairs_[op.localNicIdx][op.remoteNicIdx].queuePair;
  size_t chunkSize =
      queuePairs_[op.localNicIdx][op.remoteNicIdx].maximumMessageSize;

  // This could be VEEERY slow the first time we encounter the buffer, but the
  // result will be cached and subsequent calls will be much faster.
  IbvMemoryRegion& mr = localNic.registerMemory(op.buffer);

  size_t numChunks = ceilOfRatio(op.length, chunkSize);
  for (size_t chunkIdx = 0; chunkIdx < numChunks; chunkIdx++) {
    IbvNic::SendInfo info;
    info.addr = reinterpret_cast(op.buffer.ptr) + chunkIdx * chunkSize;
    // The last chunk may be shorter than chunkSize.
    info.length = std::min(op.length - chunkIdx * chunkSize, chunkSize);
    info.lkey = mr->lkey;

    TP_VLOG(6) << "Channel " << id_ << " is sending chunk #" << chunkIdx
               << " (out of " << numChunks << ") of tensor #"
               << op.sequenceNumber << " on QP " << qp->qp_num;
    localNic.postSend(
        qp, info, callbackWrapper_([opIter, chunkIdx](ChannelImpl& impl) {
          TP_VLOG(6) << "Channel " << impl.id_ << " done sending chunk #"
                     << chunkIdx << " of tensor #" << opIter->sequenceNumber;
          opIter->numChunksBeingSent--;
          impl.sendOps_.advanceOperation(opIter);
          // Cleanup is deferred until all in-flight IB work has completed.
          impl.numSendsInFlight_--;
          impl.tryCleanup();
        }));
    op.numChunksBeingSent++;
    numSendsInFlight_++;
  }
}

// Deliver the final outcome of a send to the user.
void ChannelImpl::callSendCallback(SendOpIter opIter) {
  TP_DCHECK(context_->inLoop());
  SendOperation& op = *opIter;

  op.callback(error_);
  // Reset callback to release the resources it was holding.
// (Tail of ChannelImpl::callSendCallback: dropping the user callback releases
// any resources it captured.)
op.callback = nullptr;
}

// Entry point for a receive: resolve which GPU the buffer lives on and which
// NIC serves that GPU, enqueue the operation, and record a CUDA event on the
// consumer stream so the buffer is only handed to the NIC once prior work on
// that stream has been issued.
void ChannelImpl::recvImplFromLoop(
    uint64_t sequenceNumber,
    Buffer buffer,
    size_t length,
    TRecvCallback callback) {
  size_t localGpuIdx = cudaDeviceForPointer(
      context_->getCudaLib(), buffer.unwrap().ptr);
  size_t localNicIdx = context_->getGpuToNicMapping()[localGpuIdx];

  RecvOpIter opIter = recvOps_.emplaceBack(
      sequenceNumber,
      buffer.unwrap(),
      length,
      std::move(callback),
      localGpuIdx,
      localNicIdx);
  opIter->event.record(buffer.unwrap().stream);

  recvOps_.advanceOperation(opIter);
}

// State machine driver for one receive operation (mirror of
// advanceSendOperation).
void ChannelImpl::advanceRecvOperation(
    RecvOpIter opIter,
    RecvOperation::State prevOpState) {
  TP_DCHECK(context_->inLoop());
  RecvOperation& op = *opIter;

  // Zero-length receives (or ones arriving after an error) complete at once.
  recvOps_.attemptTransition(
      opIter,
      /*from=*/RecvOperation::UNINITIALIZED,
      /*to=*/RecvOperation::FINISHED,
      /*cond=*/error_ || op.length == 0,
      /*actions=*/{&ChannelImpl::callRecvCallback});

  // Needs to go after previous op to ensure predictable and consistent ordering
  // of write calls on the descriptor control connection.
  recvOps_.attemptTransition(
      opIter,
      /*from=*/RecvOperation::UNINITIALIZED,
      /*to=*/RecvOperation::READING_DESCRIPTOR,
      /*cond=*/!error_ && state_ == ESTABLISHED &&
          prevOpState >= RecvOperation::READING_DESCRIPTOR,
      /*actions=*/{&ChannelImpl::readDescriptor});

  recvOps_.attemptTransition(
      opIter,
      /*from=*/RecvOperation::READING_DESCRIPTOR,
      /*to=*/RecvOperation::FINISHED,
      /*cond=*/error_ && op.doneReadingDescriptor,
      /*actions=*/{&ChannelImpl::callRecvCallback});

  // This doesn't strictly need to go after the previous op, but it doesn't make
  // sense to busy poll multiple events if only one of them is actually able to
  // then make progress.
  recvOps_.attemptTransition(
      opIter,
      /*from=*/RecvOperation::READING_DESCRIPTOR,
      /*to=*/RecvOperation::WAITING_FOR_CUDA_EVENT,
      /*cond=*/!error_ && op.doneReadingDescriptor &&
          prevOpState >= RecvOperation::RECEIVING_OVER_IB,
      /*actions=*/{&ChannelImpl::waitForRecvCudaEvent});

  recvOps_.attemptTransition(
      opIter,
      /*from=*/RecvOperation::WAITING_FOR_CUDA_EVENT,
      /*to=*/RecvOperation::FINISHED,
      /*cond=*/error_ && op.doneWaitingForCudaEvent,
      /*actions=*/{&ChannelImpl::callRecvCallback});

  // Needs to go after previous op to ensure predictable and consistent ordering
  // of recv calls on InfiniBand queue pair and write calls on the completion
  // control connection.
  recvOps_.attemptTransition(
      opIter,
      /*from=*/RecvOperation::WAITING_FOR_CUDA_EVENT,
      /*to=*/RecvOperation::RECEIVING_OVER_IB,
      /*cond=*/!error_ && op.doneWaitingForCudaEvent &&
          prevOpState >= RecvOperation::RECEIVING_OVER_IB,
      /*actions=*/{&ChannelImpl::recvOverIbAndWriteReadyToRecive});

  recvOps_.attemptTransition(
      opIter,
      /*from=*/RecvOperation::RECEIVING_OVER_IB,
      /*to=*/RecvOperation::FINISHED,
      /*cond=*/op.numChunksBeingReceived == 0,
      /*actions=*/{&ChannelImpl::callRecvCallback});
}

// Learn from the sender which of its NICs the data will come from, so we know
// which of our queue pairs to post the receives on.
void ChannelImpl::readDescriptor(RecvOpIter opIter) {
  TP_DCHECK(context_->inLoop());
  RecvOperation& op = *opIter;

  TP_VLOG(6) << "Channel " << id_ << " is reading descriptor (#"
             << op.sequenceNumber << ")";
  auto nopHolderIn = std::make_shared>();
  descriptorConnection_->read(
      *nopHolderIn, callbackWrapper_([opIter, nopHolderIn](ChannelImpl& impl) {
        TP_VLOG(6) << "Channel " << impl.id_ << " done reading descriptor (# "
                   << opIter->sequenceNumber << ")";
        opIter->doneReadingDescriptor = true;
        if (!impl.error_) {
          Descriptor& nopDescriptor = nopHolderIn->getObject();
          opIter->remoteNicIdx = nopDescriptor.originNicIdx;
        }
        impl.recvOps_.advanceOperation(opIter);
      }));
}

// Busy-poll the CUDA event recorded on the consumer stream before exposing the
// target buffer to the NIC.
// NOTE(review): the log string below was split across a line break by
// extraction; it should read "... is waiting for CUDA event to recv (#".
void ChannelImpl::waitForRecvCudaEvent(RecvOpIter opIter) {
  TP_DCHECK(context_->inLoop());
  RecvOperation& op = *opIter;

  TP_VLOG(6) << "Channel " << id_ << " is waiting for CUDA event 
to recv (#" << op.sequenceNumber << ")";
  context_->waitForCudaEvent(
      op.event,
      callbackWrapper_([opIter](ChannelImpl& impl) {
        TP_VLOG(6) << "Channel " << impl.id_
                   << " done waiting for CUDA event to recv (# "
                   << opIter->sequenceNumber << ")";
        opIter->doneWaitingForCudaEvent = true;
        impl.recvOps_.advanceOperation(opIter);
      }));
}

// Post one IB recv per chunk on the QP matching the sender's NIC, then tell
// the sender we are ready (and on which of our NICs we expect the data).
// (The identifier misspells "Receive"; kept, as callers use this exact name.)
void ChannelImpl::recvOverIbAndWriteReadyToRecive(RecvOpIter opIter) {
  TP_DCHECK(context_->inLoop());
  RecvOperation& op = *opIter;

  IbvNic& localNic = context_->getIbvNic(op.localNicIdx);
  IbvQueuePair& qp = queuePairs_[op.localNicIdx][op.remoteNicIdx].queuePair;
  size_t chunkSize =
      queuePairs_[op.localNicIdx][op.remoteNicIdx].maximumMessageSize;

  // This could be VEEERY slow the first time we encounter the buffer, but the
  // result will be cached and subsequent calls will be much faster.
  IbvMemoryRegion& mr = localNic.registerMemory(op.buffer);

  size_t numChunks = ceilOfRatio(op.length, chunkSize);
  for (size_t chunkIdx = 0; chunkIdx < numChunks; chunkIdx++) {
    IbvNic::RecvInfo info;
    info.addr = reinterpret_cast(op.buffer.ptr) + chunkIdx * chunkSize;
    // The last chunk may be shorter than chunkSize.
    info.length = std::min(op.length - chunkIdx * chunkSize, chunkSize);
    info.lkey = mr->lkey;

    TP_VLOG(6) << "Channel " << id_ << " is receiving chunk #" << chunkIdx
               << " (out of " << numChunks << ") of tensor #"
               << op.sequenceNumber << " on QP " << qp->qp_num;
    localNic.postRecv(
        qp, info, callbackWrapper_([opIter, chunkIdx](ChannelImpl& impl) {
          TP_VLOG(6) << "Channel " << impl.id_ << " done receiving chunk #"
                     << chunkIdx << " of tensor #" << opIter->sequenceNumber;
          opIter->numChunksBeingReceived--;
          impl.recvOps_.advanceOperation(opIter);
          // Cleanup is deferred until all in-flight IB work has completed.
          impl.numRecvsInFlight_--;
          impl.tryCleanup();
        }));
    op.numChunksBeingReceived++;
    numRecvsInFlight_++;
  }

  // Recvs are posted; let the sender know it may start transmitting.
  auto nopHolderOut = std::make_shared>();
  ReadyToReceive& nopReadyToReceive = nopHolderOut->getObject();
  nopReadyToReceive.destinationNicIdx = op.localNicIdx;
  TP_VLOG(6) << "Channel " << id_ << " is writing ready-to-receive (#"
             << op.sequenceNumber << ")";
  readyToReceiveConnection_->write(
      *nopHolderOut,
      callbackWrapper_([sequenceNumber{opIter->sequenceNumber},
                        nopHolderOut](ChannelImpl& impl) {
        TP_VLOG(6) << "Channel " << impl.id_
                   << " done writing ready-to-receive (#" << sequenceNumber
                   << ")";
      }));
}

// Deliver the final outcome of a receive to the user.
void ChannelImpl::callRecvCallback(RecvOpIter opIter) {
  TP_DCHECK(context_->inLoop());
  RecvOperation& op = *opIter;

  op.callback(error_);
  // Reset callback to release the resources it was holding.
  op.callback = nullptr;
}

// Tear down on error: flush both state machines, push every queue pair into
// the error state so outstanding work requests flush out, and close the two
// control connections.
void ChannelImpl::handleErrorImpl() {
  sendOps_.advanceAllOperations();
  recvOps_.advanceAllOperations();

  for (size_t localNicIdx = 0; localNicIdx < numLocalNics_; localNicIdx++) {
    for (size_t remoteNicIdx = 0; remoteNicIdx < numRemoteNics_;
         remoteNicIdx++) {
      transitionIbvQueuePairToError(
          context_->getIbvLib(),
          queuePairs_[localNicIdx][remoteNicIdx].queuePair);
    }
  }

  tryCleanup();

  descriptorConnection_->close();
  readyToReceiveConnection_->close();
}

// Run cleanup only once all in-flight IB sends/recvs have completed.
void ChannelImpl::tryCleanup() {
  TP_DCHECK(context_->inLoop());

  if (error_) {
    if (numSendsInFlight_ == 0 && numRecvsInFlight_ == 0) {
      cleanup();
    } else {
      TP_VLOG(9) << "Connection " << id_
                 << " cannot proceed to cleanup because it has "
                 << numSendsInFlight_ << " pending send requests and "
                 << numRecvsInFlight_ << " pending recv requests";
    }
  }
}

// Release the queue pairs and detach from the context.
void ChannelImpl::cleanup() {
  TP_DCHECK(context_->inLoop());
  TP_VLOG(8) << "Connection " << id_ << " is cleaning up";

  queuePairs_.clear();
  context_->unenroll(*this);
}

} // namespace cuda_gdr
} // namespace channel
} // namespace tensorpipe

================================================
FILE: tensorpipe/channel/cuda_gdr/channel_impl.h
================================================
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

// NOTE(review): the header names of these includes were lost in extraction;
// they must be restored (standard library, CUDA, and tensorpipe headers).
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

namespace tensorpipe {
namespace channel {
namespace cuda_gdr {

class ContextImpl;

// Ideally we would use NOP_EXTERNAL_STRUCTURE instead of defining the following
// two structs, but we tried so in D26460332 and failed because a bug in GCC 5.5
// (and probably other versions) requires every nop structure used inside a
// std::vector to have an explicit non-defaulted default constructor, which is
// something we cannot do with NOP_EXTERNAL_STRUCTURE and forces us to re-define
// separate structs.

// Replicate the IbvLib::gid struct so we can serialize it with libnop.
struct NopIbvGid {
  uint64_t subnetPrefix;
  uint64_t interfaceId;
  NOP_STRUCTURE(NopIbvGid, subnetPrefix, interfaceId);

  // Copy the two 64-bit halves out of the ibverbs GID union.
  void fromIbvGid(const IbvLib::gid& globalIdentifier) {
    subnetPrefix = globalIdentifier.global.subnet_prefix;
    interfaceId = globalIdentifier.global.interface_id;
  }

  IbvLib::gid toIbvGid() const {
    IbvLib::gid globalIdentifier;
    globalIdentifier.global.subnet_prefix = subnetPrefix;
    globalIdentifier.global.interface_id = interfaceId;
    return globalIdentifier;
  }
};

// Replicate the IbvSetupInformation struct so we can serialize it with libnop.
struct NopIbvSetupInformation {
  // This pointless constructor is needed to work around a bug in GCC 5.5 (and
  // possibly other versions). It appears to be needed in the nop types that
  // are used inside std::vectors.
  NopIbvSetupInformation() {}

  uint32_t localIdentifier;
  NopIbvGid globalIdentifier;
  uint32_t queuePairNumber;
  IbvLib::mtu maximumTransmissionUnit;
  uint32_t maximumMessageSize;
  NOP_STRUCTURE(
      NopIbvSetupInformation,
      localIdentifier,
      globalIdentifier,
      queuePairNumber,
      maximumTransmissionUnit,
      maximumMessageSize);

  void fromIbvSetupInformation(const IbvSetupInformation& setupInfo) {
    localIdentifier = setupInfo.localIdentifier;
    globalIdentifier.fromIbvGid(setupInfo.globalIdentifier);
    queuePairNumber = setupInfo.queuePairNumber;
    maximumTransmissionUnit = setupInfo.maximumTransmissionUnit;
    maximumMessageSize = setupInfo.maximumMessageSize;
  }

  IbvSetupInformation toIbvSetupInformation() const {
    IbvSetupInformation setupInfo;
    setupInfo.localIdentifier = localIdentifier;
    setupInfo.globalIdentifier = globalIdentifier.toIbvGid();
    setupInfo.queuePairNumber = queuePairNumber;
    setupInfo.maximumTransmissionUnit = maximumTransmissionUnit;
    setupInfo.maximumMessageSize = maximumMessageSize;
    return setupInfo;
  }
};

// Bookkeeping for one in-flight send, driven by ChannelImpl's state machine.
struct SendOperation {
  enum State {
    UNINITIALIZED,
    READING_READY_TO_RECEIVE,
    WAITING_FOR_CUDA_EVENT,
    SENDING_OVER_IB,
    FINISHED
  };

  // Fields used by the state machine
  uint64_t sequenceNumber{0};
  State state{UNINITIALIZED};

  // Progress flags
  bool doneReadingReadyToReceive{false};
  bool doneWaitingForCudaEvent{false};
  uint64_t numChunksBeingSent{0};

  // Arguments at creation
  const CudaBuffer buffer;
  const size_t length;
  const size_t localNicIdx;
  TSendCallback callback;

  // Other stuff
  CudaEvent event;
  size_t remoteNicIdx;

  SendOperation(
      CudaBuffer buffer,
      size_t length,
      TSendCallback callback,
      size_t localGpuIdx,
      size_t localNicIdx)
      : buffer(buffer),
        length(length),
        localNicIdx(localNicIdx),
        callback(std::move(callback)),
        event(localGpuIdx) {}
};

// Bookkeeping for one in-flight receive.
// NOTE(review): the callback member is declared as TSendCallback rather than
// TRecvCallback — presumably the two aliases share a signature; confirm
// against the channel API header.
struct RecvOperation {
  enum State {
    UNINITIALIZED,
    READING_DESCRIPTOR,
    WAITING_FOR_CUDA_EVENT,
    RECEIVING_OVER_IB,
    FINISHED
  };

  // Fields used by the state machine
  uint64_t sequenceNumber{0};
  State state{UNINITIALIZED};

  // Progress flags
  bool doneReadingDescriptor{false};
  bool doneWaitingForCudaEvent{false};
  uint64_t numChunksBeingReceived{0};

  // Arguments at creation
  const CudaBuffer buffer;
  const size_t length;
  const size_t localNicIdx;
  TSendCallback callback;

  // Other stuff
  size_t remoteNicIdx;
  CudaEvent event;

  RecvOperation(
      CudaBuffer buffer,
      size_t length,
      TSendCallback callback,
      size_t deviceIdx,
      size_t localNicIdx)
      : buffer(buffer),
        length(length),
        localNicIdx(localNicIdx),
        callback(std::move(callback)),
        event(deviceIdx) {}
};

// First "round" of handshake.
struct HandshakeNumNics {
  size_t numNics;
  NOP_STRUCTURE(HandshakeNumNics, numNics);
};

// Second "round" of handshake.
// NOTE(review): the nested vector's element type (NopIbvSetupInformation)
// was stripped in extraction.
struct HandshakeSetupInfo {
  std::vector> setupInfo;
  NOP_STRUCTURE(HandshakeSetupInfo, setupInfo);
};

// From sender to receiver (through pipe).
struct Descriptor {
  size_t originNicIdx;
  NOP_STRUCTURE(Descriptor, originNicIdx);
};

// From receiver to sender (through channel's connection).
struct ReadyToReceive {
  size_t destinationNicIdx;
  NOP_STRUCTURE(ReadyToReceive, destinationNicIdx);
};

// NOTE(review): the base class's template arguments were stripped in
// extraction (ChannelImplBoilerplate<...>).
class ChannelImpl final : public ChannelImplBoilerplate {
 public:
  ChannelImpl(
      ConstructorToken token,
      std::shared_ptr context,
      std::string id,
      std::shared_ptr descriptorConnection,
      std::shared_ptr readyToReceiveConnection);

 protected:
  // Implement the entry points called by ChannelImplBoilerplate.
// (Interior of class ChannelImpl, continued.)
  void initImplFromLoop() override;
  void sendImplFromLoop(
      uint64_t sequenceNumber,
      Buffer buffer,
      size_t length,
      TSendCallback callback) override;
  void recvImplFromLoop(
      uint64_t sequenceNumber,
      Buffer buffer,
      size_t length,
      TRecvCallback callback) override;
  void handleErrorImpl() override;

 private:
  // NOTE(review): shared_ptr element types were stripped in extraction
  // (presumably transport::Connection — confirm against the .cc file).
  const std::shared_ptr descriptorConnection_;
  const std::shared_ptr readyToReceiveConnection_;

  // Lifecycle of the two-round handshake that precedes any data transfer.
  enum State {
    INITIALIZING = 1,
    WAITING_FOR_HANDSHAKE_NUM_NICS,
    WAITING_FOR_HANDSHAKE_SETUP_INFO,
    ESTABLISHED,
  };
  State state_{INITIALIZING};

  std::vector localGpuToNic_;
  size_t numLocalNics_{0};
  size_t numRemoteNics_{0};

  // This struct is used to bundle the queue pair with some additional metadata.
  struct QueuePair {
    IbvQueuePair queuePair;
    // The CUDA GDR channel could be asked to transmit arbitrarily large tensors
    // and in principle it could directly forward them to the NIC as they are.
    // However IB NICs have limits on the size of each message. Hence we
    // determine these sizes, one per queue pair (as the minimum of the local
    // and remote sizes) and then split our tensors in chunks of that size.
    uint32_t maximumMessageSize;
  };
  // Indexed by [localNicIdx][remoteNicIdx].
  std::vector> queuePairs_;

  OpsStateMachine sendOps_{
      *this, &ChannelImpl::advanceSendOperation};
  using SendOpIter = decltype(sendOps_)::Iter;
  OpsStateMachine recvOps_{
      *this, &ChannelImpl::advanceRecvOperation};
  using RecvOpIter = decltype(recvOps_)::Iter;

  // Cleanup is deferred until both of these counters drop to zero.
  uint32_t numSendsInFlight_{0};
  uint32_t numRecvsInFlight_{0};

  // Callbacks for the initial handshake phase.
  void onReadHandshakeNumNics(const HandshakeNumNics& nopHandshakeNumNics);
  void onReadHandshakeSetupInfo(
      const HandshakeSetupInfo& nopHandshakeSetupInfo);

  // Cleanup methods for teardown.
  void tryCleanup();
  void cleanup();

  // State machines for send and recv ops.
  void advanceSendOperation(
      SendOpIter opIter,
      SendOperation::State prevOpState);
  void advanceRecvOperation(
      RecvOpIter opIter,
      RecvOperation::State prevOpState);

  // Actions (i.e., methods that begin a state transition).
  // For send operations:
  void writeDescriptor(SendOpIter opIter);
  void readReadyToReceive(SendOpIter opIter);
  void waitForSendCudaEvent(SendOpIter opIter);
  void sendOverIb(SendOpIter opIter);
  void callSendCallback(SendOpIter opIter);
  // For recv operations:
  void readDescriptor(RecvOpIter opIter);
  void waitForRecvCudaEvent(RecvOpIter opIter);
  void recvOverIbAndWriteReadyToRecive(RecvOpIter opIter);
  void callRecvCallback(RecvOpIter opIter);
};

} // namespace cuda_gdr
} // namespace channel
} // namespace tensorpipe

================================================
FILE: tensorpipe/channel/cuda_gdr/constants.h
================================================
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

// NOTE(review): the header name of this include was lost in extraction.
#include

namespace tensorpipe {
namespace channel {
namespace cuda_gdr {
namespace {

// We should probably allow these to be user-configured. But, for now, we'll set
// them to the lowest value they can have, the rationale being that this way
// they will always be valid.
constexpr uint8_t kPortNum = 1;
constexpr uint8_t kGlobalIdentifierIndex = 0;

// FIXME Instead of hardcoding the next three values, we could use
// ibv_query_device to obtain max_cqe, max_qp_wr and max_srq_wr and deduce from
// them the maximum allowed values for these parameters.

constexpr uint32_t kNumRecvs = 1024;
constexpr uint32_t kNumSends = 1024;

// How many elements the completion queue should be able to hold. These elements
// will be either the completed receive requests of the SRQ, or the completed
// send requests from a connection's queue pair. We can bound the former value
// but not the latter, so we try to add some margin.
constexpr int kCompletionQueueSize = kNumRecvs + kNumSends;

// How many work completions to poll from the completion queue at each reactor
// iteration.
constexpr int kNumPolledWorkCompletions = 32;

} // namespace
} // namespace cuda_gdr
} // namespace channel
} // namespace tensorpipe

================================================
FILE: tensorpipe/channel/cuda_gdr/context_impl.cc
================================================
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

// NOTE(review): the header names of these includes were lost in extraction.
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

namespace tensorpipe {
namespace channel {
namespace cuda_gdr {

namespace {

// NOTE: This is an incomplete implementation of C++17's `std::apply`.
// It's intended to only work for methods of IbvNic.
// NOTE(review): the template parameter lists below (method type, args tuple,
// index pack) were stripped in extraction and must be restored.
template
auto applyFuncImpl(
    IbvNic& subject,
    TMethod&& method,
    TArgsTuple&& args,
    std::index_sequence /* unused */) {
  return ((subject).*(method))(std::get(std::forward(args))...);
}

template
auto applyFunc(IbvNic& subject, TMethod&& method, TArgsTuple&& args) {
  return applyFuncImpl(
      subject,
      std::forward(method),
      std::forward(args),
      std::make_index_sequence<
          std::tuple_size>::value>{});
}

// We can only pass CUDA pointers to InfiniBand (for example when registering
// some memory) if InfiniBand "knows about" CUDA. Those pointers refer to the
// section of the process's virtual address space that is being used by CUDA to
// represent device memory (as part of CUDA's unified memory approach). Thus
// InfiniBand needs to talk to CUDA to translate those pointers to physical PCIe
// hardware addresses.
// This is achieved by CUDA providing a so-called "peer memory client" and
// registering it with the InfiniBand kernel module. The peer memory client is
// itself a kernel module, see https://github.com/Mellanox/nv_peer_memory.
// The "catch" is that the whole "peer memory client" system is not part of the
// official Linux InfiniBand. It's provided by a Mellanox extension, and it's
// part of their "OpenFabrics Enterprise Distribution" (MLNX_OFED), see
// https://www.mellanox.com/products/infiniband-drivers/linux/mlnx_ofed. (In
// particular, on Ubuntu, this seems to be provided by the mlnx-ofed-kernel-dkms
// package). Note that this difference between "vanilla" InfiniBand and OFED is
// only in kernel space; from our perspective the two have the same API. Also
// note that Mellanox has tried at least a couple of time to upstream this, but
// apparently without success:
// https://lore.kernel.org/linux-rdma/1412602019-30659-1-git-send-email-yishaih@mellanox.com/
// https://lore.kernel.org/linux-rdma/1455207177-11949-1-git-send-email-artemyko@mellanox.com/
// The check we use to verify if the peer memory client is active is the same as
// NCCL's one, see
// https://github.com/NVIDIA/nccl/blob/ca8485b0d01ca6dfa02f4454932011e68b461175/src/transport/net_ib.cc#L216-L230
// Whereas TensorFlow does it slightly differently, see
// https://github.com/tensorflow/networking/blob/671e2548b602f93a6c6502432b8bc131b5cc4914/tensorflow_networking/gdr/gdr_memory_manager.cc#L43-L60
static std::string kNvMemModulePath =
    "/sys/kernel/mm/memory_peers/nv_mem/version";
static std::string kNvidiaPeermemModulePath =
    "/sys/kernel/mm/memory_peers/nvidia-peermem/version";

// True if either of the two known peer-memory kernel modules exposes its
// version file in sysfs, i.e., is loaded.
bool isNvidiaPeerMemoryClientActive() {
  int rv1 = ::access(kNvMemModulePath.c_str(), F_OK);
  int rv2 = ::access(kNvidiaPeermemModulePath.c_str(), F_OK);
  return rv1 >= 0 || rv2 >= 0;
}

// The PCI topology is a tree, with the root being the host bridge, the leaves
// being the devices, and the other nodes being switches. We want to match each
// GPU to the InfiniBand NIC with which it shares the longest "prefix" in this
// tree, as that will route the data transfer away from the most "central"
// switches and from the host bridge. We extract the "path" of a device in the
// PCI tree by obtaining its "canonical" path in Linux's sysfs, which contains
// one component for each other device that is traversed. The format of such a
// path is /sys/devices/pci0123:45(/0123:45:67.8)+");
// See https://www.kernel.org/doc/ols/2005/ols2005v1-pages-321-334.pdf for more
// info on sysfs.
const std::string kPciPathPrefix = "/sys/devices/pci";

// Resolve an IB NIC's sysfs device symlink to its canonical PCI path.
// NOTE(review): std::array's template arguments (char + buffer size) were
// stripped in extraction here and below.
std::string getPciPathForIbvNic(const std::string& nicName) {
  std::array pciPath;
  char* rv = ::realpath(
      ("/sys/class/infiniband/" + nicName + "/device").c_str(),
      pciPath.data());
  TP_THROW_SYSTEM_IF(rv == nullptr, errno);
  TP_DCHECK(rv == pciPath.data());
  std::string res(pciPath.data());
  TP_DCHECK(res.substr(0, kPciPathPrefix.size()) == kPciPathPrefix)
      << "Bad PCI path for InfiniBand NIC " << nicName << ": " << res;
  return res;
}

// Resolve a GPU's PCI bus ID (obtained from CUDA) to its canonical sysfs PCI
// path.
std::string getPciPathForGpu(int gpuIdx) {
  // The CUDA documentation says the ID will consist of a domain (16 bits), a
  // bus (8 bits), a device (5 bits) and a function (3 bits). When represented
  // as hex, including the separators and the null terminator, this takes up 13
  // bytes. However NCCL seems to suggests that sometimes the domain takes twice
  // that size, and hence 17 bytes are necessary.
  // https://github.com/NVIDIA/nccl/blob/c6dbdb00849027b4e2c277653cbef53729f7213d/src/misc/utils.cc#L49-L53
  std::array pciDeviceId;
  TP_CUDA_CHECK(
      cudaDeviceGetPCIBusId(pciDeviceId.data(), pciDeviceId.size(), gpuIdx));
  // Fun fact: CUDA seems to format hex letters as uppercase, but Linux's sysfs
  // expects them as lowercase.
// (Interior of getPciPathForGpu: lowercase the hex digits, then resolve the
// bus ID through sysfs to the canonical PCI path.)
  for (char& c : pciDeviceId) {
    if ('A' <= c && c <= 'F') {
      c = c - 'A' + 'a';
    }
  }
  std::array pciPath;
  char* rv = ::realpath(
      ("/sys/bus/pci/devices/" + std::string(pciDeviceId.data())).c_str(),
      pciPath.data());
  TP_THROW_SYSTEM_IF(rv == nullptr, errno);
  TP_DCHECK(rv == pciPath.data());
  std::string res(pciPath.data());
  TP_DCHECK(res.substr(0, kPciPathPrefix.size()) == kPciPathPrefix)
      << "Bad PCI path for GPU #" << gpuIdx << ": " << res;
  return res;
}

// Length of the longest common prefix of two strings.
size_t commonPrefixLength(const std::string& a, const std::string& b) {
  // The length of the longest common prefix is the index of the first char on
  // which the two strings differ.
  size_t maxLength = std::min(a.size(), b.size());
  for (size_t idx = 0; idx < maxLength; idx++) {
    if (a[idx] != b[idx]) {
      return idx;
    }
  }
  return maxLength;
}

// For each GPU pick the IB NIC whose sysfs PCI path shares the longest common
// prefix with the GPU's, i.e., the topologically closest NIC.
// NOTE(review): the std::vector element types (std::string) were stripped in
// extraction.
std::vector matchGpusToIbvNics(
    IbvLib& ibvLib,
    IbvDeviceList& deviceList) {
  struct NicInfo {
    std::string name;
    std::string pciPath;
  };
  std::vector nicInfos;
  for (size_t deviceIdx = 0; deviceIdx < deviceList.size(); deviceIdx++) {
    IbvLib::device& device = deviceList[deviceIdx];
    std::string deviceName(TP_CHECK_IBV_PTR(ibvLib.get_device_name(&device)));
    std::string pciPath = getPciPathForIbvNic(deviceName);
    TP_VLOG(5) << "Resolved InfiniBand NIC " << deviceName << " to PCI path "
               << pciPath;
    nicInfos.push_back(NicInfo{std::move(deviceName), std::move(pciPath)});
  }

  int numGpus;
  TP_CUDA_CHECK(cudaGetDeviceCount(&numGpus));
  std::vector gpuIdxToIbvNicName;
  for (int gpuIdx = 0; gpuIdx < numGpus; gpuIdx++) {
    std::string gpuPciPath = getPciPathForGpu(gpuIdx);
    TP_VLOG(5) << "Resolved GPU #" << gpuIdx << " to PCI path " << gpuPciPath;
    // Linear scan over the NICs keeping the longest-prefix match.
    ssize_t bestMatchLength = -1;
    const std::string* bestMatchName = nullptr;
    for (const auto& nicInfo : nicInfos) {
      ssize_t matchLength = commonPrefixLength(gpuPciPath, nicInfo.pciPath);
      if (matchLength > bestMatchLength) {
        bestMatchLength = matchLength;
        bestMatchName = &nicInfo.name;
      }
    }
    TP_DCHECK_GE(bestMatchLength, 0);
    TP_DCHECK(bestMatchName != nullptr);
    gpuIdxToIbvNicName.push_back(*bestMatchName);
  }

  return gpuIdxToIbvNicName;
}

// In GpuDirect, the way an InfiniBand NIC accesses the GPU's memory is by
// issuing a PCIe read to some address within the GPU's "base address register"
// (BAR), i.e., a slice of the "physical" PCIe address space that belongs to the
// GPU. BARs in principle provide only "windows" into a device's memory, and
// could be re-mapped over time. When a CUDA allocation is registered on
// InfiniBand, its backing memory is mapped into the BAR and its address is
// given to the InfiniBand driver. That mapping must remain in place until the
// registration is destroyed. See
// https://docs.nvidia.com/cuda/gpudirect-rdma/index.html#how-gpudirect-rdma-works.
// CUDA GDR doesn't work well with that, because:
// - It attempts to register the entire user allocation with InfiniBand, hence
//   allocations that exceed the BAR's size can never be transferred.
// - It "caches" (or "leaks") the InfiniBand registration, because creating it
//   is expensive, so that this can be done once and then reused. This means
//   that even if each tensor that is sent is smaller than the BAR, we'd start
//   seeing failures if their cumulative size exceeded the one of the BAR.
// On some GPUs though the BAR size spans the entire GPU memory. In such cases
// what CUDA GDR is doing should be "safe". In all other cases, however, it
// isn't, and it's better to thus disable CUDA GDR entirely in these scenarios,
// so that users end up using a fully functioning (but slower) CUDA channel.
// There are multiple BARs for each GPU, but from an experimental investigation
// it seems the one that maps to the device's memory is BAR1. The programmatic
// way that the Linux kernel offers to access information about PCIe and its
// BARs is through sysfs. See
// https://www.kernel.org/doc/html/latest/PCI/sysfs-pci.html.
size_t getBar1SizeOfGpu(int gpuIdx) { std::string pciPath = getPciPathForGpu(gpuIdx); pciPath += "/resource1"; struct stat bar1Stats; int rv = ::stat(pciPath.c_str(), &bar1Stats); TP_THROW_SYSTEM_IF(rv < 0, errno); return bar1Stats.st_size; } bool allGpusHaveEnoughBar1Size() { int numGpus; TP_CUDA_CHECK(cudaGetDeviceCount(&numGpus)); for (int gpuIdx = 0; gpuIdx < numGpus; gpuIdx++) { cudaDeviceProp gpuProps; TP_CUDA_CHECK(cudaGetDeviceProperties(&gpuProps, gpuIdx)); size_t memorySize = gpuProps.totalGlobalMem; size_t bar1Size = getBar1SizeOfGpu(gpuIdx); TP_VLOG(5) << "GPU #" << gpuIdx << " has " << memorySize << " bytes of memory and the size of its PCIe BAR1 is " << bar1Size << " bytes"; if (bar1Size < memorySize) { return false; } } return true; } } // namespace IbvNic::IbvNic( std::string name, IbvLib::device& device, const IbvLib& ibvLib, const CudaLib& cudaLib) : name_(std::move(name)), cudaLib_(cudaLib), ibvLib_(ibvLib) { ctx_ = createIbvContext(ibvLib_, device); pd_ = createIbvProtectionDomain(ibvLib_, ctx_); cq_ = createIbvCompletionQueue( ibvLib_, ctx_, kCompletionQueueSize, /*cq_context=*/nullptr, /*channel=*/nullptr, /*comp_vector=*/0); addr_ = makeIbvAddress(ibvLib_, ctx_, kPortNum, kGlobalIdentifierIndex); } bool IbvNic::pollOnce() { std::array wcs; auto rv = ibvLib_.poll_cq(cq_.get(), wcs.size(), wcs.data()); if (rv == 0) { return false; } TP_THROW_SYSTEM_IF(rv < 0, errno); int numSends = 0; int numRecvs = 0; for (int wcIdx = 0; wcIdx < rv; wcIdx++) { IbvLib::wc& wc = wcs[wcIdx]; TP_VLOG(6) << "Channel context " << id_ << " got work completion on device " << name_ << " for request " << wc.wr_id << " for QP " << wc.qp_num << " with status " << ibvLib_.wc_status_str(wc.status) << " and opcode " << ibvWorkCompletionOpcodeToStr(wc.opcode) << " (byte length: " << wc.byte_len << ")"; auto iter = requestsInFlight_.find(wc.wr_id); TP_THROW_ASSERT_IF(iter == requestsInFlight_.end()) << "Got work completion with unknown ID " << wc.wr_id; IbvLib::wc_opcode 
opcode = std::move(std::get<0>(iter->second)); std::function cb = std::move(std::get<1>(iter->second)); requestsInFlight_.erase(iter); if (wc.status != IbvLib::WC_SUCCESS) { cb(TP_CREATE_ERROR(IbvError, ibvLib_.wc_status_str(wc.status))); } else { cb(Error::kSuccess); } switch (opcode) { case IbvLib::WC_RECV: numRecvs++; break; case IbvLib::WC_SEND: numSends++; break; default: TP_THROW_ASSERT() << "Unknown opcode: " << opcode; } } numAvailableSendSlots_ += numSends; while (!sendsWaitingForSlots_.empty() && numAvailableSendSlots_ > 0) { applyFunc( *this, &IbvNic::postSend, std::move(sendsWaitingForSlots_.front())); sendsWaitingForSlots_.pop_front(); } numAvailableRecvSlots_ += numRecvs; while (!recvsWaitingForSlots_.empty() && numAvailableRecvSlots_ > 0) { applyFunc( *this, &IbvNic::postRecv, std::move(recvsWaitingForSlots_.front())); recvsWaitingForSlots_.pop_front(); } return true; } void IbvNic::postSend( IbvQueuePair& qp, SendInfo info, std::function cb) { if (numAvailableSendSlots_ > 0) { IbvLib::sge list; list.addr = reinterpret_cast(info.addr); list.length = info.length; list.lkey = info.lkey; IbvLib::send_wr wr; std::memset(&wr, 0, sizeof(wr)); wr.wr_id = nextRequestId_++; wr.sg_list = &list; wr.num_sge = 1; wr.opcode = IbvLib::WR_SEND; IbvLib::send_wr* badWr = nullptr; TP_VLOG(6) << "Channel context " << id_ << " posting send on device " << name_ << " for QP " << qp->qp_num; TP_CHECK_IBV_INT(ibvLib_.post_send(qp.get(), &wr, &badWr)); TP_THROW_ASSERT_IF(badWr != nullptr); numAvailableSendSlots_--; requestsInFlight_.emplace( wr.wr_id, std::make_tuple(IbvLib::WC_SEND, std::move(cb))); } else { TP_VLOG(6) << "Channel context " << id_ << " queueing up send on device " << name_ << " for QP " << qp->qp_num; sendsWaitingForSlots_.emplace_back(qp, info, std::move(cb)); } } void IbvNic::postRecv( IbvQueuePair& qp, RecvInfo info, std::function cb) { if (numAvailableRecvSlots_ > 0) { IbvLib::sge list; list.addr = reinterpret_cast(info.addr); list.length = info.length; 
list.lkey = info.lkey; IbvLib::recv_wr wr; std::memset(&wr, 0, sizeof(wr)); wr.wr_id = nextRequestId_++; wr.sg_list = &list; wr.num_sge = 1; IbvLib::recv_wr* badWr = nullptr; TP_VLOG(6) << "Channel context " << id_ << " posting recv on device " << name_ << " for QP " << qp->qp_num; TP_CHECK_IBV_INT(ibvLib_.post_recv(qp.get(), &wr, &badWr)); TP_THROW_ASSERT_IF(badWr != nullptr); numAvailableRecvSlots_--; requestsInFlight_.emplace( wr.wr_id, std::make_tuple(IbvLib::WC_RECV, std::move(cb))); } else { TP_VLOG(6) << "Channel context " << id_ << " queueing up recv on device " << name_ << " for QP " << qp->qp_num; recvsWaitingForSlots_.emplace_back(qp, info, std::move(cb)); } } IbvMemoryRegion& IbvNic::registerMemory(CudaBuffer buffer) { // FIXME Instead of re-querying the device, have the caller provide it. CudaDeviceGuard guard(cudaDeviceForPointer(cudaLib_, buffer.ptr)); CUdeviceptr basePtr; size_t allocSize; TP_CUDA_DRIVER_CHECK( cudaLib_, cudaLib_.memGetAddressRange( &basePtr, &allocSize, reinterpret_cast(buffer.ptr))); unsigned long long bufferId; TP_CUDA_DRIVER_CHECK( cudaLib_, cudaLib_.pointerGetAttribute( &bufferId, CU_POINTER_ATTRIBUTE_BUFFER_ID, basePtr)); auto iter = memoryRegions_.find(bufferId); if (iter != memoryRegions_.end()) { return iter->second; } std::tie(iter, std::ignore) = memoryRegions_.emplace( bufferId, createIbvMemoryRegion( ibvLib_, pd_, reinterpret_cast(basePtr), allocSize, IbvLib::ACCESS_LOCAL_WRITE)); return iter->second; } bool IbvNic::readyToClose() const { return requestsInFlight_.empty(); } void IbvNic::setId(std::string id) { id_ = std::move(id); } std::shared_ptr ContextImpl::create( optional> gpuIdxToNicName) { Error error; CudaLib cudaLib; std::tie(error, cudaLib) = CudaLib::create(); // FIXME Instead of throwing away the error and setting a bool, we should have // a way to set the context in an error state, and use that for viability. 
if (error) { TP_VLOG(5) << "CUDA GDR channel is not viable because libcuda could not be loaded: " << error.what(); return nullptr; } IbvLib ibvLib; std::tie(error, ibvLib) = IbvLib::create(); // FIXME Instead of throwing away the error and setting a bool, we should have // a way to set the context in an error state, and use that for viability. if (error) { TP_VLOG(5) << "CUDA GDR channel is not viable because libibverbs could not be loaded: " << error.what(); return nullptr; } if (!isNvidiaPeerMemoryClientActive()) { TP_VLOG(5) << "CUDA GDR channel is not viable because the nv_peer_mem kernel module isn't active"; return nullptr; } IbvDeviceList deviceList; std::tie(error, deviceList) = IbvDeviceList::create(ibvLib); if (error && error.isOfType() && error.castToType()->errorCode() == ENOSYS) { TP_VLOG(5) << "CUDA GDR channel couldn't get list of InfiniBand devices because the kernel module isn't " << "loaded"; return nullptr; } TP_THROW_ASSERT_IF(error) << "Couldn't get list of InfiniBand devices: " << error.what(); if (deviceList.size() == 0) { TP_VLOG(5) << "CUDA GDR channel is not viable because it couldn't find any InfiniBand NICs"; return nullptr; } // FIXME In principle we could just exclude the GPUs that violate this check // but keep working with the other ones (if any). 
if (!allGpusHaveEnoughBar1Size()) { TP_VLOG(5) << "CUDA GDR channel is not viable because some GPUs don't have a large enough PCIe BAR1 size"; return nullptr; } std::unordered_map deviceDescriptors; for (const auto& device : getCudaDevices(cudaLib)) { deviceDescriptors[device] = "*"; } return std::make_shared( std::move(deviceDescriptors), std::move(cudaLib), std::move(ibvLib), std::move(deviceList), std::move(gpuIdxToNicName)); } ContextImpl::ContextImpl( std::unordered_map deviceDescriptors, CudaLib cudaLib, IbvLib ibvLib, IbvDeviceList deviceList, optional> gpuIdxToNicName) : ContextImplBoilerplate( std::move(deviceDescriptors)), cudaLib_(std::move(cudaLib)), ibvLib_(std::move(ibvLib)) { std::vector actualGpuIdxToNicName; if (gpuIdxToNicName.has_value()) { int numGpus; TP_CUDA_CHECK(cudaGetDeviceCount(&numGpus)); TP_THROW_ASSERT_IF(numGpus != gpuIdxToNicName->size()) << "The mapping from GPUs to InfiniBand NICs contains an unexpected " << "number of items: found " << gpuIdxToNicName->size() << ", expected " << numGpus; actualGpuIdxToNicName = std::move(gpuIdxToNicName.value()); } else { actualGpuIdxToNicName = matchGpusToIbvNics(ibvLib, deviceList); } for (int gpuIdx = 0; gpuIdx < actualGpuIdxToNicName.size(); gpuIdx++) { TP_VLOG(5) << "CUDA GDR channel mapped GPU #" << gpuIdx << " to InfiniBand NIC " << actualGpuIdxToNicName[gpuIdx]; } std::unordered_set nicNames; for (const auto& nicName : actualGpuIdxToNicName) { nicNames.insert(nicName); } std::unordered_map nicNameToNicIdx; // The device index is among all available devices, the NIC index is among the // ones we will use. 
size_t nicIdx = 0; for (size_t deviceIdx = 0; deviceIdx < deviceList.size(); deviceIdx++) { IbvLib::device& device = deviceList[deviceIdx]; std::string deviceName(TP_CHECK_IBV_PTR(ibvLib.get_device_name(&device))); auto iter = nicNames.find(deviceName); if (iter != nicNames.end()) { TP_VLOG(5) << "CUDA GDR channel is using InfiniBand NIC " << deviceName << " as device #" << nicIdx; ibvNics_.emplace_back(*iter, device, ibvLib_, cudaLib_); nicNameToNicIdx[*iter] = nicIdx; nicIdx++; nicNames.erase(iter); } } TP_THROW_ASSERT_IF(!nicNames.empty()) << "Couldn't find all the devices I was supposed to use"; for (size_t gpuIdx = 0; gpuIdx < actualGpuIdxToNicName.size(); gpuIdx++) { gpuToNic_.push_back(nicNameToNicIdx[actualGpuIdxToNicName[gpuIdx]]); } startThread("TP_CUDA_GDR_loop"); } const CudaLib& ContextImpl::getCudaLib() { return cudaLib_; } const std::vector& ContextImpl::getGpuToNicMapping() { return gpuToNic_; } const IbvLib& ContextImpl::getIbvLib() { return ibvLib_; } IbvNic& ContextImpl::getIbvNic(size_t nicIdx) { TP_DCHECK_LT(nicIdx, ibvNics_.size()); return ibvNics_[nicIdx]; } bool ContextImpl::pollOnce() { for (IbvNic& ibvNic : ibvNics_) { if (ibvNic.pollOnce()) { return true; } } return pollCudaOnce(); } bool ContextImpl::pollCudaOnce() { bool any = false; for (auto iter = pendingCudaEvents_.begin(); iter != pendingCudaEvents_.end(); iter++) { const CudaEvent& event = std::get<0>(*iter); if (event.query()) { std::function cb = std::move(std::get<1>(*iter)); cb(Error::kSuccess); iter = pendingCudaEvents_.erase(iter); any = true; } } return any; } void ContextImpl::waitForCudaEvent( const CudaEvent& event, std::function cb) { deferToLoop([this, &event, cb{std::move(cb)}]() mutable { waitForCudaEventFromLoop(event, std::move(cb)); }); } void ContextImpl::waitForCudaEventFromLoop( const CudaEvent& event, std::function cb) { TP_DCHECK(inLoop()); pendingCudaEvents_.emplace_back(event, std::move(cb)); } bool ContextImpl::readyToClose() { for (const IbvNic& ibvNic : 
ibvNics_) { if (!ibvNic.readyToClose()) { return false; } } return pendingCudaEvents_.empty(); } void ContextImpl::handleErrorImpl() { stopBusyPolling(); } void ContextImpl::joinImpl() { joinThread(); // FIXME It would be nice if this could be done by the thread itself just // before it returns, rather than by the user. ibvNics_.clear(); } void ContextImpl::setIdImpl() { for (IbvNic& ibvNic : ibvNics_) { ibvNic.setId(id_); } } std::shared_ptr ContextImpl::createChannel( std::vector> connections, Endpoint /* unused */) { TP_DCHECK_EQ(numConnectionsNeeded(), connections.size()); return createChannelInternal( std::move(connections[0]), std::move(connections[1])); } size_t ContextImpl::numConnectionsNeeded() const { return 2; } } // namespace cuda_gdr } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cuda_gdr/context_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { namespace cuda_gdr { class ChannelImpl; class IbvNic { public: IbvNic( std::string name, IbvLib::device& device, const IbvLib& ibvLib, const CudaLib& cudaLib); IbvProtectionDomain& getIbvPd() { return pd_; } IbvCompletionQueue& getIbvCq() { return cq_; } const IbvAddress& getIbvAddress() { return addr_; } struct SendInfo { void* addr; size_t length; uint32_t lkey; }; void postSend( IbvQueuePair& qp, SendInfo info, std::function cb); struct RecvInfo { void* addr; size_t length; uint32_t lkey; }; void postRecv( IbvQueuePair& qp, RecvInfo info, std::function cb); bool pollOnce(); IbvMemoryRegion& registerMemory(CudaBuffer buffer); bool readyToClose() const; void setId(std::string id); private: // The ID of the context, for use in verbose logging. std::string id_{"N/A"}; // The name of the InfiniBand device. const std::string name_; const CudaLib& cudaLib_; const IbvLib& ibvLib_; IbvContext ctx_; IbvProtectionDomain pd_; IbvCompletionQueue cq_; IbvAddress addr_; size_t numAvailableRecvSlots_ = kNumRecvs; std::deque< std::tuple>> recvsWaitingForSlots_; size_t numAvailableSendSlots_ = kNumSends; std::deque< std::tuple>> sendsWaitingForSlots_; // We need one common map for both send and recv requests because in principle // we cannot access the opcode of a failed operation, meaning we couldn't // match it to its callback. However, we could group them by QP number or, in // fact, we could have the QP store these requests and we just wake it up when // a completion occurs. std::unordered_map< uint64_t, std::tuple>> requestsInFlight_; uint64_t nextRequestId_ = 0; // The ibverbs memory regions are indexed by the CUDA driver's buffer ID for // the GPU allocation, which is unique (within the process) and never reused. 
// This will prevent us from re-using the memory region if a buffer gets // deallocated and reallocated (although we will not clean up the old memory // region until we close the context). std::map memoryRegions_; }; class ContextImpl final : public BusyPollingLoop, public ContextImplBoilerplate { public: static std::shared_ptr create( optional> gpuIdxToNicName = nullopt); ContextImpl( std::unordered_map deviceDescriptors, CudaLib cudaLib, IbvLib ibvLib, IbvDeviceList deviceList, optional> gpuIdxToNicName); std::shared_ptr createChannel( std::vector> connections, Endpoint endpoint); size_t numConnectionsNeeded() const override; const CudaLib& getCudaLib(); const std::vector& getGpuToNicMapping(); const IbvLib& getIbvLib(); IbvNic& getIbvNic(size_t nicIdx); void waitForCudaEvent( const CudaEvent& event, std::function cb); protected: // Implement BusyPollingLoop hooks. bool pollOnce() override; bool readyToClose() override; // Implement the entry points called by ContextImplBoilerplate. void handleErrorImpl() override; void joinImpl() override; void setIdImpl() override; private: const CudaLib cudaLib_; const IbvLib ibvLib_; std::vector ibvNics_; std::vector gpuToNic_; std::list>> pendingCudaEvents_; bool pollCudaOnce(); void waitForCudaEventFromLoop( const CudaEvent& event, std::function cb); }; } // namespace cuda_gdr } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cuda_gdr/error.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include namespace tensorpipe { namespace channel { namespace cuda_gdr { class IbvError final : public BaseError { public: explicit IbvError(std::string error) : error_(error) {} std::string what() const override { return error_; } private: std::string error_; }; } // namespace cuda_gdr } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cuda_gdr/factory.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include namespace tensorpipe { namespace channel { namespace cuda_gdr { std::shared_ptr create( optional> gpuIdxToNicName) { return std::make_shared>( std::move(gpuIdxToNicName)); } } // namespace cuda_gdr } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cuda_gdr/factory.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include namespace tensorpipe { namespace channel { namespace cuda_gdr { std::shared_ptr create( optional> gpuIdxToNicName = nullopt); } // namespace cuda_gdr } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cuda_ipc/channel_impl.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { namespace cuda_ipc { NOP_EXTERNAL_STRUCTURE( ContextImpl::OutboxInfo, processIdentifier, memHandle, eventHandles); namespace { size_t ceilOfRatio(size_t n, size_t d) { return (n + d - 1) / d; } struct Descriptor { int deviceIdx; size_t slotIdx; nop::Optional outboxInfo; NOP_STRUCTURE(Descriptor, deviceIdx, slotIdx, outboxInfo); }; } // namespace ChunkSendOperation::ChunkSendOperation( uint64_t bufferSequenceNumber, size_t chunkId, size_t numChunks, TSendCallback callback, int deviceIdx, const void* ptr, size_t length, cudaStream_t stream) : bufferSequenceNumber(bufferSequenceNumber), chunkId(chunkId), numChunks(numChunks), ptr(ptr), length(length), deviceIdx(deviceIdx), stream(stream), callback(std::move(callback)) {} ChunkRecvOperation::ChunkRecvOperation( uint64_t bufferSequenceNumber, size_t chunkId, size_t numChunks, TRecvCallback callback, int deviceIdx, void* ptr, size_t length, cudaStream_t stream) : bufferSequenceNumber(bufferSequenceNumber), chunkId(chunkId), numChunks(numChunks), ptr(ptr), length(length), deviceIdx(deviceIdx), stream(stream), callback(std::move(callback)) {} ChannelImpl::ChannelImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::shared_ptr descriptorConnection, std::shared_ptr replyConnection) : ChannelImplBoilerplate( token, std::move(context), std::move(id)), descriptorConnection_(std::move(descriptorConnection)), replyConnection_(std::move(replyConnection)) {} void ChannelImpl::initImplFromLoop() { context_->enroll(*this); } void ChannelImpl::sendImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TSendCallback callback) { if (length == 0) { callback(error_); return; } int deviceIdx = cudaDeviceForPointer( context_->getCudaLib(), buffer.unwrap().ptr); const size_t numChunks = ceilOfRatio(length, kSlotSize); 
for (size_t chunkIdx = 0; chunkIdx < numChunks; chunkIdx += 1) { size_t offset = chunkIdx * kSlotSize; ChunkSendOpIter opIter = chunkSendOps_.emplaceBack( nextChunkBeingSent_++, sequenceNumber, chunkIdx, numChunks, chunkIdx == numChunks - 1 ? std::move(callback) : nullptr, deviceIdx, reinterpret_cast(buffer.unwrap().ptr) + offset, std::min(length - offset, kSlotSize), buffer.unwrap().stream); chunkSendOps_.advanceOperation(opIter); } } void ChannelImpl::advanceChunkSendOperation( ChunkSendOpIter opIter, ChunkSendOperation::State prevOpState) { TP_DCHECK(context_->inLoop()); ChunkSendOperation& op = *opIter; // Needs to go after previous op invoked its callback because the last chunk // in a series (that corresponds to one operation) must invoke its callback // only when all chunks in the series are done. chunkSendOps_.attemptTransition( opIter, /*from=*/ChunkSendOperation::UNINITIALIZED, /*to=*/ChunkSendOperation::FINISHED, /*cond=*/error_ && prevOpState >= ChunkSendOperation::FINISHED, /*actions=*/ {&ChannelImpl::callSendCallback}); // Needs to go after previous op to ensure later operations are not holding // events while earlier ones are still blocked waiting for them, because the // events will only be returned after the control messages have been written // and sent, and this won't happen for later operations until earlier ones // have reached that stage too, and if those are blocked waiting for events // then we may deadlock. chunkSendOps_.attemptTransition( opIter, /*from=*/ChunkSendOperation::UNINITIALIZED, /*to=*/ChunkSendOperation::ALLOCATING_STAGING_BUFFER, /*cond=*/!error_ && prevOpState >= ChunkSendOperation::ALLOCATING_STAGING_BUFFER, /*actions=*/ {&ChannelImpl::allocateStagingBuffer}); // See above for why this needs to go after previous op. 
chunkSendOps_.attemptTransition( opIter, /*from=*/ ChunkSendOperation::ALLOCATING_STAGING_BUFFER, /*to=*/ChunkSendOperation::FINISHED, /*cond=*/error_ && op.doneAllocatingStagingBuffer && prevOpState >= ChunkSendOperation::FINISHED, /*actions=*/ {&ChannelImpl::callSendCallback, &ChannelImpl::releaseStagingBuffer}); // Needs to go after previous op to ensure predictable and consistent ordering // of write calls on the descriptor control connection and read calls on the // reply control connection. chunkSendOps_.attemptTransition( opIter, /*from=*/ ChunkSendOperation::ALLOCATING_STAGING_BUFFER, /*to=*/ChunkSendOperation::READING_REPLY, /*cond=*/!error_ && op.doneAllocatingStagingBuffer && prevOpState >= ChunkSendOperation::READING_REPLY, /*actions=*/ {&ChannelImpl::copyFromSourceToStaging, &ChannelImpl::writeDescriptor, &ChannelImpl::readReply, &ChannelImpl::callSendCallback}); // See above for why this needs to go after previous op. chunkSendOps_.attemptTransition( opIter, /*from=*/ChunkSendOperation::READING_REPLY, /*to=*/ChunkSendOperation::FINISHED, /*cond=*/op.doneReadingReply && prevOpState >= ChunkSendOperation::FINISHED, /*actions=*/ {&ChannelImpl::releaseStagingBuffer}); } void ChannelImpl::allocateStagingBuffer(ChunkSendOpIter opIter) { ChunkSendOperation& op = *opIter; TP_VLOG(5) << "Channel " << id_ << " is allocating temporary memory for chunk #" << op.chunkId << " of " << op.numChunks << " for buffer #" << op.bufferSequenceNumber; context_->allocateSlot( op.deviceIdx, op.length, callbackWrapper_([opIter]( ChannelImpl& impl, size_t slotIdx, Allocator::TChunk buffer, CudaEvent* event) { TP_VLOG(5) << "Channel " << impl.id_ << " is done allocating temporary memory for chunk #" << opIter->chunkId << " of " << opIter->numChunks << " for buffer #" << opIter->bufferSequenceNumber; opIter->doneAllocatingStagingBuffer = true; if (!impl.error_) { opIter->slotIdx = slotIdx; opIter->stagingBuffer = std::move(buffer); opIter->event = event; } 
impl.chunkSendOps_.advanceOperation(opIter); })); } void ChannelImpl::copyFromSourceToStaging(ChunkSendOpIter opIter) { ChunkSendOperation& op = *opIter; op.event->wait(op.stream, op.deviceIdx); { CudaDeviceGuard guard(op.deviceIdx); TP_CUDA_CHECK(cudaMemcpyAsync( op.stagingBuffer.get(), op.ptr, op.length, cudaMemcpyDeviceToDevice, op.stream)); } op.event->record(op.stream); } void ChannelImpl::writeDescriptor(ChunkSendOpIter opIter) { ChunkSendOperation& op = *opIter; const CudaLib& cudaLib = context_->getCudaLib(); auto nopDescriptorHolder = std::make_shared>(); Descriptor& nopDescriptor = nopDescriptorHolder->getObject(); nopDescriptor.deviceIdx = op.deviceIdx; nopDescriptor.slotIdx = op.slotIdx; if (localOutboxesSent_.size() <= op.deviceIdx) { localOutboxesSent_.resize(op.deviceIdx + 1, false); } if (!localOutboxesSent_[op.deviceIdx]) { localOutboxesSent_[op.deviceIdx] = true; nopDescriptor.outboxInfo = context_->getLocalOutboxInfo(op.deviceIdx); } TP_VLOG(6) << "Channel " << id_ << " is writing nop object (descriptor #" << op.sequenceNumber << ")"; descriptorConnection_->write( *nopDescriptorHolder, callbackWrapper_([nopDescriptorHolder, sequenceNumber{op.sequenceNumber}](ChannelImpl& impl) { TP_VLOG(6) << "Channel " << impl.id_ << " done writing nop object (descriptor #" << sequenceNumber << ")"; })); } void ChannelImpl::readReply(ChunkSendOpIter opIter) { ChunkSendOperation& op = *opIter; TP_VLOG(6) << "Channel " << id_ << " is reading nop object (reply #" << op.sequenceNumber << ")"; replyConnection_->read( nullptr, 0, callbackWrapper_([opIter]( ChannelImpl& impl, const void* /* unused */, size_t /* unused */) { TP_VLOG(6) << "Channel " << impl.id_ << " done reading nop object (reply #" << opIter->sequenceNumber << ")"; opIter->doneReadingReply = true; impl.chunkSendOps_.advanceOperation(opIter); })); } void ChannelImpl::releaseStagingBuffer(ChunkSendOpIter opIter) { ChunkSendOperation& op = *opIter; op.stagingBuffer = nullptr; } void 
ChannelImpl::callSendCallback(ChunkSendOpIter opIter) { ChunkSendOperation& op = *opIter; if (op.callback) { op.callback(error_); // Reset callback to release the resources it was holding. op.callback = nullptr; } } void ChannelImpl::recvImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TRecvCallback callback) { if (length == 0) { callback(error_); return; } int deviceIdx = cudaDeviceForPointer( context_->getCudaLib(), buffer.unwrap().ptr); const size_t numChunks = ceilOfRatio(length, kSlotSize); for (size_t chunkIdx = 0; chunkIdx < numChunks; chunkIdx += 1) { size_t offset = chunkIdx * kSlotSize; ChunkRecvOpIter opIter = chunkRecvOps_.emplaceBack( nextChunkBeingReceived_++, sequenceNumber, chunkIdx, numChunks, chunkIdx == numChunks - 1 ? std::move(callback) : nullptr, deviceIdx, reinterpret_cast(buffer.unwrap().ptr) + offset, std::min(length - offset, kSlotSize), buffer.unwrap().stream); chunkRecvOps_.advanceOperation(opIter); } } void ChannelImpl::advanceChunkRecvOperation( ChunkRecvOpIter opIter, ChunkRecvOperation::State prevOpState) { TP_DCHECK(context_->inLoop()); ChunkRecvOperation& op = *opIter; // Needs to go after previous op invoked its callback because the last chunk // in a series (that corresponds to one operation) must invoke its callback // only when all chunks in the series are done. chunkRecvOps_.attemptTransition( opIter, /*from=*/ChunkRecvOperation::UNINITIALIZED, /*to=*/ChunkRecvOperation::FINISHED, /*cond=*/error_ && prevOpState >= ChunkRecvOperation::FINISHED, /*actions=*/{&ChannelImpl::callRecvCallback}); // Needs to go after previous op to ensure predictable and consistent ordering // of read calls on descriptor control connection. 
chunkRecvOps_.attemptTransition( opIter, /*from=*/ChunkRecvOperation::UNINITIALIZED, /*to=*/ChunkRecvOperation::READING_DESCRIPTOR, /*cond=*/!error_ && prevOpState >= ChunkRecvOperation::READING_DESCRIPTOR, /*actions=*/{&ChannelImpl::readDescriptor}); // See above for why this needs to go after previous op. chunkRecvOps_.attemptTransition( opIter, /*from=*/ChunkRecvOperation::READING_DESCRIPTOR, /*to=*/ChunkRecvOperation::FINISHED, /*cond=*/error_ && op.doneReadingDescriptor && prevOpState >= ChunkRecvOperation::FINISHED, /*actions=*/{&ChannelImpl::callRecvCallback}); // Needs to go after previous op to ensure predictable and consistent ordering // of write calls on reply control connection. chunkRecvOps_.attemptTransition( opIter, /*from=*/ChunkRecvOperation::READING_DESCRIPTOR, /*to=*/ChunkRecvOperation::FINISHED, /*cond=*/!error_ && op.doneReadingDescriptor && prevOpState >= ChunkRecvOperation::FINISHED, /*actions=*/ {&ChannelImpl::copyFromStagingToTarget, &ChannelImpl::writeReply, &ChannelImpl::callRecvCallback}); } void ChannelImpl::readDescriptor(ChunkRecvOpIter opIter) { ChunkRecvOperation& op = *opIter; TP_VLOG(6) << "Channel " << id_ << " is reading nop object (descriptor #" << op.sequenceNumber << ")"; auto nopDescriptorHolder = std::make_shared>(); descriptorConnection_->read( *nopDescriptorHolder, callbackWrapper_([opIter, nopDescriptorHolder](ChannelImpl& impl) { TP_VLOG(6) << "Channel " << impl.id_ << " done reading nop object (descriptor #" << opIter->sequenceNumber << ")"; opIter->doneReadingDescriptor = true; if (!impl.error_) { Descriptor& nopDescriptor = nopDescriptorHolder->getObject(); opIter->remoteDeviceIdx = nopDescriptor.deviceIdx; opIter->remoteSlotIdx = nopDescriptor.slotIdx; if (!nopDescriptor.outboxInfo.empty()) { if (impl.remoteOutboxesReceived_.size() <= opIter->remoteDeviceIdx) { impl.remoteOutboxesReceived_.resize(opIter->remoteDeviceIdx + 1); } TP_DCHECK(!impl.remoteOutboxesReceived_[opIter->remoteDeviceIdx] .has_value()); 
impl.remoteOutboxesReceived_[opIter->remoteDeviceIdx] = std::move(nopDescriptor.outboxInfo.take()); } } impl.chunkRecvOps_.advanceOperation(opIter); })); } void ChannelImpl::copyFromStagingToTarget(ChunkRecvOpIter opIter) { ChunkRecvOperation& op = *opIter; if (remoteOutboxesOpened_.size() <= op.remoteDeviceIdx) { remoteOutboxesOpened_.resize(op.remoteDeviceIdx + 1); } if (remoteOutboxesOpened_[op.remoteDeviceIdx].size() <= op.deviceIdx) { remoteOutboxesOpened_[op.remoteDeviceIdx].resize(op.deviceIdx + 1, nullptr); } if (remoteOutboxesOpened_[op.remoteDeviceIdx][op.deviceIdx] == nullptr) { remoteOutboxesOpened_[op.remoteDeviceIdx][op.deviceIdx] = &context_->openRemoteOutbox( op.deviceIdx, op.remoteDeviceIdx, remoteOutboxesReceived_[op.remoteDeviceIdx].value()); } const ContextImpl::RemoteOutboxHandle& outbox = *remoteOutboxesOpened_[op.remoteDeviceIdx][op.deviceIdx]; TP_VLOG(6) << "Channel " << id_ << " is copying payload (#" << op.sequenceNumber << ")"; outbox.events[op.remoteSlotIdx]->wait(op.stream, op.deviceIdx); { CudaDeviceGuard guard(op.deviceIdx); TP_CUDA_CHECK(cudaMemcpyAsync( op.ptr, outbox.buffer.ptr() + kSlotSize * op.remoteSlotIdx, op.length, cudaMemcpyDeviceToDevice, op.stream)); } outbox.events[op.remoteSlotIdx]->record(op.stream); TP_VLOG(6) << "Channel " << id_ << " done copying payload (#" << op.sequenceNumber << ")"; } void ChannelImpl::callRecvCallback(ChunkRecvOpIter opIter) { ChunkRecvOperation& op = *opIter; if (op.callback) { op.callback(error_); // Reset callback to release the resources it was holding. 
op.callback = nullptr; } } void ChannelImpl::writeReply(ChunkRecvOpIter opIter) { ChunkRecvOperation& op = *opIter; TP_VLOG(6) << "Channel " << id_ << " is writing reply notification (#" << op.sequenceNumber << ")"; replyConnection_->write( nullptr, 0, callbackWrapper_([sequenceNumber{op.sequenceNumber}](ChannelImpl& impl) { TP_VLOG(6) << "Channel " << impl.id_ << " done writing reply notification (#" << sequenceNumber << ")"; })); } void ChannelImpl::handleErrorImpl() { chunkSendOps_.advanceAllOperations(); chunkRecvOps_.advanceAllOperations(); descriptorConnection_->close(); replyConnection_->close(); context_->unenroll(*this); } } // namespace cuda_ipc } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cuda_ipc/channel_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { namespace cuda_ipc { class ContextImpl; struct ChunkSendOperation { enum State { UNINITIALIZED, ALLOCATING_STAGING_BUFFER, READING_REPLY, FINISHED }; // Fields used by the state machine uint64_t sequenceNumber{0}; State state{UNINITIALIZED}; // Progress flags bool doneAllocatingStagingBuffer{false}; bool doneReadingReply{false}; // Arguments at creation const uint64_t bufferSequenceNumber; const size_t chunkId; const size_t numChunks; const void* const ptr; const size_t length; const int deviceIdx; const cudaStream_t stream; TSendCallback callback; // Other data size_t slotIdx{static_cast(-1)}; Allocator::TChunk stagingBuffer; CudaEvent* event{nullptr}; ChunkSendOperation( uint64_t bufferSequenceNumber, size_t chunkId, size_t numChunks, TSendCallback callback, int deviceIdx, const void* ptr, size_t length, cudaStream_t stream); }; struct ChunkRecvOperation { enum State { UNINITIALIZED, READING_DESCRIPTOR, FINISHED }; // Fields used by the state machine uint64_t sequenceNumber{0}; State state{UNINITIALIZED}; // Progress flags bool doneReadingDescriptor{false}; bool doneRequestingEvent{false}; bool doneReadingAck{false}; // Arguments at creation const uint64_t bufferSequenceNumber; const size_t chunkId; const size_t numChunks; void* const ptr; const size_t length; const int deviceIdx; const cudaStream_t stream; TRecvCallback callback; // Other data int remoteDeviceIdx; size_t remoteSlotIdx; ChunkRecvOperation( uint64_t bufferSequenceNumber, size_t chunkId, size_t numChunks, TRecvCallback callback, int deviceIdx, void* ptr, size_t length, cudaStream_t stream); }; class ChannelImpl final : public ChannelImplBoilerplate { public: ChannelImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::shared_ptr descriptorConnection, std::shared_ptr replyConnection); protected: // Implement the entry 
points called by ChannelImplBoilerplate. void initImplFromLoop() override; void sendImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TSendCallback callback) override; void recvImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TRecvCallback callback) override; void handleErrorImpl() override; private: const std::shared_ptr descriptorConnection_; const std::shared_ptr replyConnection_; // For each local device, whether we've already sent the information about the // device's outbox to the remote, who needs it to open a handle to the outbox. // Used during the send path. std::vector localOutboxesSent_; // For each remote device, the information about the remote's outbox for that // device (or nullopt, if we haven't received it yet). We store it because we // will only receive it once (for the first buffer coming from that device) // but we might need it multiple time, as we need to open it for every local // target device where it might be needed. Used during the receive path. std::vector> remoteOutboxesReceived_; // For each remote and local device, the handle to the opened remote outbox // for that device (or nullptr if we haven't opened it yet). Used during the // receive path. std::vector> remoteOutboxesOpened_; // A sequence number for the chunks. uint64_t nextChunkBeingSent_{0}; uint64_t nextChunkBeingReceived_{0}; OpsStateMachine chunkSendOps_{ *this, &ChannelImpl::advanceChunkSendOperation}; using ChunkSendOpIter = decltype(chunkSendOps_)::Iter; OpsStateMachine chunkRecvOps_{ *this, &ChannelImpl::advanceChunkRecvOperation}; using ChunkRecvOpIter = decltype(chunkRecvOps_)::Iter; // State machines for send and recv ops. void advanceChunkSendOperation( ChunkSendOpIter opIter, ChunkSendOperation::State prevOpState); void advanceChunkRecvOperation( ChunkRecvOpIter opIter, ChunkRecvOperation::State prevOpState); // Actions (i.e., methods that begin a state transition). 
// For send operations: void allocateStagingBuffer(ChunkSendOpIter opIter); void copyFromSourceToStaging(ChunkSendOpIter opIter); void writeDescriptor(ChunkSendOpIter opIter); void readReply(ChunkSendOpIter opIter); void releaseStagingBuffer(ChunkSendOpIter opIter); void callSendCallback(ChunkSendOpIter opIter); // For recv operations: void readDescriptor(ChunkRecvOpIter opIter); void copyFromStagingToTarget(ChunkRecvOpIter opIter); void callRecvCallback(ChunkRecvOpIter opIter); void writeReply(ChunkRecvOpIter opIter); }; } // namespace cuda_ipc } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cuda_ipc/constants.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include namespace tensorpipe { namespace channel { namespace cuda_ipc { // FIXME Avoid this anonymous namespace and use inline variables in C++-17. namespace { // Define all three (redundant) values to make them explicit and avoid // misunderstandings due to miscalculations. static constexpr size_t kStagingAreaSize = 32 * 1024 * 1024; static constexpr size_t kSlotSize = 8 * 1024 * 1024; static constexpr size_t kNumSlots = 4; static_assert(kStagingAreaSize == kSlotSize * kNumSlots, ""); } // namespace } // namespace cuda_ipc } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cuda_ipc/context_impl.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { namespace cuda_ipc { namespace { std::tuple, std::vector>> getGlobalUuidsAndP2pSupport(const NvmlLib& nvmlLib) { unsigned int numDevices; TP_NVML_CHECK(nvmlLib, nvmlLib.deviceGetCount_v2(&numDevices)); std::vector devices(numDevices); std::vector uuids(numDevices); for (unsigned int devIdx = 0; devIdx < numDevices; devIdx++) { TP_NVML_CHECK( nvmlLib, nvmlLib.deviceGetHandleByIndex_v2(devIdx, &devices[devIdx])); // NVML_DEVICE_UUID_V2_BUFFER_SIZE was introduced in CUDA 11.0. #ifdef NVML_DEVICE_UUID_V2_BUFFER_SIZE std::array uuid; #else std::array uuid; #endif TP_NVML_CHECK( nvmlLib, nvmlLib.deviceGetUUID(devices[devIdx], uuid.data(), uuid.size())); std::string uuidStr(uuid.data()); TP_THROW_ASSERT_IF(uuidStr.substr(0, 4) != "GPU-") << "Couldn't obtain valid UUID for GPU #" << devIdx << " from CUDA driver. Got: " << uuidStr; uuidStr = uuidStr.substr(4); TP_THROW_ASSERT_IF(!isValidUuid(uuidStr)) << "Couldn't obtain valid UUID for GPU #" << devIdx << " from NVML. 
Got: " << uuidStr; uuids[devIdx] = std::move(uuidStr); } std::vector> p2pSupport(numDevices); for (int devIdx = 0; devIdx < numDevices; devIdx++) { p2pSupport[devIdx].resize(numDevices); for (int otherDevIdx = 0; otherDevIdx < numDevices; otherDevIdx++) { if (devIdx == otherDevIdx) { p2pSupport[devIdx][otherDevIdx] = true; continue; } nvmlGpuP2PStatus_t p2pStatus; TP_NVML_CHECK( nvmlLib, nvmlLib.deviceGetP2PStatus( devices[devIdx], devices[otherDevIdx], NVML_P2P_CAPS_INDEX_READ, &p2pStatus)); p2pSupport[devIdx][otherDevIdx] = (p2pStatus == NVML_P2P_STATUS_OK); } } return std::make_tuple(std::move(uuids), std::move(p2pSupport)); } int globalIdxForDevice( const std::vector& globalUuids, const std::string& uuid) { auto iter = std::find(globalUuids.begin(), globalUuids.end(), uuid); TP_THROW_ASSERT_IF(iter == globalUuids.end()) << "Couldn't find GPU with UUID " << uuid; return iter - globalUuids.begin(); } struct DeviceDescriptor { std::string bootId; int64_t pid; std::string deviceUuid; NOP_STRUCTURE(DeviceDescriptor, bootId, pid, deviceUuid); }; DeviceDescriptor deserializeDeviceDescriptor( const std::string& deviceDescriptor) { NopHolder nopHolder; loadDescriptor(nopHolder, deviceDescriptor); return std::move(nopHolder.getObject()); } std::string generateBootId() { auto bootID = getBootID(); TP_THROW_ASSERT_IF(!bootID) << "Unable to read boot_id"; return bootID.value(); } // FIXME We'd want this to return a std::vector, but CudaEvents // aren't default-constructible nor movable. Hence either we make them such, // or we use some pointer magic (like placement new). For now, we work around // this by using a unique_ptr and wrapping them in optional<>, but it's silly. std::unique_ptr[]> createIpcEventArray( int deviceIdx, size_t numEvents) { auto events = std::make_unique[]>(numEvents); // The CUDA driver has a bug where creating and/or destroying IPC events // sometimes causes a deadlock (it's unclear which of the two steps is the // cause). 
The deadlock tends to manifest as a cudaStreamSynchronize call // never returning. Just to be safe, and to catch such a deadlock early and // clearly, let's add extra syncs here. (The bug is fixed in v460). { CudaDeviceGuard guard(deviceIdx); TP_CUDA_CHECK(cudaDeviceSynchronize()); } for (size_t idx = 0; idx < numEvents; idx++) { events[idx].emplace(deviceIdx, true); // One day we might get tempted to have CudaEvent lazily initialize its // cudaEvent_t, just like PyTorch does. However here we explicitly want to // eagerly initialize IPC events, as creating them late might deadlock with // old CUDA driver versions. This check should hopefully catch if the event // is lazy-initialized. TP_THROW_ASSERT_IF(events[idx]->raw() == nullptr); } { CudaDeviceGuard guard(deviceIdx); TP_CUDA_CHECK(cudaDeviceSynchronize()); } return events; } std::vector getIpcHandlesForEventArray( optional events[], size_t numEvents) { std::vector eventHandles(numEvents); for (size_t idx = 0; idx < numEvents; idx++) { eventHandles[idx] = events[idx]->getIpcHandle(); } return eventHandles; } } // namespace ContextImpl::Outbox::Outbox(int deviceIdx) : buffer(kStagingAreaSize, deviceIdx), events(createIpcEventArray(deviceIdx, kNumSlots)), handle(this->buffer.getIpcHandle()), eventHandles(getIpcHandlesForEventArray(this->events.get(), kNumSlots)), allocator(this->buffer.ptr(), kNumSlots, kSlotSize) {} ContextImpl::Outbox::~Outbox() { // The CUDA driver has a bug where creating and/or destroying IPC events // sometimes causes a deadlock (it's unclear which of the two steps is the // cause). The deadlock tends to manifest as a cudaStreamSynchronize call // never returning. Just to be safe, and to catch such a deadlock early and // clearly, let's add extra syncs here. (The bug is fixed in v460). 
{ CudaDeviceGuard guard(buffer.deviceIdx()); TP_CUDA_CHECK(cudaDeviceSynchronize()); } events.reset(); { CudaDeviceGuard guard(buffer.deviceIdx()); TP_CUDA_CHECK(cudaDeviceSynchronize()); } } std::shared_ptr ContextImpl::create() { Error error; CudaLib cudaLib; std::tie(error, cudaLib) = CudaLib::create(); if (error) { TP_VLOG(5) << "CUDA IPC channel is not viable because libcuda could not be loaded: " << error.what(); return nullptr; } NvmlLib nvmlLib; std::tie(error, nvmlLib) = NvmlLib::create(); if (error) { TP_VLOG(5) << "CUDA IPC channel is not viable because libnvidia-ml could not be loaded: " << error.what(); return nullptr; } const std::string bootId = generateBootId(); const pid_t pid = ::getpid(); std::unordered_map deviceDescriptors; for (const auto& device : getCudaDevices(cudaLib)) { // This part is largely inspired from // https://github.com/NVIDIA/cuda-samples/blob/master/Samples/simpleIPC/simpleIPC.cu. cudaDeviceProp props; TP_CUDA_CHECK(cudaGetDeviceProperties(&props, device.index)); // Unified addressing is required for IPC. if (!props.unifiedAddressing) { TP_VLOG(4) << "CUDA IPC channel is not viable because CUDA device " << device.index << " does not have unified addressing"; return nullptr; } // The other two compute modes are "exclusive" and "prohibited", both of // which prevent access from an other process. 
int computeMode = -1; TP_CUDA_CHECK(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, device.index)); if (computeMode != cudaComputeModeDefault) { TP_VLOG(4) << "CUDA IPC channel is not viable because CUDA device " << device.index << " is not in default compute mode"; return nullptr; } NopHolder nopHolder; DeviceDescriptor& deviceDescriptor = nopHolder.getObject(); deviceDescriptor.bootId = bootId; deviceDescriptor.pid = static_cast(pid); deviceDescriptor.deviceUuid = getUuidOfDevice(cudaLib, device.index); deviceDescriptors[device] = saveDescriptor(nopHolder); } std::vector globalUuids; std::vector> p2pSupport; std::tie(globalUuids, p2pSupport) = getGlobalUuidsAndP2pSupport(nvmlLib); TP_VLOG(4) << "The UUIDs of all the GPUs found by the CUDA IPC channel are " << joinStrs(globalUuids); TP_VLOG(4) << "The peer-to-peer support found by the CUDA IPC channel is " << formatMatrix(p2pSupport); std::ostringstream oss; optional nsId = getLinuxNamespaceId(LinuxNamespace::kPid); if (!nsId.has_value()) { TP_VLOG(4) << "CUDA IPC channel is not viable because it couldn't determine the PID namespace ID"; return nullptr; } oss << nsId.value() << "_" << pid; std::string processIdentifier = oss.str(); return std::make_shared( std::move(deviceDescriptors), std::move(cudaLib), std::move(nvmlLib), std::move(globalUuids), std::move(p2pSupport), std::move(processIdentifier)); } ContextImpl::ContextImpl( std::unordered_map deviceDescriptors, CudaLib cudaLib, NvmlLib nvmlLib, std::vector globalUuids, std::vector> p2pSupport, std::string processIdentifier) : ContextImplBoilerplate( std::move(deviceDescriptors)), cudaLib_(std::move(cudaLib)), nvmlLib_(std::move(nvmlLib)), globalUuids_(std::move(globalUuids)), p2pSupport_(std::move(p2pSupport)), processIdentifier_(processIdentifier) {} std::shared_ptr ContextImpl::createChannel( std::vector> connections, Endpoint /* unused */) { TP_DCHECK_EQ(numConnectionsNeeded(), connections.size()); return createChannelInternal( 
std::move(connections[0]), std::move(connections[1])); } size_t ContextImpl::numConnectionsNeeded() const { // The control connection needs to carry two unrelated streams in each // direction (the descriptors and the replies), and it's thus simpler to just // use two such connections. return 2; } bool ContextImpl::canCommunicateWithRemote( const std::string& localDeviceDescriptor, const std::string& remoteDeviceDescriptor) const { DeviceDescriptor nopLocalDeviceDescriptor = deserializeDeviceDescriptor(localDeviceDescriptor); DeviceDescriptor nopRemoteDeviceDescriptor = deserializeDeviceDescriptor(remoteDeviceDescriptor); if (nopLocalDeviceDescriptor.bootId != nopRemoteDeviceDescriptor.bootId) { return false; } // Disable CudaIpc when both endpoints are in the same process, as a CUDA IPC // handle cannot be opened in the same process in which it was created. if (nopLocalDeviceDescriptor.pid == nopRemoteDeviceDescriptor.pid) { return false; } int localGlobalIdx = globalIdxForDevice(globalUuids_, nopLocalDeviceDescriptor.deviceUuid); int remoteGlobalIdx = globalIdxForDevice(globalUuids_, nopRemoteDeviceDescriptor.deviceUuid); return p2pSupport_[localGlobalIdx][remoteGlobalIdx] && p2pSupport_[remoteGlobalIdx][localGlobalIdx]; } const CudaLib& ContextImpl::getCudaLib() { return cudaLib_; } void ContextImpl::allocateSlot( int deviceIdx, size_t length, SlotAllocCallback callback) { if (outboxes_.size() <= deviceIdx) { outboxes_.resize(deviceIdx + 1); } if (outboxes_[deviceIdx] == nullptr) { outboxes_[deviceIdx] = std::make_unique(deviceIdx); } // We don't need to wrap this callback with the callbackWrapper_ because the // callback that was passed to this method already is, and because all we're // doing here is wrap that callback and do read-only accesses to the outbox. 
Outbox& outbox = *outboxes_[deviceIdx]; outboxes_[deviceIdx]->allocator.alloc( length, [&outbox, callback{std::move(callback)}]( const Error& error, Allocator::TChunk chunk) { if (error) { callback(error, 0, std::move(chunk), nullptr); return; } size_t slotIdx = (chunk.get() - outbox.buffer.ptr()) / kSlotSize; callback( error, slotIdx, std::move(chunk), &outbox.events[slotIdx].value()); }); } ContextImpl::OutboxInfo ContextImpl::getLocalOutboxInfo(int deviceIdx) { TP_DCHECK(outboxes_.size() > deviceIdx); TP_DCHECK(outboxes_[deviceIdx] != nullptr); OutboxInfo info; info.processIdentifier = processIdentifier_; info.memHandle = std::string( reinterpret_cast(&outboxes_[deviceIdx]->handle), sizeof(cudaIpcMemHandle_t)); info.eventHandles.reserve(kNumSlots); for (size_t slotIdx = 0; slotIdx < kNumSlots; slotIdx++) { info.eventHandles.emplace_back( reinterpret_cast( &outboxes_[deviceIdx]->eventHandles[slotIdx]), sizeof(cudaIpcEventHandle_t)); } return info; } const ContextImpl::RemoteOutboxHandle& ContextImpl::openRemoteOutbox( int localDeviceIdx, int remoteDeviceIdx, OutboxInfo remoteOutboxInfo) { RemoteOutboxKey key{ std::move(remoteOutboxInfo.processIdentifier), remoteDeviceIdx, localDeviceIdx}; decltype(remoteOutboxes_)::iterator iter; bool didntExist; std::tie(iter, didntExist) = remoteOutboxes_.emplace(std::move(key), RemoteOutboxHandle{}); RemoteOutboxHandle& outbox = iter->second; if (didntExist) { CudaDeviceGuard guard(localDeviceIdx); outbox.buffer = CudaIpcBuffer( localDeviceIdx, *reinterpret_cast( remoteOutboxInfo.memHandle.data())); outbox.events = std::make_unique[]>(kNumSlots); for (size_t slotIdx = 0; slotIdx < kNumSlots; slotIdx++) { outbox.events[slotIdx].emplace( localDeviceIdx, *reinterpret_cast( remoteOutboxInfo.eventHandles[slotIdx].data())); } } return outbox; } void ContextImpl::handleErrorImpl() { for (std::unique_ptr& outbox : outboxes_) { if (outbox != nullptr) { outbox->allocator.close(); } } } void ContextImpl::joinImpl() {} bool 
ContextImpl::inLoop() const { return loop_.inLoop(); }; void ContextImpl::deferToLoop(std::function fn) { loop_.deferToLoop(std::move(fn)); }; } // namespace cuda_ipc } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cuda_ipc/context_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { namespace cuda_ipc { class ChannelImpl; class ContextImpl final : public ContextImplBoilerplate { public: static std::shared_ptr create(); ContextImpl( std::unordered_map deviceDescriptors, CudaLib cudaLib, NvmlLib nvmlLib, std::vector globalUuids, std::vector> p2pSupport, std::string processIdentifier); std::shared_ptr createChannel( std::vector> connections, Endpoint endpoint); size_t numConnectionsNeeded() const override; bool canCommunicateWithRemote( const std::string& localDeviceDescriptor, const std::string& remoteDeviceDescriptor) const override; const CudaLib& getCudaLib(); // Takes the index of the slot, the (smart) pointer to the slot, and the (raw) // pointer to the event for the slot. using SlotAllocCallback = std::function; void allocateSlot(int deviceIdx, size_t length, SlotAllocCallback callback); struct OutboxInfo { std::string processIdentifier; std::string memHandle; std::vector eventHandles; }; OutboxInfo getLocalOutboxInfo(int deviceIdx); struct RemoteOutboxHandle { CudaIpcBuffer buffer; std::unique_ptr[]> events; }; const RemoteOutboxHandle& openRemoteOutbox( int localDeviceIdx, int remoteDeviceIdx, OutboxInfo remoteOutboxInfo); // Implement the DeferredExecutor interface. 
bool inLoop() const override; void deferToLoop(std::function fn) override; protected: // Implement the entry points called by ContextImplBoilerplate. void handleErrorImpl() override; void joinImpl() override; private: OnDemandDeferredExecutor loop_; const CudaLib cudaLib_; const NvmlLib nvmlLib_; const std::vector globalUuids_; const std::vector> p2pSupport_; // A combination of the process's PID namespace and its PID, which combined // with the device index allows us to uniquely identify each staging buffer on // the current machine. const std::string processIdentifier_; // A CUDA on-device allocation that acts as the outbox for all the channels of // this context. We cannot directly get and open IPC handles of the user's // buffers, as this will fail if the user already opened such a handle (this // limitation was lifted in CUDA 11.1). Moreover, since we "leak" the opened // IPC handles (i.e., we leave them open, and close them all when the context // closes), if we opened an IPC handle to a user buffer and the user freed // that buffer we would prevent CUDA from really making that memory available // again (this is an undocumented behavior which was observed experimentally). // As a solution, we create our own allocation and get and open an IPC handle // to that, as we can guarantee its lifetime and that no other IPC handle // exists. We then use it as a staging ground for outgoing transfers, copying // chunks to it from source buffers, and having the remote copy them to the // target buffer. 
struct Outbox { const CudaDeviceBuffer buffer; std::unique_ptr[]> events; const cudaIpcMemHandle_t handle; const std::vector eventHandles; Allocator allocator; explicit Outbox(int deviceIdx); ~Outbox(); }; std::vector> outboxes_; struct RemoteOutboxKey { std::string processIdentifier; int remoteDeviceIdx; int localDeviceIdx; bool operator==(const RemoteOutboxKey& other) const noexcept { return processIdentifier == other.processIdentifier && remoteDeviceIdx == other.remoteDeviceIdx && localDeviceIdx == other.localDeviceIdx; } }; struct RemoteOutboxKeyHash { size_t operator()(const RemoteOutboxKey& key) const noexcept { size_t h1 = std::hash{}(key.processIdentifier); size_t h2 = std::hash{}(key.remoteDeviceIdx); size_t h3 = std::hash{}(key.localDeviceIdx); // Byte-shift hashes in order to "capture" the order of members. // FIXME Should we use a proper hash combiner? We can copy Boost's one. return h1 ^ (h2 << 1) ^ (h3 << 2); } }; std::unordered_map remoteOutboxes_; }; } // namespace cuda_ipc } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cuda_ipc/factory.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include namespace tensorpipe { namespace channel { namespace cuda_ipc { std::shared_ptr create() { return std::make_shared>(); } } // namespace cuda_ipc } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cuda_ipc/factory.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include namespace tensorpipe { namespace channel { namespace cuda_ipc { std::shared_ptr create(); } // namespace cuda_ipc } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cuda_xth/channel_impl.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { namespace cuda_xth { namespace { struct Descriptor { uintptr_t startEvent; uintptr_t srcPtr; int srcDeviceIdx; uintptr_t srcStream; NOP_STRUCTURE(Descriptor, startEvent, srcPtr, srcDeviceIdx, srcStream); }; } // namespace SendOperation::SendOperation( int deviceIdx, void* ptr, size_t length, cudaStream_t stream, TSendCallback callback) : deviceIdx(deviceIdx), ptr(ptr), length(length), stream(stream), callback(std::move(callback)), startEv(deviceIdx) { startEv.record(stream); } RecvOperation::RecvOperation( int deviceIdx, CudaBuffer buffer, size_t length, TRecvCallback callback) : ptr(buffer.ptr), length(length), deviceIdx(deviceIdx), stream(buffer.stream), callback(std::move(callback)) {} void RecvOperation::process() { { CudaDeviceGuard guard(deviceIdx); TP_CUDA_CHECK(cudaStreamWaitEvent(stream, startEvent, 0)); TP_CUDA_CHECK( cudaMemcpyAsync(ptr, srcPtr, length, cudaMemcpyDeviceToDevice, stream)); } CudaEvent stopEv(deviceIdx); stopEv.record(stream); stopEv.wait(srcStream, srcDeviceIdx); } ChannelImpl::ChannelImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::shared_ptr descriptorConnection, std::shared_ptr completionConnection) : ChannelImplBoilerplate( token, std::move(context), std::move(id)), descriptorConnection_(std::move(descriptorConnection)), 
completionConnection_(std::move(completionConnection)) {} void ChannelImpl::initImplFromLoop() { context_->enroll(*this); } void ChannelImpl::sendImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TSendCallback callback) { int deviceIdx = cudaDeviceForPointer( context_->getCudaLib(), buffer.unwrap().ptr); SendOpIter opIter = sendOps_.emplaceBack( sequenceNumber, deviceIdx, buffer.unwrap().ptr, length, buffer.unwrap().stream, std::move(callback)); sendOps_.advanceOperation(opIter); } void ChannelImpl::advanceSendOperation( SendOpIter opIter, SendOperation::State prevOpState) { TP_DCHECK(context_->inLoop()); SendOperation& op = *opIter; sendOps_.attemptTransition( opIter, /*from=*/SendOperation::UNINITIALIZED, /*to=*/SendOperation::FINISHED, /*cond=*/error_ || op.length == 0, /*actions=*/{&ChannelImpl::callSendCallback}); // Needs to go after previous op to ensure predictable and consistent ordering // of write calls on the descriptor control connection and read calls on the // completion control connection. 
sendOps_.attemptTransition( opIter, /*from=*/SendOperation::UNINITIALIZED, /*to=*/SendOperation::READING_COMPLETION, /*cond=*/!error_ && prevOpState >= SendOperation::READING_COMPLETION, /*actions=*/ {&ChannelImpl::writeDescriptor, &ChannelImpl::readCompletion}); sendOps_.attemptTransition( opIter, /*from=*/SendOperation::READING_COMPLETION, /*to=*/SendOperation::FINISHED, /*cond=*/op.doneReadingCompletion, /*actions=*/{&ChannelImpl::callSendCallback}); } void ChannelImpl::writeDescriptor(SendOpIter opIter) { SendOperation& op = *opIter; auto nopHolder = std::make_shared>(); Descriptor& nopDescriptor = nopHolder->getObject(); static_assert(std::is_pointer::value, ""); static_assert(std::is_pointer::value, ""); nopDescriptor.startEvent = reinterpret_cast(op.startEv.raw()); nopDescriptor.srcDeviceIdx = op.deviceIdx; nopDescriptor.srcPtr = reinterpret_cast(op.ptr); nopDescriptor.srcStream = reinterpret_cast(op.stream); TP_VLOG(6) << "Channel " << id_ << " is writing descriptor (#" << op.sequenceNumber << ")"; descriptorConnection_->write( *nopHolder, callbackWrapper_([sequenceNumber{op.sequenceNumber}, nopHolder](ChannelImpl& impl) { TP_VLOG(6) << "Channel " << impl.id_ << " done writing descriptor (#" << sequenceNumber << ")"; })); } void ChannelImpl::readCompletion(SendOpIter opIter) { SendOperation& op = *opIter; TP_VLOG(6) << "Channel " << id_ << " is reading completion (#" << op.sequenceNumber << ")"; completionConnection_->read( nullptr, 0, callbackWrapper_([opIter]( ChannelImpl& impl, const void* /* unused */, size_t /* unused */) { TP_VLOG(6) << "Channel " << impl.id_ << " done reading completion (#" << opIter->sequenceNumber << ")"; opIter->doneReadingCompletion = true; impl.sendOps_.advanceOperation(opIter); })); } void ChannelImpl::callSendCallback(SendOpIter opIter) { SendOperation& op = *opIter; op.callback(error_); // Reset callback to release the resources it was holding. 
op.callback = nullptr; } void ChannelImpl::recvImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TRecvCallback callback) { int deviceIdx = cudaDeviceForPointer( context_->getCudaLib(), buffer.unwrap().ptr); RecvOpIter opIter = recvOps_.emplaceBack( sequenceNumber, deviceIdx, buffer.unwrap(), length, std::move(callback)); recvOps_.advanceOperation(opIter); } void ChannelImpl::advanceRecvOperation( RecvOpIter opIter, RecvOperation::State prevOpState) { TP_DCHECK(context_->inLoop()); RecvOperation& op = *opIter; recvOps_.attemptTransition( opIter, /*from=*/RecvOperation::UNINITIALIZED, /*to=*/RecvOperation::FINISHED, /*cond=*/error_ || op.length == 0, /*actions=*/{&ChannelImpl::callRecvCallback}); // Needs to go after previous op to ensure predictable and consistent ordering // of read calls on the descriptor control connection. recvOps_.attemptTransition( opIter, /*from=*/RecvOperation::UNINITIALIZED, /*to=*/RecvOperation::READING_DESCRIPTOR, /*cond=*/!error_ && prevOpState >= RecvOperation::READING_DESCRIPTOR, /*actions=*/{&ChannelImpl::readDescriptor}); recvOps_.attemptTransition( opIter, /*from=*/RecvOperation::READING_DESCRIPTOR, /*to=*/RecvOperation::FINISHED, /*cond=*/error_ && op.doneReadingDescriptor, /*actions=*/{&ChannelImpl::callRecvCallback}); // Needs to go after previous op to ensure predictable and consistent ordering // of write calls on the completion control connection. 
recvOps_.attemptTransition( opIter, /*from=*/RecvOperation::READING_DESCRIPTOR, /*to=*/RecvOperation::FINISHED, /*cond=*/!error_ && op.doneReadingDescriptor && prevOpState >= RecvOperation::FINISHED, /*actions=*/ {&ChannelImpl::waitOnStartEventAndCopyAndSyncWithSourceStream, &ChannelImpl::callRecvCallback, &ChannelImpl::writeCompletion}); } void ChannelImpl::readDescriptor(RecvOpIter opIter) { RecvOperation& op = *opIter; TP_VLOG(6) << "Channel " << id_ << " is reading descriptor (#" << op.sequenceNumber << ")"; auto nopHolderIn = std::make_shared>(); descriptorConnection_->read( *nopHolderIn, callbackWrapper_([opIter, nopHolderIn](ChannelImpl& impl) { TP_VLOG(6) << "Channel " << impl.id_ << " done reading descriptor (#" << opIter->sequenceNumber << ")"; opIter->doneReadingDescriptor = true; if (!impl.error_) { Descriptor& nopDescriptor = nopHolderIn->getObject(); static_assert(std::is_pointer::value, ""); static_assert(std::is_pointer::value, ""); opIter->startEvent = reinterpret_cast(nopDescriptor.startEvent); opIter->srcPtr = reinterpret_cast(nopDescriptor.srcPtr); opIter->srcDeviceIdx = nopDescriptor.srcDeviceIdx; opIter->srcStream = reinterpret_cast(nopDescriptor.srcStream); } impl.recvOps_.advanceOperation(opIter); })); } void ChannelImpl::waitOnStartEventAndCopyAndSyncWithSourceStream( RecvOpIter opIter) { RecvOperation& op = *opIter; TP_VLOG(6) << "Channel " << id_ << " is copying payload (#" << op.sequenceNumber << ")"; op.process(); TP_VLOG(6) << "Channel " << id_ << " done copying payload (#" << op.sequenceNumber << ")"; } void ChannelImpl::callRecvCallback(RecvOpIter opIter) { RecvOperation& op = *opIter; op.callback(error_); // Reset callback to release the resources it was holding. 
op.callback = nullptr; } void ChannelImpl::writeCompletion(RecvOpIter opIter) { RecvOperation& op = *opIter; TP_VLOG(6) << "Channel " << id_ << " is writing completion (#" << op.sequenceNumber << ")"; completionConnection_->write( nullptr, 0, callbackWrapper_([sequenceNumber{op.sequenceNumber}](ChannelImpl& impl) { TP_VLOG(6) << "Channel " << impl.id_ << " done writing completion (#" << sequenceNumber << ")"; })); } void ChannelImpl::handleErrorImpl() { sendOps_.advanceAllOperations(); recvOps_.advanceAllOperations(); descriptorConnection_->close(); completionConnection_->close(); context_->unenroll(*this); } } // namespace cuda_xth } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cuda_xth/channel_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { namespace cuda_xth { class ContextImpl; struct SendOperation { enum State { UNINITIALIZED, READING_COMPLETION, FINISHED }; // Fields used by the state machine uint64_t sequenceNumber{0}; State state{UNINITIALIZED}; // Progress flags bool doneReadingCompletion{false}; // Arguments at creation int deviceIdx; void* ptr; size_t length; cudaStream_t stream; TSendCallback callback; // Other stuff CudaEvent startEv; SendOperation( int deviceIdx, void* ptr, size_t length, cudaStream_t stream, TSendCallback callback); }; struct RecvOperation { enum State { UNINITIALIZED, READING_DESCRIPTOR, FINISHED }; // Fields used by the state machine uint64_t sequenceNumber{0}; State state{UNINITIALIZED}; // Progress flags bool doneReadingDescriptor{false}; // Arguments at creation void* const ptr; const size_t length; const int deviceIdx; const cudaStream_t stream; TRecvCallback callback; // Other data cudaEvent_t startEvent; const void* srcPtr; int srcDeviceIdx; cudaStream_t srcStream; RecvOperation( int deviceIdx, CudaBuffer buffer, size_t length, TRecvCallback callback); void process(); }; class ChannelImpl final : public ChannelImplBoilerplate { public: ChannelImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::shared_ptr descriptorConnection, std::shared_ptr completionConnection); protected: // Implement the entry points called by ChannelImplBoilerplate. 
void initImplFromLoop() override; void sendImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TSendCallback callback) override; void recvImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TRecvCallback callback) override; void handleErrorImpl() override; private: const std::shared_ptr descriptorConnection_; const std::shared_ptr completionConnection_; OpsStateMachine sendOps_{ *this, &ChannelImpl::advanceSendOperation}; using SendOpIter = decltype(sendOps_)::Iter; OpsStateMachine recvOps_{ *this, &ChannelImpl::advanceRecvOperation}; using RecvOpIter = decltype(recvOps_)::Iter; // State machines for send and recv ops. void advanceSendOperation( SendOpIter opIter, SendOperation::State prevOpState); void advanceRecvOperation( RecvOpIter opIter, RecvOperation::State prevOpState); // Actions (i.e., methods that begin a state transition). // For send operations: void writeDescriptor(SendOpIter opIter); void readCompletion(SendOpIter opIter); void callSendCallback(SendOpIter opIter); // For recv operations: void readDescriptor(RecvOpIter opIter); void waitOnStartEventAndCopyAndSyncWithSourceStream(RecvOpIter opIter); void callRecvCallback(RecvOpIter opIter); void writeCompletion(RecvOpIter opIter); }; } // namespace cuda_xth } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cuda_xth/context_impl.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { namespace cuda_xth { std::shared_ptr ContextImpl::create() { Error error; CudaLib cudaLib; std::tie(error, cudaLib) = CudaLib::create(); if (error) { TP_VLOG(5) << "CUDA XTH channel is not viable because libcuda could not be loaded: " << error.what(); return nullptr; } std::ostringstream oss; auto bootID = getBootID(); TP_THROW_ASSERT_IF(!bootID) << "Unable to read boot_id"; auto nsID = getLinuxNamespaceId(LinuxNamespace::kPid); if (!nsID) { TP_VLOG(5) << "CUDA XTH channel is not viable because it couldn't determine the PID namespace ID"; return nullptr; } oss << bootID.value() << "_" << nsID.value() << "_" << ::getpid(); const std::string domainDescriptor = oss.str(); std::unordered_map deviceDescriptors; for (const auto& device : getCudaDevices(cudaLib)) { cudaDeviceProp props; TP_CUDA_CHECK(cudaGetDeviceProperties(&props, device.index)); // Unified addressing is required for cross-device `cudaMemcpyAsync()`. We // could lift this requirement by adding a fallback to // `cudaMemcpyPeerAsync()`. 
if (!props.unifiedAddressing) { TP_VLOG(4) << "CUDA XTH channel is not viable because CUDA device " << device.index << " does not have unified addressing"; return nullptr; } deviceDescriptors[device] = domainDescriptor; } if (deviceDescriptors.empty()) { return nullptr; } return std::make_shared( std::move(cudaLib), std::move(deviceDescriptors)); } ContextImpl::ContextImpl( CudaLib cudaLib, std::unordered_map deviceDescriptors) : ContextImplBoilerplate( std::move(deviceDescriptors)), cudaLib_(std::move(cudaLib)) {} std::shared_ptr ContextImpl::createChannel( std::vector> connections, Endpoint /* unused */) { TP_DCHECK_EQ(numConnectionsNeeded(), connections.size()); return createChannelInternal( std::move(connections[0]), std::move(connections[1])); } size_t ContextImpl::numConnectionsNeeded() const { return 2; } const CudaLib& ContextImpl::getCudaLib() { return cudaLib_; } void ContextImpl::handleErrorImpl() {} void ContextImpl::joinImpl() {} bool ContextImpl::inLoop() const { return loop_.inLoop(); }; void ContextImpl::deferToLoop(std::function fn) { loop_.deferToLoop(std::move(fn)); }; } // namespace cuda_xth } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cuda_xth/context_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include #include #include namespace tensorpipe { namespace channel { namespace cuda_xth { class ChannelImpl; class ContextImpl final : public ContextImplBoilerplate { public: static std::shared_ptr create(); ContextImpl( CudaLib cudaLib, std::unordered_map deviceDescriptors); std::shared_ptr createChannel( std::vector> connections, Endpoint endpoint); size_t numConnectionsNeeded() const override; const CudaLib& getCudaLib(); // Implement the DeferredExecutor interface. bool inLoop() const override; void deferToLoop(std::function fn) override; protected: // Implement the entry points called by ContextImplBoilerplate. void handleErrorImpl() override; void joinImpl() override; private: OnDemandDeferredExecutor loop_; const CudaLib cudaLib_; }; } // namespace cuda_xth } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cuda_xth/factory.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include namespace tensorpipe { namespace channel { namespace cuda_xth { std::shared_ptr create() { return std::make_shared>(); } } // namespace cuda_xth } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cuda_xth/factory.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include namespace tensorpipe { namespace channel { namespace cuda_xth { std::shared_ptr create(); } // namespace cuda_xth } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/error.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include namespace tensorpipe { namespace channel { std::string ContextClosedError::what() const { return "context closed"; } std::string ChannelClosedError::what() const { return "channel closed"; } std::string ContextNotViableError::what() const { return "context not viable"; } } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/error.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include namespace tensorpipe { namespace channel { class ContextClosedError final : public BaseError { public: ContextClosedError() {} std::string what() const override; }; class ChannelClosedError final : public BaseError { public: ChannelClosedError() {} std::string what() const override; }; class ContextNotViableError final : public BaseError { public: ContextNotViableError() {} std::string what() const override; }; } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/helpers.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include namespace tensorpipe { namespace channel { std::string saveDescriptor(const AbstractNopHolder& object) { const size_t len = object.getSize(); std::string out(len, '\0'); NopWriter writer( const_cast(reinterpret_cast(out.data())), len); nop::Status status = object.write(writer); TP_THROW_ASSERT_IF(status.has_error()) << "Error saving descriptor: " << status.GetErrorMessage(); return out; } void loadDescriptor(AbstractNopHolder& object, const std::string& in) { const size_t len = in.size(); NopReader reader(reinterpret_cast(in.data()), len); nop::Status status = object.read(reader); TP_THROW_ASSERT_IF(status.has_error()) << "Error loading descriptor: " << status.GetErrorMessage(); } } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/helpers.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once // Note: never include this file from headers! #include #include namespace tensorpipe { namespace channel { std::string saveDescriptor(const AbstractNopHolder& object); void loadDescriptor(AbstractNopHolder& object, const std::string& in); } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/mpt/channel_impl.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { namespace mpt { ChannelImpl::ChannelImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::shared_ptr connection, Endpoint endpoint, uint64_t numLanes) : ChannelImplBoilerplate( token, std::move(context), std::move(id)), connection_(std::move(connection)), endpoint_(endpoint), numLanes_(numLanes), lanes_(numLanes_) {} void ChannelImpl::initImplFromLoop() { context_->enroll(*this); TP_DCHECK_EQ(state_, UNINITIALIZED); if (endpoint_ == Endpoint::kConnect) { state_ = CLIENT_READING_HELLO; auto nopHolderIn = std::make_shared>(); TP_VLOG(6) << "Channel " << id_ << " reading nop object (server hello)"; connection_->read( *nopHolderIn, callbackWrapper_([nopHolderIn](ChannelImpl& impl) { TP_VLOG(6) << "Channel " << impl.id_ << " done reading nop object (server hello)"; if (!impl.error_) { impl.onClientReadHelloOnConnection(nopHolderIn->getObject()); } })); } else if (endpoint_ == Endpoint::kListen) { state_ = SERVER_ACCEPTING_LANES; const std::vector& addresses = context_->addresses(); TP_DCHECK_EQ(addresses.size(), numLanes_); auto nopHolderOut = std::make_shared>(); Packet& nopPacket = nopHolderOut->getObject(); nopPacket.Become(nopPacket.index_of()); ServerHello& nopServerHello = *nopPacket.get(); for (uint64_t laneIdx = 0; laneIdx < numLanes_; ++laneIdx) { nopServerHello.laneAdvertisements.emplace_back(); LaneAdvertisement& nopLaneAdvertisement = nopServerHello.laneAdvertisements.back(); nopLaneAdvertisement.address = addresses[laneIdx]; TP_VLOG(6) << "Channel " << id_ << " requesting connection (for lane " << laneIdx << ")"; uint64_t token = context_->registerConnectionRequest( laneIdx, callbackWrapper_( [laneIdx]( ChannelImpl& impl, std::shared_ptr connection) { TP_VLOG(6) << "Channel " << impl.id_ << " done requesting connection (for lane " << laneIdx << ")"; if (!impl.error_) { impl.onServerAcceptOfLane(laneIdx, 
std::move(connection)); } })); laneRegistrationIds_.emplace(laneIdx, token); nopLaneAdvertisement.registrationId = token; numLanesBeingAccepted_++; } TP_VLOG(6) << "Channel " << id_ << " writing nop object (server hello)"; connection_->write( *nopHolderOut, callbackWrapper_([nopHolderOut](ChannelImpl& impl) { TP_VLOG(6) << "Channel " << impl.id_ << " done writing nop object (server hello)"; })); } else { TP_THROW_ASSERT() << "unknown endpoint"; } } void ChannelImpl::onClientReadHelloOnConnection(const Packet& nopPacketIn) { TP_DCHECK(context_->inLoop()); TP_DCHECK_EQ(state_, CLIENT_READING_HELLO); TP_DCHECK_EQ(nopPacketIn.index(), nopPacketIn.index_of()); const ServerHello& nopServerHello = *nopPacketIn.get(); TP_DCHECK_EQ(nopServerHello.laneAdvertisements.size(), numLanes_); lanes_.resize(numLanes_); for (uint64_t laneIdx = 0; laneIdx < numLanes_; ++laneIdx) { const LaneAdvertisement& nopLaneAdvertisement = nopServerHello.laneAdvertisements[laneIdx]; std::shared_ptr lane = context_->connect(laneIdx, nopLaneAdvertisement.address); auto nopHolderOut = std::make_shared>(); Packet& nopPacket = nopHolderOut->getObject(); nopPacket.Become(nopPacket.index_of()); ClientHello& nopClientHello = *nopPacket.get(); nopClientHello.registrationId = nopLaneAdvertisement.registrationId; TP_VLOG(6) << "Channel " << id_ << " writing nop object (client hello) on lane " << laneIdx; lane->write( *nopHolderOut, callbackWrapper_([laneIdx, nopHolderOut](ChannelImpl& impl) { TP_VLOG(6) << "Channel " << impl.id_ << " done writing nop object (client hello) on lane " << laneIdx; })); lanes_[laneIdx] = std::move(lane); } state_ = ESTABLISHED; sendOps_.advanceAllOperations(); recvOps_.advanceAllOperations(); } void ChannelImpl::onServerAcceptOfLane( uint64_t laneIdx, std::shared_ptr connection) { TP_DCHECK(context_->inLoop()); TP_DCHECK_EQ(state_, SERVER_ACCEPTING_LANES); TP_DCHECK(!lanes_[laneIdx]); TP_DCHECK_LT(laneIdx, lanes_.size()); lanes_[laneIdx] = std::move(connection); auto 
laneRegistrationIter = laneRegistrationIds_.find(laneIdx); TP_DCHECK(laneRegistrationIter != laneRegistrationIds_.end()); context_->unregisterConnectionRequest(laneRegistrationIter->second); laneRegistrationIds_.erase(laneRegistrationIter); numLanesBeingAccepted_--; if (numLanesBeingAccepted_ == 0) { state_ = ESTABLISHED; sendOps_.advanceAllOperations(); recvOps_.advanceAllOperations(); } } void ChannelImpl::sendImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TSendCallback callback) { SendOpIter opIter = sendOps_.emplaceBack(sequenceNumber); SendOperation& op = *opIter; op.ptr = buffer.unwrap().ptr; op.length = length; op.callback = std::move(callback); sendOps_.advanceOperation(opIter); } void ChannelImpl::advanceSendOperation( SendOpIter opIter, SendOperation::State prevOpState) { TP_DCHECK(context_->inLoop()); SendOperation& op = *opIter; sendOps_.attemptTransition( opIter, /*from=*/SendOperation::UNINITIALIZED, /*to=*/SendOperation::FINISHED, /*cond=*/error_ || op.length == 0, /*actions=*/{&ChannelImpl::callSendCallback}); // Needs to go after previous op to ensure predictable and consistent ordering // of write calls on lanes. sendOps_.attemptTransition( opIter, /*from=*/SendOperation::UNINITIALIZED, /*to=*/SendOperation::WRITING_CHUNKS, /*cond=*/!error_ && state_ == ESTABLISHED && prevOpState >= SendOperation::WRITING_CHUNKS, /*actions=*/{&ChannelImpl::writeChunks}); sendOps_.attemptTransition( opIter, /*from=*/SendOperation::WRITING_CHUNKS, /*to=*/SendOperation::FINISHED, /*cond=*/op.numChunksBeingWritten == 0, /*actions=*/{&ChannelImpl::callSendCallback}); } void ChannelImpl::writeChunks(SendOpIter opIter) { SendOperation& op = *opIter; for (uint64_t laneIdx = 0; laneIdx < lanes_.size(); laneIdx++) { // Insert "cutpoints" at equally-spaced intervals in the buffer, rounding // them down if they don't end up being at an integer position. 
uint64_t offsetStart = op.length * laneIdx / lanes_.size();
    uint64_t offsetEnd = op.length * (laneIdx + 1) / lanes_.size();
    // As void "has no size" we cannot do pointer arithmetic on it. We need to
    // temporarily convert the pointer to a type that has a size of 1 byte.
    // NOTE(review): the cast's template argument appears to have been lost in
    // extraction (presumably const uint8_t*) -- confirm against upstream.
    const void* ptr = reinterpret_cast(op.ptr) + offsetStart;
    uint64_t length = offsetEnd - offsetStart;

    // Write payload.
    TP_VLOG(6) << "Channel " << id_ << " writing payload #" << op.sequenceNumber
               << " on lane " << laneIdx;
    lanes_[laneIdx]->write(
        ptr, length, callbackWrapper_([opIter, laneIdx](ChannelImpl& impl) {
          TP_VLOG(6) << "Channel " << impl.id_ << " done writing payload #"
                     << opIter->sequenceNumber << " on lane " << laneIdx;
          // One fewer chunk in flight; re-advance the op so it can reach
          // FINISHED once the count hits zero.
          --opIter->numChunksBeingWritten;
          impl.sendOps_.advanceOperation(opIter);
        }));
    ++op.numChunksBeingWritten;
  }
}

// Invokes, then releases, the user's send callback with the channel's current
// error state.
void ChannelImpl::callSendCallback(SendOpIter opIter) {
  SendOperation& op = *opIter;
  op.callback(error_);
  // Reset callback to release the resources it was holding.
  op.callback = nullptr;
}

// Records a new recv operation and runs its state machine once.
void ChannelImpl::recvImplFromLoop(
    uint64_t sequenceNumber,
    Buffer buffer,
    size_t length,
    TRecvCallback callback) {
  RecvOpIter opIter = recvOps_.emplaceBack(sequenceNumber);
  RecvOperation& op = *opIter;
  op.ptr = buffer.unwrap().ptr;
  op.length = length;
  op.callback = std::move(callback);

  recvOps_.advanceOperation(opIter);
}

// State machine for recv ops: error/empty buffers finish immediately;
// otherwise chunks are read on all lanes once the channel is established.
void ChannelImpl::advanceRecvOperation(
    RecvOpIter opIter,
    RecvOperation::State prevOpState) {
  TP_DCHECK(context_->inLoop());
  RecvOperation& op = *opIter;

  recvOps_.attemptTransition(
      opIter,
      /*from=*/RecvOperation::UNINITIALIZED,
      /*to=*/RecvOperation::FINISHED,
      /*cond=*/error_ || op.length == 0,
      /*actions=*/{&ChannelImpl::callRecvCallback});

  // Needs to go after previous op to ensure predictable and consistent ordering
  // of read calls on lanes.
recvOps_.attemptTransition(
      opIter,
      /*from=*/RecvOperation::UNINITIALIZED,
      /*to=*/RecvOperation::READING_CHUNKS,
      /*cond=*/!error_ && state_ == ESTABLISHED &&
          prevOpState >= RecvOperation::READING_CHUNKS,
      /*actions=*/{&ChannelImpl::readChunks});

  // The op is done once every per-lane chunk read has completed.
  recvOps_.attemptTransition(
      opIter,
      /*from=*/RecvOperation::READING_CHUNKS,
      /*to=*/RecvOperation::FINISHED,
      /*cond=*/op.numChunksBeingRead == 0,
      /*actions=*/{&ChannelImpl::callRecvCallback});
}

// Splits the destination buffer into one contiguous chunk per lane and issues
// a read on each lane, mirroring writeChunks on the send side. The cutpoint
// arithmetic must match writeChunks exactly so both sides agree on chunk
// boundaries.
void ChannelImpl::readChunks(RecvOpIter opIter) {
  RecvOperation& op = *opIter;
  for (uint64_t laneIdx = 0; laneIdx < lanes_.size(); laneIdx++) {
    // Insert "cutpoints" at equally-spaced intervals in the buffer, rounding
    // them down if they don't end up being at an integer position.
    uint64_t offsetStart = op.length * laneIdx / lanes_.size();
    uint64_t offsetEnd = op.length * (laneIdx + 1) / lanes_.size();
    // As void "has no size" we cannot do pointer arithmetic on it. We need to
    // temporarily convert the pointer to a type that has a size of 1 byte.
    // NOTE(review): the cast's template argument appears to have been lost in
    // extraction (presumably uint8_t*) -- confirm against upstream.
    void* ptr = reinterpret_cast(op.ptr) + offsetStart;
    uint64_t length = offsetEnd - offsetStart;

    // Read payload.
    TP_VLOG(6) << "Channel " << id_ << " reading payload #" << op.sequenceNumber
               << " on lane " << laneIdx;
    lanes_[laneIdx]->read(
        ptr,
        length,
        callbackWrapper_([opIter, laneIdx](
                             ChannelImpl& impl,
                             const void* /* unused */,
                             size_t /* unused */) {
          TP_VLOG(6) << "Channel " << impl.id_ << " done reading payload #"
                     << opIter->sequenceNumber << " on lane " << laneIdx;
          // One fewer chunk in flight; re-advance the op so it can reach
          // FINISHED once the count hits zero.
          --opIter->numChunksBeingRead;
          impl.recvOps_.advanceOperation(opIter);
        }));
    ++op.numChunksBeingRead;
  }
}

// Invokes, then releases, the user's recv callback with the channel's current
// error state.
void ChannelImpl::callRecvCallback(RecvOpIter opIter) {
  RecvOperation& op = *opIter;
  op.callback(error_);
  // Reset callback to release the resources it was holding.
  op.callback = nullptr;
}

// Error path: drain both state machines, then abort all in-flight I/O.
void ChannelImpl::handleErrorImpl() {
  sendOps_.advanceAllOperations();
  recvOps_.advanceAllOperations();

  // Close the connections so that all current operations will be aborted.
This // will cause their callbacks to be invoked, and only then we'll invoke ours. connection_->close(); for (auto& lane : lanes_) { if (lane) { lane->close(); } } for (const auto& iter : laneRegistrationIds_) { context_->unregisterConnectionRequest(iter.second); } context_->unenroll(*this); } // TODO Implement setIdImpl to propagate the ID to the connections } // namespace mpt } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/mpt/channel_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { namespace mpt { class ContextImpl; // State capturing a single send operation. struct SendOperation { enum State { UNINITIALIZED, WRITING_CHUNKS, FINISHED }; // Fields used by the state machine uint64_t sequenceNumber{0}; State state{UNINITIALIZED}; // Progress flags int64_t numChunksBeingWritten{0}; // Arguments at creation const void* ptr; size_t length; TSendCallback callback; }; // State capturing a single recv operation. struct RecvOperation { enum State { UNINITIALIZED, READING_CHUNKS, FINISHED }; // Fields used by the state machine uint64_t sequenceNumber{0}; State state{UNINITIALIZED}; // Progress flags int64_t numChunksBeingRead{0}; // Arguments at creation void* ptr; size_t length; TRecvCallback callback; }; class ChannelImpl final : public ChannelImplBoilerplate { public: ChannelImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::shared_ptr connection, Endpoint endpoint, uint64_t numLanes); protected: // Implement the entry points called by ChannelImplBoilerplate. 
void initImplFromLoop() override; void sendImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TSendCallback callback) override; void recvImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TRecvCallback callback) override; void handleErrorImpl() override; private: enum State { UNINITIALIZED, CLIENT_READING_HELLO, SERVER_ACCEPTING_LANES, ESTABLISHED, }; // Called when client reads the server's hello on backbone connection void onClientReadHelloOnConnection(const Packet& nopPacketIn); // Called when server accepts new client connection for lane void onServerAcceptOfLane( uint64_t laneIdx, std::shared_ptr connection); const std::shared_ptr connection_; const Endpoint endpoint_; State state_{UNINITIALIZED}; const uint64_t numLanes_; uint64_t numLanesBeingAccepted_{0}; std::vector> lanes_; std::unordered_map laneRegistrationIds_; OpsStateMachine sendOps_{ *this, &ChannelImpl::advanceSendOperation}; using SendOpIter = decltype(sendOps_)::Iter; OpsStateMachine recvOps_{ *this, &ChannelImpl::advanceRecvOperation}; using RecvOpIter = decltype(recvOps_)::Iter; // State machines for send and recv ops. void advanceSendOperation( SendOpIter opIter, SendOperation::State prevOpState); void advanceRecvOperation( RecvOpIter opIter, RecvOperation::State prevOpState); // Actions (i.e., methods that begin a state transition). // For send operations: void writeChunks(SendOpIter opIter); void callSendCallback(SendOpIter opIter); // For recv operations: void readChunks(RecvOpIter opIter); void callRecvCallback(RecvOpIter opIter); }; } // namespace mpt } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/mpt/context_impl.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { namespace mpt { namespace { std::string generateDomainDescriptor( const std::vector>& contexts) { // FIXME Escape the contexts' domain descriptors in case they contain a colon? // Or put them all in a nop object, that'll do the escaping for us. // But is it okay to compare nop objects by equality bitwise? std::ostringstream ss; ss << contexts.size(); for (const auto& context : contexts) { ss << ":" << context->domainDescriptor(); } return ss.str(); } } // namespace std::shared_ptr ContextImpl::create( std::vector> contexts, std::vector> listeners) { for (const auto& context : contexts) { if (!context->isViable()) { return nullptr; } } std::unordered_map deviceDescriptors = { {Device{kCpuDeviceType, 0}, generateDomainDescriptor(contexts)}}; return std::make_shared( std::move(contexts), std::move(listeners), std::move(deviceDescriptors)); } ContextImpl::ContextImpl( std::vector> contexts, std::vector> listeners, std::unordered_map deviceDescriptors) : ContextImplBoilerplate( std::move(deviceDescriptors)), contexts_(std::move(contexts)), listeners_(std::move(listeners)) { TP_THROW_ASSERT_IF(contexts_.size() != listeners_.size()); numLanes_ = contexts_.size(); addresses_.reserve(numLanes_); for (const auto& listener : listeners_) { addresses_.emplace_back(listener->addr()); } } void ContextImpl::initImplFromLoop() { for (uint64_t laneIdx = 0; laneIdx < numLanes_; ++laneIdx) { acceptLane(laneIdx); } } std::shared_ptr ContextImpl::createChannel( std::vector> connections, Endpoint endpoint) { TP_DCHECK_EQ(numConnectionsNeeded(), connections.size()); return createChannelInternal(std::move(connections[0]), endpoint, numLanes_); } const std::vector& ContextImpl::addresses() const { // As this is an immutable member (after it has been initialized in // the constructor), we'll access it without deferring to the loop. 
return addresses_; } uint64_t ContextImpl::registerConnectionRequest( uint64_t laneIdx, connection_request_callback_fn fn) { TP_DCHECK(loop_.inLoop()); uint64_t registrationId = nextConnectionRequestRegistrationId_++; TP_VLOG(4) << "Channel context " << id_ << " received a connection request registration (#" << registrationId << ") on lane " << laneIdx; fn = [this, registrationId, fn{std::move(fn)}]( const Error& error, std::shared_ptr connection) { TP_VLOG(4) << "Channel context " << id_ << " calling a connection request registration callback (#" << registrationId << ")"; fn(error, std::move(connection)); TP_VLOG(4) << "Channel context " << id_ << " done calling a connection request registration callback (#" << registrationId << ")"; }; if (error_) { fn(error_, std::shared_ptr()); } else { connectionRequestRegistrations_.emplace(registrationId, std::move(fn)); } return registrationId; } void ContextImpl::unregisterConnectionRequest(uint64_t registrationId) { TP_DCHECK(loop_.inLoop()); TP_VLOG(4) << "Channel context " << id_ << " received a connection request de-registration (#" << registrationId << ")"; connectionRequestRegistrations_.erase(registrationId); } std::shared_ptr ContextImpl::connect( uint64_t laneIdx, std::string address) { TP_VLOG(4) << "Channel context " << id_ << " opening connection on lane " << laneIdx; return contexts_[laneIdx]->connect(std::move(address)); } void ContextImpl::acceptLane(uint64_t laneIdx) { TP_DCHECK(loop_.inLoop()); TP_VLOG(6) << "Channel context " << id_ << " accepting connection on lane " << laneIdx; listeners_[laneIdx]->accept( callbackWrapper_([laneIdx]( ContextImpl& impl, std::shared_ptr connection) { TP_VLOG(6) << "Channel context " << impl.id_ << " done accepting connection on lane " << laneIdx; if (impl.error_) { return; } impl.onAcceptOfLane(std::move(connection)); impl.acceptLane(laneIdx); })); } void ContextImpl::onAcceptOfLane( std::shared_ptr connection) { TP_DCHECK(loop_.inLoop()); // Keep it alive until we figure 
out what to do with it. connectionsWaitingForHello_.insert(connection); auto npHolderIn = std::make_shared>(); TP_VLOG(6) << "Channel context " << id_ << " reading nop object (client hello)"; connection->read( *npHolderIn, callbackWrapper_([npHolderIn, connection](ContextImpl& impl) mutable { TP_VLOG(6) << "Channel context " << impl.id_ << " done reading nop object (client hello)"; if (impl.error_) { return; } impl.connectionsWaitingForHello_.erase(connection); impl.onReadClientHelloOnLane( std::move(connection), npHolderIn->getObject()); })); } void ContextImpl::onReadClientHelloOnLane( std::shared_ptr connection, const Packet& nopPacketIn) { TP_DCHECK(loop_.inLoop()); TP_DCHECK_EQ(nopPacketIn.index(), nopPacketIn.index_of()); const ClientHello& nopClientHello = *nopPacketIn.get(); uint64_t registrationId = nopClientHello.registrationId; auto iter = connectionRequestRegistrations_.find(registrationId); // The connection request may have already been deregistered, for example // because the channel may have been closed. 
if (iter != connectionRequestRegistrations_.end()) { auto fn = std::move(iter->second); connectionRequestRegistrations_.erase(iter); fn(Error::kSuccess, std::move(connection)); } } void ContextImpl::handleErrorImpl() { for (auto& iter : connectionRequestRegistrations_) { connection_request_callback_fn fn = std::move(iter.second); fn(error_, std::shared_ptr()); } connectionRequestRegistrations_.clear(); for (const auto& connection : connectionsWaitingForHello_) { connection->close(); } connectionsWaitingForHello_.clear(); for (auto& listener : listeners_) { listener->close(); } for (auto& context : contexts_) { context->close(); } } void ContextImpl::setIdImpl() { for (uint64_t laneIdx = 0; laneIdx < numLanes_; ++laneIdx) { contexts_[laneIdx]->setId(id_ + ".ctx_" + std::to_string(laneIdx)); listeners_[laneIdx]->setId( id_ + ".ctx_" + std::to_string(laneIdx) + ".l_" + std::to_string(laneIdx)); } } void ContextImpl::joinImpl() { for (auto& context : contexts_) { context->join(); } } bool ContextImpl::inLoop() const { return loop_.inLoop(); }; void ContextImpl::deferToLoop(std::function fn) { loop_.deferToLoop(std::move(fn)); }; } // namespace mpt } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/mpt/context_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { namespace mpt { class ChannelImpl; class ContextImpl final : public ContextImplBoilerplate { public: static std::shared_ptr create( std::vector> contexts, std::vector> listeners); ContextImpl( std::vector> contexts, std::vector> listeners, std::unordered_map deviceDescriptors); std::shared_ptr createChannel( std::vector> connections, Endpoint endpoint); // Implement the DeferredExecutor interface. bool inLoop() const override; void deferToLoop(std::function fn) override; using connection_request_callback_fn = std::function)>; const std::vector& addresses() const; uint64_t registerConnectionRequest( uint64_t laneIdx, connection_request_callback_fn fn); void unregisterConnectionRequest(uint64_t registrationId); std::shared_ptr connect( uint64_t laneIdx, std::string address); protected: // Implement the entry points called by ContextImplBoilerplate. void initImplFromLoop() override; void handleErrorImpl() override; void joinImpl() override; void setIdImpl() override; private: OnDemandDeferredExecutor loop_; void acceptLane(uint64_t laneIdx); void onAcceptOfLane(std::shared_ptr connection); void onReadClientHelloOnLane( std::shared_ptr connection, const Packet& nopPacketIn); const std::vector> contexts_; const std::vector> listeners_; uint64_t numLanes_{0}; std::vector addresses_; uint64_t nextConnectionRequestRegistrationId_{0}; // Needed to keep them alive. std::unordered_set> connectionsWaitingForHello_; std::unordered_map connectionRequestRegistrations_; }; } // namespace mpt } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/mpt/factory.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include namespace tensorpipe { namespace channel { namespace mpt { std::shared_ptr create( std::vector> contexts, std::vector> listeners) { return std::make_shared>( std::move(contexts), std::move(listeners)); } } // namespace mpt } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/mpt/factory.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include namespace tensorpipe { namespace channel { namespace mpt { std::shared_ptr create( std::vector> contexts, std::vector> listeners); } // namespace mpt } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/mpt/nop_types.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include namespace tensorpipe { namespace channel { namespace mpt { struct LaneAdvertisement { // This pointless constructor is needed to work around a bug in GCC 5.5 (and // possibly other versions). It appears to be needed in the nop types that are // used inside std::vectors. 
LaneAdvertisement() {} std::string address; uint64_t registrationId; NOP_STRUCTURE(LaneAdvertisement, address, registrationId); }; struct ServerHello { std::vector laneAdvertisements; NOP_STRUCTURE(ServerHello, laneAdvertisements); }; struct ClientHello { uint64_t registrationId; NOP_STRUCTURE(ClientHello, registrationId); }; using Packet = nop::Variant; } // namespace mpt } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/xth/channel_impl.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { namespace xth { namespace { struct Descriptor { uint64_t ptr; NOP_STRUCTURE(Descriptor, ptr); }; } // namespace ChannelImpl::ChannelImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::shared_ptr descriptorConnection, std::shared_ptr completionConnection) : ChannelImplBoilerplate( token, std::move(context), std::move(id)), descriptorConnection_(std::move(descriptorConnection)), completionConnection_(std::move(completionConnection)) {} void ChannelImpl::initImplFromLoop() { context_->enroll(*this); } void ChannelImpl::sendImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TSendCallback callback) { SendOpIter opIter = sendOps_.emplaceBack(sequenceNumber); SendOperation& op = *opIter; op.ptr = buffer.unwrap().ptr; op.length = length; op.callback = std::move(callback); sendOps_.advanceOperation(opIter); } void ChannelImpl::advanceSendOperation( SendOpIter opIter, SendOperation::State prevOpState) { TP_DCHECK(context_->inLoop()); SendOperation& op = *opIter; sendOps_.attemptTransition( opIter, 
/*from=*/SendOperation::UNINITIALIZED, /*to=*/SendOperation::FINISHED, /*cond=*/error_ || op.length == 0, /*actions=*/{&ChannelImpl::callSendCallback}); // Needs to go after previous op to ensure predictable and consistent ordering // of write calls on the descriptor control connection and read calls on the // completion control connection. sendOps_.attemptTransition( opIter, /*from=*/SendOperation::UNINITIALIZED, /*to=*/SendOperation::READING_COMPLETION, /*cond=*/!error_ && prevOpState >= SendOperation::READING_COMPLETION, /*actions=*/ {&ChannelImpl::writeDescriptor, &ChannelImpl::readCompletion}); sendOps_.attemptTransition( opIter, /*from=*/SendOperation::READING_COMPLETION, /*to=*/SendOperation::FINISHED, /*cond=*/op.doneReadingCompletion, /*actions=*/{&ChannelImpl::callSendCallback}); } void ChannelImpl::writeDescriptor(SendOpIter opIter) { SendOperation& op = *opIter; auto nopHolder = std::make_shared>(); Descriptor& nopDescriptor = nopHolder->getObject(); nopDescriptor.ptr = reinterpret_cast(op.ptr); TP_VLOG(6) << "Channel " << id_ << " is writing descriptor (#" << op.sequenceNumber << ")"; descriptorConnection_->write( *nopHolder, callbackWrapper_([sequenceNumber{op.sequenceNumber}, nopHolder](ChannelImpl& impl) { TP_VLOG(6) << "Channel " << impl.id_ << " done writing descriptor (#" << sequenceNumber << ")"; })); } void ChannelImpl::readCompletion(SendOpIter opIter) { SendOperation& op = *opIter; TP_VLOG(6) << "Channel " << id_ << " is reading completion (#" << op.sequenceNumber << ")"; completionConnection_->read( nullptr, 0, callbackWrapper_([opIter]( ChannelImpl& impl, const void* /* unused */, size_t /* unused */) { TP_VLOG(6) << "Channel " << impl.id_ << " done reading completion (#" << opIter->sequenceNumber << ")"; opIter->doneReadingCompletion = true; impl.sendOps_.advanceOperation(opIter); })); } void ChannelImpl::callSendCallback(SendOpIter opIter) { SendOperation& op = *opIter; op.callback(error_); // Reset callback to release the resources it 
was holding. op.callback = nullptr; } void ChannelImpl::recvImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TRecvCallback callback) { RecvOpIter opIter = recvOps_.emplaceBack(sequenceNumber); RecvOperation& op = *opIter; op.ptr = buffer.unwrap().ptr; op.length = length; op.callback = std::move(callback); recvOps_.advanceOperation(opIter); } void ChannelImpl::advanceRecvOperation( RecvOpIter opIter, RecvOperation::State prevOpState) { TP_DCHECK(context_->inLoop()); RecvOperation& op = *opIter; recvOps_.attemptTransition( opIter, /*from=*/RecvOperation::UNINITIALIZED, /*to=*/RecvOperation::FINISHED, /*cond=*/error_ || op.length == 0, /*actions=*/{&ChannelImpl::callRecvCallback}); // Needs to go after previous op to ensure predictable and consistent ordering // of read calls on the descriptor control connection. recvOps_.attemptTransition( opIter, /*from=*/RecvOperation::UNINITIALIZED, /*to=*/RecvOperation::READING_DESCRIPTOR, /*cond=*/!error_ && prevOpState >= RecvOperation::READING_DESCRIPTOR, /*actions=*/{&ChannelImpl::readDescriptor}); recvOps_.attemptTransition( opIter, /*from=*/RecvOperation::READING_DESCRIPTOR, /*to=*/RecvOperation::FINISHED, /*cond=*/error_ && op.doneReadingDescriptor, /*actions=*/{&ChannelImpl::callRecvCallback}); recvOps_.attemptTransition( opIter, /*from=*/RecvOperation::READING_DESCRIPTOR, /*to=*/RecvOperation::COPYING, /*cond=*/!error_ && op.doneReadingDescriptor, /*actions=*/{&ChannelImpl::copy}); recvOps_.attemptTransition( opIter, /*from=*/RecvOperation::COPYING, /*to=*/RecvOperation::FINISHED, /*cond=*/error_ && op.doneCopying, /*actions=*/{&ChannelImpl::callRecvCallback}); // Needs to go after previous op to ensure predictable and consistent ordering // of write calls on the completion control connection. 
recvOps_.attemptTransition( opIter, /*from=*/RecvOperation::COPYING, /*to=*/RecvOperation::FINISHED, /*cond=*/!error_ && op.doneCopying && prevOpState >= RecvOperation::FINISHED, /*actions=*/ {&ChannelImpl::callRecvCallback, &ChannelImpl::writeCompletion}); } void ChannelImpl::readDescriptor(RecvOpIter opIter) { RecvOperation& op = *opIter; TP_VLOG(6) << "Channel " << id_ << " is reading descriptor (#" << op.sequenceNumber << ")"; auto nopHolderIn = std::make_shared>(); descriptorConnection_->read( *nopHolderIn, callbackWrapper_([opIter, nopHolderIn](ChannelImpl& impl) { TP_VLOG(6) << "Channel " << impl.id_ << " done reading descriptor (#" << opIter->sequenceNumber << ")"; opIter->doneReadingDescriptor = true; if (!impl.error_) { Descriptor& nopDescriptor = nopHolderIn->getObject(); opIter->remotePtr = reinterpret_cast(nopDescriptor.ptr); } impl.recvOps_.advanceOperation(opIter); })); } void ChannelImpl::copy(RecvOpIter opIter) { RecvOperation& op = *opIter; TP_VLOG(6) << "Channel " << id_ << " is copying payload (#" << op.sequenceNumber << ")"; context_->requestCopy( op.remotePtr, op.ptr, op.length, callbackWrapper_([opIter](ChannelImpl& impl) { TP_VLOG(6) << "Channel " << impl.id_ << " done copying payload (#" << opIter->sequenceNumber << ")"; opIter->doneCopying = true; impl.recvOps_.advanceOperation(opIter); })); } void ChannelImpl::callRecvCallback(RecvOpIter opIter) { RecvOperation& op = *opIter; op.callback(error_); // Reset callback to release the resources it was holding. 
op.callback = nullptr; } void ChannelImpl::writeCompletion(RecvOpIter opIter) { RecvOperation& op = *opIter; TP_VLOG(6) << "Channel " << id_ << " is writing completion (#" << op.sequenceNumber << ")"; completionConnection_->write( nullptr, 0, callbackWrapper_([sequenceNumber{op.sequenceNumber}](ChannelImpl& impl) { TP_VLOG(6) << "Channel " << impl.id_ << " done writing completion (#" << sequenceNumber << ")"; })); } void ChannelImpl::handleErrorImpl() { sendOps_.advanceAllOperations(); recvOps_.advanceAllOperations(); descriptorConnection_->close(); completionConnection_->close(); context_->unenroll(*this); } } // namespace xth } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/xth/channel_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include #include #include #include namespace tensorpipe { namespace channel { namespace xth { class ContextImpl; struct SendOperation { enum State { UNINITIALIZED, READING_COMPLETION, FINISHED }; // Fields used by the state machine uint64_t sequenceNumber{0}; State state{UNINITIALIZED}; // Progress flags bool doneReadingCompletion{false}; // Arguments at creation void* ptr; size_t length; TSendCallback callback; }; struct RecvOperation { enum State { UNINITIALIZED, READING_DESCRIPTOR, COPYING, FINISHED }; // Fields used by the state machine uint64_t sequenceNumber{0}; State state{UNINITIALIZED}; // Progress flags bool doneReadingDescriptor{false}; bool doneCopying{false}; // Arguments at creation void* ptr; size_t length; TRecvCallback callback; // Other data void* remotePtr; }; class ChannelImpl final : public ChannelImplBoilerplate { public: ChannelImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::shared_ptr descriptorConnection, std::shared_ptr completionConnection); protected: // Implement the entry points called by ChannelImplBoilerplate. void initImplFromLoop() override; void sendImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TSendCallback callback) override; void recvImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TRecvCallback callback) override; void handleErrorImpl() override; private: const std::shared_ptr descriptorConnection_; const std::shared_ptr completionConnection_; OpsStateMachine sendOps_{ *this, &ChannelImpl::advanceSendOperation}; using SendOpIter = decltype(sendOps_)::Iter; OpsStateMachine recvOps_{ *this, &ChannelImpl::advanceRecvOperation}; using RecvOpIter = decltype(recvOps_)::Iter; // State machines for send and recv ops. void advanceSendOperation( SendOpIter opIter, SendOperation::State prevOpState); void advanceRecvOperation( RecvOpIter opIter, RecvOperation::State prevOpState); // Actions (i.e., methods that begin a state transition). 
// For send operations: void writeDescriptor(SendOpIter opIter); void readCompletion(SendOpIter opIter); void callSendCallback(SendOpIter opIter); // For recv operations: void readDescriptor(RecvOpIter opIter); void copy(RecvOpIter opIter); void callRecvCallback(RecvOpIter opIter); void writeCompletion(RecvOpIter opIter); }; } // namespace xth } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/xth/context_impl.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { namespace xth { std::shared_ptr ContextImpl::create() { std::ostringstream oss; auto bootID = getBootID(); TP_THROW_ASSERT_IF(!bootID) << "Unable to read boot_id"; auto nsID = getLinuxNamespaceId(LinuxNamespace::kPid); if (!nsID.has_value()) { TP_VLOG(5) << "XTH channel is not viable because it couldn't determine the PID namespace ID"; return nullptr; } oss << bootID.value() << "_" << nsID.value() << "_" << ::getpid(); const std::string domainDescriptor = oss.str(); std::unordered_map deviceDescriptors = { {Device{kCpuDeviceType, 0}, domainDescriptor}}; return std::make_shared(std::move(deviceDescriptors)); } ContextImpl::ContextImpl( std::unordered_map deviceDescriptors) : ContextImplBoilerplate( std::move(deviceDescriptors)), requests_(std::numeric_limits::max()) { thread_ = std::thread(&ContextImpl::handleCopyRequests, this); } std::shared_ptr ContextImpl::createChannel( std::vector> connections, Endpoint /* unused */) { TP_DCHECK_EQ(numConnectionsNeeded(), connections.size()); return createChannelInternal( std::move(connections[0]), std::move(connections[1])); } size_t 
ContextImpl::numConnectionsNeeded() const {
  // The XTH channel uses two control connections per channel: one carrying
  // descriptors, one carrying completion notifications (see createChannel).
  return 2;
}

void ContextImpl::handleErrorImpl() {
  // Push an empty optional: handleCopyRequests treats it as the sentinel
  // that makes the copy thread exit its loop.
  requests_.push(nullopt);
}

void ContextImpl::joinImpl() {
  // handleErrorImpl has already queued the shutdown sentinel, so the copy
  // thread is guaranteed to terminate.
  thread_.join();
  // TP_DCHECK(requests_.empty());
}

bool ContextImpl::inLoop() const {
  return loop_.inLoop();
};

// NOTE(review): extraction stripped std::function's template arguments here
// and in requestCopy below — restore from the original file.
void ContextImpl::deferToLoop(std::function fn) {
  loop_.deferToLoop(std::move(fn));
};

// Queues an asynchronous memory copy of `length` bytes from `remotePtr` to
// `localPtr`; `fn` fires on the dedicated copy thread when the copy is done.
void ContextImpl::requestCopy(
    void* remotePtr,
    void* localPtr,
    size_t length,
    std::function fn) {
  // Atomic counter: requests may be issued from outside the loop.
  uint64_t requestId = nextRequestId_++;
  TP_VLOG(4) << "Channel context " << id_ << " received a copy request (#"
             << requestId << ")";
  // Wrap the user callback with entry/exit logging for debuggability.
  fn = [this, requestId, fn{std::move(fn)}](const Error& error) {
    TP_VLOG(4) << "Channel context " << id_
               << " is calling a copy request callback (#" << requestId << ")";
    fn(error);
    TP_VLOG(4) << "Channel context " << id_
               << " done calling a copy request callback (#" << requestId
               << ")";
  };
  requests_.push(CopyRequest{remotePtr, localPtr, length, std::move(fn)});
}

// Main body of the copy thread: pops requests in FIFO order until the
// nullopt sentinel (pushed by handleErrorImpl) is observed.
void ContextImpl::handleCopyRequests() {
  setThreadName("TP_XTH_loop");
  while (true) {
    auto maybeRequest = requests_.pop();
    if (!maybeRequest.has_value()) {
      break;
    }
    CopyRequest request = std::move(maybeRequest).value();
    // Don't even call memcpy on a length of 0 to avoid issues with the pointer
    // possibly being null.
    if (request.length > 0) {
      // Perform copy.
      std::memcpy(request.localPtr, request.remotePtr, request.length);
    }
    request.callback(Error::kSuccess);
  }
}

} // namespace xth
} // namespace channel
} // namespace tensorpipe

================================================
FILE: tensorpipe/channel/xth/context_impl.h
================================================
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
*/ #pragma once #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { namespace xth { class ChannelImpl; class ContextImpl final : public ContextImplBoilerplate { public: static std::shared_ptr create(); explicit ContextImpl( std::unordered_map deviceDescriptors); std::shared_ptr createChannel( std::vector> connections, Endpoint endpoint); size_t numConnectionsNeeded() const override; // Implement the DeferredExecutor interface. bool inLoop() const override; void deferToLoop(std::function fn) override; using copy_request_callback_fn = std::function; void requestCopy( void* remotePtr, void* localPtr, size_t length, copy_request_callback_fn fn); protected: // Implement the entry points called by ContextImplBoilerplate. void handleErrorImpl() override; void joinImpl() override; private: OnDemandDeferredExecutor loop_; struct CopyRequest { void* remotePtr; void* localPtr; size_t length; copy_request_callback_fn callback; }; std::thread thread_; Queue> requests_; // This is atomic because it may be accessed from outside the loop. std::atomic nextRequestId_{0}; void handleCopyRequests(); }; } // namespace xth } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/xth/factory.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include namespace tensorpipe { namespace channel { namespace xth { std::shared_ptr create() { return std::make_shared>(); } } // namespace xth } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/xth/factory.h ================================================ /* * Copyright (c) Meta Platforms, Inc. 
and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include namespace tensorpipe { namespace channel { namespace xth { std::shared_ptr create(); } // namespace xth } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/common/address.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include namespace tensorpipe { std::tuple splitSchemeOfURL(const std::string& url) { std::string::size_type endOfScheme = url.find("://"); if (endOfScheme == std::string::npos) { TP_THROW_EINVAL() << "url has no scheme: " << url; } return std::make_tuple( url.substr(0, endOfScheme), url.substr(endOfScheme + 3)); } } // namespace tensorpipe ================================================ FILE: tensorpipe/common/address.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include namespace tensorpipe { std::tuple splitSchemeOfURL(const std::string& url); } ================================================ FILE: tensorpipe/common/allocator.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/

// NOTE(review): extraction stripped the three #include targets (presumably
// the allocator/defs/error-macro headers) — TODO restore.
#include #include #include

namespace tensorpipe {

// Fixed-pool chunk allocator: hands out equally-sized chunks carved out of a
// caller-owned buffer, queueing allocation callbacks while all chunks are in
// use. Not internally synchronized; callers provide any needed locking.
Allocator::Allocator(uint8_t* data, size_t numChunks, size_t chunkSize)
    : numChunks_(numChunks),
      chunkSize_(chunkSize),
      data_(data),
      chunkAvailable_(numChunks, true) {}

// Destruction closes the allocator, failing any still-queued callbacks.
Allocator::~Allocator() {
  close();
}

// Requests one chunk of at most chunkSize_ bytes; the callback fires with a
// chunk (possibly immediately, from inside this call) once one is free, or
// with AllocatorClosedError if the allocator is (or gets) closed.
void Allocator::alloc(size_t size, TAllocCallback callback) {
  // All chunks are the same size; a request may not exceed it.
  TP_DCHECK(size <= chunkSize_);
  pendingAllocations_.push_back(std::move(callback));
  processAllocations();
}

size_t Allocator::getChunkLength() const {
  return chunkSize_;
}

// Idempotent: marks the allocator closed and fails every queued request.
void Allocator::close() {
  if (closed_) {
    return;
  }
  closed_ = true;
  processAllocations();
}

// Drains the pending queue in FIFO order. When open, it stops at the first
// request that cannot be served (no free chunk); when closed, it fails every
// remaining request.
void Allocator::processAllocations() {
  while (!pendingAllocations_.empty()) {
    auto& callback = pendingAllocations_.front();
    if (closed_) {
      callback(TP_CREATE_ERROR(AllocatorClosedError), nullptr);
    } else {
      TChunk ptr = getAvailableChunk();
      if (!ptr) {
        // No chunk free: leave this and later requests queued.
        break;
      }
      callback(Error::kSuccess, std::move(ptr));
    }
    pendingAllocations_.pop_front();
  }
}

// Linear scan for a free chunk. The returned shared pointer's custom deleter
// routes destruction through releaseChunk, which recycles the slot.
Allocator::TChunk Allocator::getAvailableChunk() {
  for (size_t curChunk = 0; curChunk < numChunks_; ++curChunk) {
    if (chunkAvailable_[curChunk]) {
      chunkAvailable_[curChunk] = false;
      ++allocatedChunks_;
      return TChunk(data_ + curChunk * chunkSize_, [this](uint8_t* ptr) {
        releaseChunk(ptr);
      });
    }
  }
  return nullptr;
}

// Chunk deleter: recovers the chunk index from the pointer's offset into the
// pool, marks it free, and immediately retries queued allocations.
void Allocator::releaseChunk(uint8_t* ptr) {
  size_t chunkId = (ptr - data_) / chunkSize_;
  chunkAvailable_[chunkId] = true;
  --allocatedChunks_;
  processAllocations();
}

} // namespace tensorpipe

================================================
FILE: tensorpipe/common/allocator.h
================================================
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
*/ #pragma once #include #include #include #include #include #include #include namespace tensorpipe { class AllocatorClosedError final : public BaseError { std::string what() const override { return "allocator closed"; } }; class Allocator { public: // Note: this is a std::shared_ptr semantically. A shared_ptr with // array type is supported in C++17 and higher. using TChunk = std::shared_ptr; using TAllocCallback = std::function; explicit Allocator(uint8_t* data, size_t numChunks, size_t chunkSize); ~Allocator(); void alloc(size_t size, TAllocCallback callback); size_t getChunkLength() const; void close(); private: const size_t numChunks_; const size_t chunkSize_; uint8_t* const data_; std::vector chunkAvailable_; size_t allocatedChunks_{0}; std::deque pendingAllocations_; bool closed_{false}; void processAllocations(); TChunk getAvailableChunk(); void releaseChunk(uint8_t* ptr); }; } // namespace tensorpipe ================================================ FILE: tensorpipe/common/buffer.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include #include #include #include #include namespace tensorpipe { class Buffer { class AbstractBufferWrapper { public: virtual Device device() const = 0; virtual void copyConstructInto(void* ptr) const = 0; virtual void moveConstructInto(void* ptr) = 0; virtual ~AbstractBufferWrapper() = default; }; template class BufferWrapper : public AbstractBufferWrapper { static_assert( std::is_trivially_copyable::value, "wrapping non-trivially copyable class"); public: TBuffer buffer; explicit BufferWrapper(TBuffer buffer) : buffer(std::move(buffer)) {} Device device() const override { return buffer.getDevice(); } void copyConstructInto(void* ptr) const override { new (ptr) BufferWrapper(*this); } void moveConstructInto(void* ptr) override { new (ptr) BufferWrapper(std::move(*this)); } }; public: template /* implicit */ Buffer(TBuffer b) { static_assert( sizeof(BufferWrapper) <= kStructSize, "kStructSize too small"); static_assert( alignof(BufferWrapper) <= kStructAlign, "kStructAlign too small"); new (&raw_) BufferWrapper(std::move(b)); } Buffer() : Buffer(CpuBuffer{}) {} Buffer(const Buffer& other) { other.ptr()->copyConstructInto(&raw_); } Buffer& operator=(const Buffer& other) { if (this != &other) { ptr()->~AbstractBufferWrapper(); other.ptr()->copyConstructInto(&raw_); } return *this; } Buffer(Buffer&& other) noexcept { other.ptr()->moveConstructInto(&raw_); } Buffer& operator=(Buffer&& other) { if (this != &other) { ptr()->~AbstractBufferWrapper(); other.ptr()->moveConstructInto(&raw_); } return *this; } ~Buffer() { ptr()->~AbstractBufferWrapper(); } template TBuffer& unwrap() { BufferWrapper* wrapperPtr = dynamic_cast*>(ptr()); if (wrapperPtr == nullptr) { throw std::runtime_error("Invalid unwrapping of tensorpipe::Buffer"); } return wrapperPtr->buffer; } template const TBuffer& unwrap() const { const BufferWrapper* wrapperPtr = dynamic_cast*>(ptr()); if (wrapperPtr == nullptr) { throw std::runtime_error("Invalid unwrapping of 
tensorpipe::Buffer");
    }
    return wrapperPtr->buffer;
  }

  // Device (e.g. CPU vs. CUDA ordinal) of the wrapped buffer, obtained via
  // the type-erased wrapper.
  Device device() const {
    return ptr()->device();
  }

 private:
  // Inline storage for the type-erased wrapper; its size/alignment are
  // validated by the static_asserts in the converting constructor.
  static constexpr int kStructSize = 32;
  static constexpr int kStructAlign = 8;
  // NOTE(review): extraction stripped std::aligned_storage's template
  // arguments (presumably <kStructSize, kStructAlign>) — TODO restore.
  std::aligned_storage::type raw_{};

  const AbstractBufferWrapper* ptr() const {
    // FIXME: Once we go C++17, use std::launder on the returned pointer.
    // NOTE(review): reinterpret_cast also lost its template argument here.
    return reinterpret_cast(&raw_);
  }

  AbstractBufferWrapper* ptr() {
    // FIXME: Once we go C++17, use std::launder on the returned pointer.
    return reinterpret_cast(&raw_);
  }
};

} // namespace tensorpipe

================================================
FILE: tensorpipe/common/busy_polling_loop.h
================================================
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

// NOTE(review): extraction stripped the six #include targets — TODO restore.
#include #include #include #include #include #include

namespace tensorpipe {

// Event-loop executor whose thread busy-polls: it repeatedly calls
// pollOnce() and, when there is nothing to poll, runs deferred functions or
// yields the CPU. Subclasses supply the polling and shutdown predicates.
class BusyPollingLoop : public EventLoopDeferredExecutor {
 protected:
  // One polling pass; returns true if progress was made (keeps spinning hot).
  virtual bool pollOnce() = 0;

  // True once the loop may terminate after stopBusyPolling() was called.
  virtual bool readyToClose() = 0;

  // Requests termination; the loop exits once readyToClose() also holds.
  void stopBusyPolling() {
    closed_ = true;
    // No need to wake up the thread, since it is busy-waiting.
  }

  void eventLoop() override {
    while (!closed_ || !readyToClose()) {
      if (pollOnce()) {
        // continue
      } else if (deferredFunctionCount_ > 0) {
        // Run queued deferred functions and decrement by how many ran.
        deferredFunctionCount_ -= runDeferredFunctionsFromEventLoop();
      } else {
        // Nothing to do: be polite to other threads on this core.
        std::this_thread::yield();
      }
    }
  }

  void wakeupEventLoopToDeferFunction() override {
    ++deferredFunctionCount_;
    // No need to wake up the thread, since it is busy-waiting.
  }

 private:
  // NOTE(review): extraction stripped the std::atomic template arguments
  // (presumably <bool> and an integer type) — TODO restore.
  std::atomic closed_{false};
  std::atomic deferredFunctionCount_{0};
};

} // namespace tensorpipe

================================================
FILE: tensorpipe/common/callback.h
================================================
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
* * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace { // NOTE: This is an incomplete implementation of C++17's `std::apply`. template auto cbApply(F&& f, T&& t, std::index_sequence /*unused*/) { return f(std::get(std::forward(t))...); } template auto cbApply(F&& f, T&& t) { return cbApply( std::move(f), std::forward(t), std::make_index_sequence::value>{}); } } // namespace // A wrapper for a callback that "burns out" after it fires and thus needs to be // rearmed every time. Invocations that are triggered while the callback is // unarmed are stashed and will be delayed until a callback is provided again. template class RearmableCallback { using TFn = std::function; using TStoredArgs = std::tuple::type...>; public: void arm(TFn fn) { if (!args_.empty()) { TStoredArgs args{std::move(args_.front())}; args_.pop_front(); cbApply(std::move(fn), std::move(args)); } else { callbacks_.push_back(std::move(fn)); } } void trigger(Args... args) { if (!callbacks_.empty()) { TFn fn{std::move(callbacks_.front())}; callbacks_.pop_front(); cbApply(std::move(fn), std::tuple(std::forward(args)...)); } else { args_.emplace_back(std::forward(args)...); } } // This method is intended for "flushing" the callback, for example when an // error condition is reached which means that no more callbacks will be // processed but the current ones still must be honored. void triggerAll(std::function()> generator) { while (!callbacks_.empty()) { TFn fn{std::move(callbacks_.front())}; callbacks_.pop_front(); cbApply(std::move(fn), generator()); } } private: std::deque callbacks_; std::deque args_; }; // This class provides some boilerplate that is used by the pipe, the listener // and others when passing a callback to some lower-level component. 
// It will acquire a shared_ptr to the object (thus preventing the object from // being destroyed until the callback has been fired) and in case of error it // will deal with it but it will still end up invoking the actual callback. template class CallbackWrapper { public: CallbackWrapper( std::enable_shared_from_this& subject, DeferredExecutor& loop) : subject_(subject), loop_(loop) {} template auto operator()(TBoundFn fn) { return [this, subject{subject_.shared_from_this()}, fn{std::move(fn)}]( const Error& error, auto&&... args) mutable { this->entryPoint( std::move(subject), std::move(fn), error, std::forward(args)...); }; } private: std::enable_shared_from_this& subject_; DeferredExecutor& loop_; template void entryPoint( std::shared_ptr subject, TBoundFn fn, const Error& error, Args&&... args) { // Do *NOT* move subject into the lambda's closure, as the shared_ptr we're // holding may be the last one keeping subject alive, in which case it would // die once the lambda runs, and it might kill the loop in turn too, _while_ // the loop's deferToLoop method is running. That's bad. So copy it instead. // FIXME We're copying the args here... loop_.deferToLoop( [this, subject, fn{std::move(fn)}, error{error}, args...]() mutable { entryPointFromLoop( *subject, std::move(fn), error, std::forward(args)...); }); } template void entryPointFromLoop( TSubject& subject, TBoundFn fn, const Error& error, Args&&... args) { TP_DCHECK(loop_.inLoop()); subject.setError(error); // Proceed regardless of any error: this is why it's called "eager". fn(subject, std::forward(args)...); } }; } // namespace tensorpipe ================================================ FILE: tensorpipe/common/cpu_buffer.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include namespace tensorpipe { struct CpuBuffer { void* ptr{nullptr}; Device getDevice() const { return Device{kCpuDeviceType, 0}; } }; } // namespace tensorpipe ================================================ FILE: tensorpipe/common/cuda.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include #include #include #include #include #include #include #define TP_CUDA_CHECK(a) \ do { \ cudaError_t error = (a); \ TP_THROW_ASSERT_IF(cudaSuccess != error) \ << __TP_EXPAND_OPD(a) << " " << cudaGetErrorName(error) << " (" \ << cudaGetErrorString(error) << ")"; \ } while (false) namespace tensorpipe { class CudaError final : public BaseError { public: explicit CudaError(cudaError_t error) : error_(error) {} std::string what() const override { return std::string(cudaGetErrorString(error_)); } private: cudaError_t error_; }; class CudaDeviceGuard { public: CudaDeviceGuard() = delete; CudaDeviceGuard(const CudaDeviceGuard&) = delete; CudaDeviceGuard(CudaDeviceGuard&&) = delete; CudaDeviceGuard& operator=(const CudaDeviceGuard&) = delete; CudaDeviceGuard& operator=(CudaDeviceGuard&&) = delete; explicit CudaDeviceGuard(int device) { TP_CUDA_CHECK(cudaGetDevice(&device_)); TP_CUDA_CHECK(cudaSetDevice(device)); } ~CudaDeviceGuard() { TP_CUDA_CHECK(cudaSetDevice(device_)); } private: int device_; }; class CudaEvent { public: CudaEvent() = delete; CudaEvent(const CudaEvent&) = delete; CudaEvent(CudaEvent&&) = delete; CudaEvent& operator=(const CudaEvent&) = delete; CudaEvent& operator=(CudaEvent&&) = delete; explicit CudaEvent(int device, bool interprocess = false) : deviceIdx_(device) { CudaDeviceGuard guard(deviceIdx_); int flags = cudaEventDisableTiming; if (interprocess) { flags |= 
cudaEventInterprocess; } TP_CUDA_CHECK(cudaEventCreateWithFlags(&ev_, flags)); } explicit CudaEvent(int device, cudaIpcEventHandle_t handle) : deviceIdx_(device) { // It could crash if we don't set device when creating events from handles CudaDeviceGuard guard(deviceIdx_); TP_CUDA_CHECK(cudaIpcOpenEventHandle(&ev_, handle)); } void record(cudaStream_t stream) { CudaDeviceGuard guard(deviceIdx_); TP_CUDA_CHECK(cudaEventRecord(ev_, stream)); } void wait(cudaStream_t stream, int device) { CudaDeviceGuard guard(device); TP_CUDA_CHECK(cudaStreamWaitEvent(stream, ev_, 0)); } bool query() const { CudaDeviceGuard guard(deviceIdx_); cudaError_t res = cudaEventQuery(ev_); if (res == cudaErrorNotReady) { return false; } TP_CUDA_CHECK(res); return true; } cudaEvent_t raw() { return ev_; } cudaIpcEventHandle_t getIpcHandle() const { CudaDeviceGuard guard(deviceIdx_); cudaIpcEventHandle_t handle; TP_CUDA_CHECK(cudaIpcGetEventHandle(&handle, ev_)); return handle; } std::string serializedHandle() { cudaIpcEventHandle_t handle = getIpcHandle(); return std::string(reinterpret_cast(&handle), sizeof(handle)); } ~CudaEvent() { CudaDeviceGuard guard(deviceIdx_); TP_CUDA_CHECK(cudaEventDestroy(ev_)); } private: cudaEvent_t ev_; int deviceIdx_; }; inline int cudaDeviceForPointer(const CudaLib& cudaLib, const void* ptr) { // When calling cudaSetDevice(0) when device 0 hasn't been initialized yet // the CUDA runtime sets the current context of the CUDA driver to what's // apparently an invalid non-null value. This causes cudaPointerGetAttributes // to misbehave (possibly other functions too, but this is the only function // that we call outside of a device guard). In fact, device guards are likely // the reason we call cudaSetDevice(0) at all, because at destruction they // reset the current device to the value it had before construction, and that // will be zero if no other device guard was active at that point. 
// The ugly workaround is to manually undo the runtime's errors, by clearing // the driver's current context. In a sense, by creating a "reverse" guard. CUcontext ctx; TP_CUDA_DRIVER_CHECK(cudaLib, cudaLib.ctxGetCurrent(&ctx)); TP_CUDA_DRIVER_CHECK(cudaLib, cudaLib.ctxSetCurrent(nullptr)); int deviceIdx; TP_CUDA_DRIVER_CHECK( cudaLib, cudaLib.pointerGetAttribute( &deviceIdx, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, reinterpret_cast(ptr))); TP_CUDA_DRIVER_CHECK(cudaLib, cudaLib.ctxSetCurrent(ctx)); return deviceIdx; } class CudaPinnedMemoryDeleter { public: explicit CudaPinnedMemoryDeleter(int deviceIdx) : deviceIdx_(deviceIdx) {} void operator()(uint8_t* ptr) { CudaDeviceGuard guard(deviceIdx_); TP_CUDA_CHECK(cudaFreeHost(ptr)); } private: const int deviceIdx_; }; using CudaPinnedBuffer = std::unique_ptr; inline CudaPinnedBuffer makeCudaPinnedBuffer(size_t length, int deviceIdx) { CudaDeviceGuard guard(deviceIdx); uint8_t* ptr; TP_CUDA_CHECK(cudaMallocHost(&ptr, length)); return CudaPinnedBuffer(ptr, CudaPinnedMemoryDeleter(deviceIdx)); } class CudaDeviceBuffer { public: CudaDeviceBuffer() = default; CudaDeviceBuffer(size_t length, int deviceIdx) { CudaDeviceGuard guard(deviceIdx); uint8_t* ptr; TP_CUDA_CHECK(cudaMalloc(&ptr, length)); ptr_ = {ptr, Deleter{deviceIdx}}; } uint8_t* ptr() const { return ptr_.get(); } int deviceIdx() const { return ptr_.get_deleter().deviceIdx; } void reset() { ptr_.reset(); } cudaIpcMemHandle_t getIpcHandle() const { CudaDeviceGuard guard(deviceIdx()); cudaIpcMemHandle_t handle; TP_CUDA_CHECK(cudaIpcGetMemHandle(&handle, ptr_.get())); return handle; } private: struct Deleter { int deviceIdx; void operator()(uint8_t* ptr) { CudaDeviceGuard guard(deviceIdx); TP_CUDA_CHECK(cudaFree(ptr)); } }; std::unique_ptr ptr_; }; class CudaIpcBuffer { public: CudaIpcBuffer() = default; CudaIpcBuffer(int deviceIdx, const cudaIpcMemHandle_t& handle) { CudaDeviceGuard guard(deviceIdx); void* ptr; TP_CUDA_CHECK( cudaIpcOpenMemHandle(&ptr, handle, 
cudaIpcMemLazyEnablePeerAccess)); ptr_ = {reinterpret_cast(ptr), Deleter{deviceIdx}}; } uint8_t* ptr() const { return ptr_.get(); } int deviceIdx() const { return ptr_.get_deleter().deviceIdx; } void reset() { ptr_.reset(); } private: struct Deleter { int deviceIdx; void operator()(uint8_t* ptr) { CudaDeviceGuard guard(deviceIdx); TP_CUDA_CHECK(cudaIpcCloseMemHandle(ptr)); } }; std::unique_ptr ptr_; }; inline std::string getUuidOfDevice(const CudaLib& cudaLib, int deviceIdx) { CUdevice device; TP_CUDA_DRIVER_CHECK(cudaLib, cudaLib.deviceGet(&device, deviceIdx)); CUuuid uuid; TP_CUDA_DRIVER_CHECK(cudaLib, cudaLib.deviceGetUuid(&uuid, device)); // The CUDA driver and NVML choose two different format for UUIDs, hence we // need to reconcile them. We do so using the most human readable format, that // is "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee" (8-4-4-4-12). std::ostringstream uuidSs; uuidSs << std::hex << std::setfill('0'); for (int j = 0; j < 16; ++j) { // The bitmask is required otherwise a negative value will get promoted to // (signed) int with sign extension if char is signed. uuidSs << std::setw(2) << (uuid.bytes[j] & 0xff); if (j == 3 || j == 5 || j == 7 || j == 9) { uuidSs << '-'; } } std::string uuidStr = uuidSs.str(); TP_THROW_ASSERT_IF(!isValidUuid(uuidStr)) << "Couldn't obtain valid UUID for GPU #" << deviceIdx << " from CUDA driver. 
Got: " << uuidStr; return uuidStr; } inline std::vector getUuidsOfVisibleDevices( const CudaLib& cudaLib) { int deviceCount; TP_CUDA_DRIVER_CHECK(cudaLib, cudaLib.deviceGetCount(&deviceCount)); std::vector result(deviceCount); for (int devIdx = 0; devIdx < deviceCount; ++devIdx) { result[devIdx] = getUuidOfDevice(cudaLib, devIdx); } return result; } inline std::vector getCudaDevices(const CudaLib& cudaLib) { int deviceCount; TP_CUDA_DRIVER_CHECK(cudaLib, cudaLib.deviceGetCount(&deviceCount)); std::vector result(deviceCount); for (int devIdx = 0; devIdx < deviceCount; ++devIdx) { result[devIdx] = Device{kCudaDeviceType, devIdx}; } return result; } } // namespace tensorpipe ================================================ FILE: tensorpipe/common/cuda_buffer.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include namespace tensorpipe { Device CudaBuffer::getDevice() const { static CudaLib cudaLib = []() { Error error; CudaLib lib; std::tie(error, lib) = CudaLib::create(); TP_THROW_ASSERT_IF(error) << "Cannot get CUDA device for pointer because libcuda could not be loaded: " << error.what(); return lib; }(); return Device{kCudaDeviceType, cudaDeviceForPointer(cudaLib, ptr)}; } } // namespace tensorpipe ================================================ FILE: tensorpipe/common/cuda_buffer.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include namespace tensorpipe { struct CudaBuffer { void* ptr{nullptr}; cudaStream_t stream{cudaStreamDefault}; Device getDevice() const; }; } // namespace tensorpipe ================================================ FILE: tensorpipe/common/cuda_lib.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #define TP_CUDA_DRIVER_CHECK(cuda_lib, a) \ do { \ CUresult error = (a); \ if (error != CUDA_SUCCESS) { \ CUresult res; \ const char* errorName; \ const char* errorStr; \ res = cuda_lib.getErrorName(error, &errorName); \ TP_THROW_ASSERT_IF(res != CUDA_SUCCESS); \ res = cuda_lib.getErrorString(error, &errorStr); \ TP_THROW_ASSERT_IF(res != CUDA_SUCCESS); \ TP_THROW_ASSERT() << __TP_EXPAND_OPD(a) << " " << errorName << " (" \ << errorStr << ")"; \ } \ } while (false) namespace tensorpipe { class NoDevicesError final : public BaseError { public: std::string what() const override { return "The CUDA driver failed to init because it didn't find any device"; } }; // Master list of all symbols we care about from libcuda. #define TP_FORALL_CUDA_SYMBOLS(_) \ _(ctxGetCurrent, cuCtxGetCurrent, (CUcontext*)) \ _(ctxSetCurrent, cuCtxSetCurrent, (CUcontext)) \ _(deviceGet, cuDeviceGet, (CUdevice*, int)) \ _(deviceGetCount, cuDeviceGetCount, (int*)) \ _(deviceGetUuid, cuDeviceGetUuid, (CUuuid*, CUdevice)) \ _(getErrorName, cuGetErrorName, (CUresult, const char**)) \ _(getErrorString, cuGetErrorString, (CUresult, const char**)) \ _(init, cuInit, (unsigned int)) \ _(memGetAddressRange_v2, \ cuMemGetAddressRange_v2, \ (CUdeviceptr*, size_t*, CUdeviceptr)) \ _(pointerGetAttribute, \ cuPointerGetAttribute, \ (void*, CUpointer_attribute, CUdeviceptr)) // Wrapper for libcuda. 
class CudaLib { private: explicit CudaLib(DynamicLibraryHandle dlhandle) : dlhandle_(std::move(dlhandle)) {} DynamicLibraryHandle dlhandle_; #define TP_DECLARE_FIELD(method_name, function_name, args_types) \ CUresult(*function_name##_ptr_) args_types = nullptr; TP_FORALL_CUDA_SYMBOLS(TP_DECLARE_FIELD) #undef TP_DECLARE_FIELD public: CudaLib() = default; #define TP_FORWARD_CALL(method_name, function_name, args_types) \ template \ auto method_name(Args&&... args) const { \ return (*function_name##_ptr_)(std::forward(args)...); \ } TP_FORALL_CUDA_SYMBOLS(TP_FORWARD_CALL) #undef TP_FORWARD_CALL static std::tuple create() { Error error; DynamicLibraryHandle dlhandle; // To keep things "neat" and contained, we open in "local" mode (as // opposed to global) so that the cuda symbols can only be resolved // through this handle and are not exposed (a.k.a., "leaked") to other // shared objects. std::tie(error, dlhandle) = DynamicLibraryHandle::create("libcuda.so.1", RTLD_LOCAL | RTLD_LAZY); if (error) { return std::make_tuple(std::move(error), CudaLib()); } // Log at level 9 as we can't know whether this will be used in a transport // or channel, thus err on the side of this being as low-level as possible // because we don't expect this to be of interest that often. 
TP_VLOG(9) << [&]() -> std::string { std::string filename; std::tie(error, filename) = dlhandle.getFilename(); if (error) { return "Couldn't determine location of shared library libcuda.so.1: " + error.what(); } return "Found shared library libcuda.so.1 at " + filename; }(); CudaLib lib(std::move(dlhandle)); #define TP_LOAD_SYMBOL(method_name, function_name, args_types) \ { \ void* ptr; \ std::tie(error, ptr) = lib.dlhandle_.loadSymbol(#function_name); \ if (error) { \ return std::make_tuple(std::move(error), CudaLib()); \ } \ TP_THROW_ASSERT_IF(ptr == nullptr); \ lib.function_name##_ptr_ = \ reinterpret_cast(ptr); \ } TP_FORALL_CUDA_SYMBOLS(TP_LOAD_SYMBOL) #undef TP_LOAD_SYMBOL CUresult result = lib.init(0); // If the driver doesn't find any devices it fails to init (beats me why) // but we must support this case, by disabling the channels, rather than // throwing. Hence we treat it as if we couldn't find the driver. if (result == CUDA_ERROR_NO_DEVICE) { return std::make_tuple(TP_CREATE_ERROR(NoDevicesError), CudaLib()); } TP_CUDA_DRIVER_CHECK(lib, result); return std::make_tuple(Error::kSuccess, std::move(lib)); } CUresult memGetAddressRange( CUdeviceptr* pbase, size_t* psize, CUdeviceptr dptr) const { // NOTE: We are forwarding to cuMemGetAddressRange_v2() directly, because // the name cuMemGetAddressRange is #defined to its _v2 variant in cuda.h. // Calling the actual cuMemGetAddressRange() function here would lead to a // CUDA_ERROR_INVALID_CONTEXT. return memGetAddressRange_v2(pbase, psize, dptr); } }; #undef TP_FORALL_CUDA_SYMBOLS } // namespace tensorpipe ================================================ FILE: tensorpipe/common/cuda_loop.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include namespace tensorpipe { namespace { struct CudaCallback { CudaLoop& loop; std::function callback; CudaCallback(CudaLoop& loop, std::function callback) : loop(loop), callback(std::move(callback)) {} }; class CudaLoopClosedError final : public BaseError { std::string what() const override { return "CUDA loop already closed"; } }; } // namespace CudaLoop::CudaLoop() { thread_ = std::thread([this]() { setThreadName("TP_CUDA_callback_loop"); processCallbacks(); }); } CudaLoop::~CudaLoop() { join(); } void CudaLoop::join() { close(); if (!joined_.exchange(true)) { thread_.join(); } } void CudaLoop::close() { std::unique_lock lock(mutex_); if (closed_) { return; } closed_ = true; cv_.notify_all(); } void CudaLoop::processCallbacks() { for (;;) { std::deque operations; { std::unique_lock lock(mutex_); if (operations_.empty()) { if (closed_ && pendingOperations_ == 0) { break; } else { cv_.wait(lock); } } std::swap(operations, operations_); pendingOperations_ -= operations.size(); } for (auto& op : operations) { op.callback(op.error); } } } void CudaLoop::addCallback( int device, cudaStream_t stream, std::function callback) { { std::unique_lock lock(mutex_); if (closed_) { callback(TP_CREATE_ERROR(CudaLoopClosedError)); return; } ++pendingOperations_; } auto cudaCallback = std::make_unique(*this, std::move(callback)); CudaDeviceGuard guard(device); TP_CUDA_CHECK(cudaStreamAddCallback( stream, runCudaCallback, cudaCallback.release(), 0)); } void CUDART_CB CudaLoop::runCudaCallback( cudaStream_t /* unused */, cudaError_t cudaError, void* callbackPtr) { std::unique_ptr cudaCallback( reinterpret_cast(callbackPtr)); CudaLoop& loop = cudaCallback->loop; { std::unique_lock lock(loop.mutex_); auto error = Error::kSuccess; if (cudaError != cudaSuccess) { error = TP_CREATE_ERROR(CudaError, cudaError); } loop.operations_.push_back( {std::move(cudaCallback->callback), std::move(error)}); loop.cv_.notify_all(); } cudaCallback.reset(); } } // namespace 
tensorpipe ================================================ FILE: tensorpipe/common/cuda_loop.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include #include #include namespace tensorpipe { class CudaLoop { struct Operation { std::function callback; Error error; }; public: CudaLoop(); ~CudaLoop(); void join(); void close(); void addCallback( int device, cudaStream_t stream, std::function callback); private: std::thread thread_; std::deque operations_; std::mutex mutex_; std::condition_variable cv_; uint64_t pendingOperations_{0}; bool closed_{false}; std::atomic joined_{false}; void processCallbacks(); // Proxy static method for cudaStreamAddCallback(), which does not accept // lambdas. static void CUDART_CB runCudaCallback( cudaStream_t stream, cudaError_t cudaError, void* callbackPtr); }; } // namespace tensorpipe ================================================ FILE: tensorpipe/common/deferred_executor.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { // Dealing with thread-safety using per-object mutexes is prone to deadlocks // because of reentrant calls (both "upward", when invoking a callback that // calls back into a method of the object, and "downward", when passing a // callback to an operation of another object that calls it inline) and lock // inversions (object A calling a method of object B and attempting to acquire // its lock, with the reverse happening at the same time). Using a "loop" model, // where operations aren't called inlined and piled up on the stack but instead // deferred to a later iteration of the loop, solves many of these issues. This // abstract interface defines the essential methods we need such event loops to // provide. class DeferredExecutor { public: using TTask = std::function; virtual void deferToLoop(TTask fn) = 0; virtual bool inLoop() const = 0; // Prefer using deferToLoop over runInLoop when you don't need to wait for the // result. template void runInLoop(F&& fn) { // When called from the event loop thread itself (e.g., from a callback), // deferring would cause a deadlock because the given callable can only be // run when the loop is allowed to proceed. On the other hand, it means it // is thread-safe to run it immediately. The danger here however is that it // can lead to an inconsistent order between operations run from the event // loop, from outside of it, and deferred. if (inLoop()) { fn(); } else { // Must use a copyable wrapper around std::promise because // we use it from a std::function which must be copyable. auto promise = std::make_shared>(); auto future = promise->get_future(); // Marked as mutable because the fn might hold some state (e.g., the // closure of a lambda) which it might want to modify. deferToLoop([promise, fn{std::forward(fn)}]() mutable { try { fn(); promise->set_value(); } catch (...) 
{ promise->set_exception(std::current_exception()); } }); future.get(); } } virtual ~DeferredExecutor() = default; }; // Transports typically have their own thread they can use as deferred executors // but many objects (like pipes) don't naturally own threads and introducing // them would also mean introducing latency costs due to context switching. // In order to give these objects a loop they can use to defer their operations // to, we can have them temporarily hijack the calling thread and repurpose it // to run an ephemeral loop on which to run the original task and all the ones // that a task running on the loop chooses to defer to a later iteration of the // loop, recursively. Once all these tasks have been completed, the makeshift // loop is dismantled and control of the thread is returned to the caller. // FIXME Rename this to OnDemandDeferredExecutor? class OnDemandDeferredExecutor : public DeferredExecutor { public: bool inLoop() const override { // If the current thread is already holding the lock (i.e., it's already in // this function somewhere higher up in the stack) then this check won't // race and we will detect it correctly. If this is not the case, then this // check may race with another thread, but that's nothing to worry about // because in either case the outcome will be negative. 
return currentLoop_ == std::this_thread::get_id(); } void deferToLoop(TTask fn) override { { std::unique_lock lock(mutex_); pendingTasks_.push_back(std::move(fn)); if (currentLoop_ != std::thread::id()) { return; } currentLoop_ = std::this_thread::get_id(); } while (true) { TTask task; { std::unique_lock lock(mutex_); if (pendingTasks_.empty()) { currentLoop_ = std::thread::id(); return; } task = std::move(pendingTasks_.front()); pendingTasks_.pop_front(); } task(); } } private: std::mutex mutex_; std::atomic currentLoop_{std::thread::id()}; std::deque pendingTasks_; }; class EventLoopDeferredExecutor : public virtual DeferredExecutor { public: void deferToLoop(TTask fn) override { { std::unique_lock lock(mutex_); if (likely(isThreadConsumingDeferredFunctions_)) { fns_.push_back(std::move(fn)); wakeupEventLoopToDeferFunction(); return; } } // Must call it without holding the lock, as it could cause a reentrant // call. onDemandLoop_.deferToLoop(std::move(fn)); } inline bool inLoop() const override { { std::unique_lock lock(mutex_); if (likely(isThreadConsumingDeferredFunctions_)) { return std::this_thread::get_id() == thread_.get_id(); } } return onDemandLoop_.inLoop(); } protected: // This is the actual long-running event loop, which is implemented by // subclasses and called inside the thread owned by this parent class. virtual void eventLoop() = 0; // This is called after the event loop terminated, still within the thread // that used to run that event loop. It will be called after this class has // transitioned control to the on-demand deferred executor. It thus allows to // clean up any resources without worrying about new work coming in. virtual void cleanUpLoop() {} // This function is called by the parent class when a function is deferred to // it, and must be implemented by subclasses, which are required to have their // event loop call runDeferredFunctionsFromEventLoop as soon as possible. 
This // function is guaranteed to be called once per function deferral (in case // subclasses want to keep count). virtual void wakeupEventLoopToDeferFunction() = 0; // Called by subclasses to have the parent class start the thread. We cannot // implicitly call this in the parent class's constructor because it could // lead to a race condition between the event loop (run by the thread) and the // subclass's constructor (which is executed after the parent class's one). // Hence this method should be invoked at the end of the subclass constructor. void startThread(std::string threadName) { // FIXME Once we've fixed the viability (by having a factory function return // a nullptr, instead of having a method on the context), remove this, and // instead add a safety check in deferToLoop that ensures that within the // isThreadConsumingDeferredFunctions_ branch the thread is joinable, i.e., // up and still running. { std::unique_lock lock(mutex_); TP_DCHECK(!isThreadConsumingDeferredFunctions_); TP_DCHECK(!thread_.joinable()); TP_DCHECK(fns_.empty()); isThreadConsumingDeferredFunctions_ = true; } thread_ = std::thread( &EventLoopDeferredExecutor::loop, this, std::move(threadName)); } // This is basically the reverse operation of the above, and is needed for the // same (reversed) reason. Note that this only waits for the thread to finish: // the subclass must have its own way of telling its event loop to stop and // return control. void joinThread() { thread_.join(); } // Must be called by the subclass after it was woken up. Even if multiple // functions were deferred, this method only needs to be called once. However, // care must be taken to avoid races between this call and new wakeups. This // method also returns the number of functions it executed, in case the // subclass is keeping count. 
size_t runDeferredFunctionsFromEventLoop() { decltype(fns_) fns; { std::unique_lock lock(mutex_); std::swap(fns, fns_); } for (auto& fn : fns) { fn(); } return fns.size(); } private: void loop(std::string threadName) { setThreadName(std::move(threadName)); eventLoop(); // The loop is winding down and "handing over" control to the on demand // loop. But it can only do so safely once there are no pending deferred // functions, as otherwise those may risk never being executed. while (true) { decltype(fns_) fns; { std::unique_lock lock(mutex_); if (fns_.empty()) { isThreadConsumingDeferredFunctions_ = false; break; } std::swap(fns, fns_); } for (auto& fn : fns) { fn(); } } cleanUpLoop(); } std::thread thread_; // Whether the thread is taking care of running the deferred functions // // This is part of what can only be described as a hack. Sometimes, even when // using the API as intended, objects try to defer tasks to the loop after // that loop has been closed and joined. Since those tasks may be lambdas that // captured shared_ptrs to the objects in their closures, this may lead to a // reference cycle and thus a leak. Our hack is to have this flag to record // when we can no longer defer tasks to the loop and in that case we just run // those tasks inline. In order to keep ensuring the single-threadedness // assumption of our model (which is what we rely on to be safe from race // conditions) we use an on-demand loop. This flag starts as false as in some // cases (like non-viable transports) the thread may never be started and thus // we want the on-demand loop to be engaged from the beginning. bool isThreadConsumingDeferredFunctions_{false}; OnDemandDeferredExecutor onDemandLoop_; // Mutex to guard the deferring and the running of functions. mutable std::mutex mutex_; // List of deferred functions to run when the loop is ready. 
std::vector> fns_; }; } // namespace tensorpipe ================================================ FILE: tensorpipe/common/defs.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include #include #include #include #include #include // Branch hint macros. C++20 will include them as part of language. #define likely(x) __builtin_expect((x) ? 1 : 0, 1) #define unlikely(x) __builtin_expect((x) ? 1 : 0, 0) /// Auxiliar class to build exception, fill up it's what message and throw /// in a single line. Usually uses as r-value so that destructor is called /// at end of line that created it, throwing the desired exception. /// (See TP_THROW). namespace tensorpipe { template class ExceptionThrower final { public: template ExceptionThrower(TArgs&&... nonWhat) { exBuilder_ = [&](const std::string& what) { return TException(std::move(nonWhat)..., what); }; } // Throw exception on destructor, when l-value instance goes of scope // and stream has been written. Use noexcept(false) to inform the compiler // that it's ok to throw in destructor. ~ExceptionThrower() noexcept(false) { throw exBuilder_(oss_.str() + "\""); } std::ostream& getStream() { return oss_; } protected: std::function exBuilder_; std::ostringstream oss_; }; } // namespace tensorpipe // // Macros to throw commonly used exceptions. // #define TP_STRINGIFY(s) #s #define TP_EXPAND_TO_STR(s) TP_STRINGIFY(s) // Strip all leading components up to the *last* occurrence of "tensorpipe/". // This removes all the system-specific prefixes added by the compiler. 
#define TP_TRIM_FILENAME(s) \ [](const char* filename) -> const char* { \ while (true) { \ const char* match = std::strstr(filename + 1, "tensorpipe/"); \ if (match == nullptr) { \ break; \ } \ filename = match; \ } \ return filename; \ }(s) #define TP_LOG_LOC \ TP_TRIM_FILENAME(__FILE__) << ":" << TP_EXPAND_TO_STR(__LINE__) #define TP_LOG_PREFFIX "In " << __func__ << " at " << TP_LOG_LOC #define TP_THROW(ex_type, ...) \ ::tensorpipe::ExceptionThrower(__VA_ARGS__).getStream() \ << TP_LOG_PREFFIX << " \"" #define TP_THROW_EINVAL() TP_THROW(std::invalid_argument) #define TP_THROW_SYSTEM(err) \ TP_THROW(std::system_error, err, std::system_category()) #define TP_THROW_SYSTEM_IF(cond, err) \ if (unlikely(cond)) \ TP_THROW_SYSTEM(err) #define TP_THROW_SYSTEM_CODE(err) TP_THROW(std::system_error, err) #define TP_THROW_SYSTEM_CODE_IF(cond, err) \ if (unlikely(cond)) \ TP_THROW_SYSTEM_CODE(err) << TP_STRINGIFY(cond) #define TP_THROW_ASSERT() TP_THROW(std::runtime_error) #define TP_THROW_ASSERT_IF(cond) \ if (unlikely(cond)) \ TP_THROW_ASSERT() << TP_STRINGIFY(cond) // Conditional throwing exception #define TP_THROW_IF_NULLPTR(ptr) \ if (unlikely(ptr == nullptr)) \ TP_THROW_EINVAL() << TP_STRINGIFY(ptr) << " has nullptr value" // Safe-cast to std::error_code namespace tensorpipe { inline std::error_code toErrorCode(ssize_t e) { if (unlikely(e <= 0)) { TP_THROW_EINVAL() << "Error not a positive number. " << "Is this value really an error?"; } else if (unlikely(e > std::numeric_limits::max())) { TP_THROW_EINVAL() << "Error out of range. Is this really an error?"; } return {static_cast(e), std::system_category()}; } } // namespace tensorpipe // // Simple logging to stderr. This macros can be replaced if a more // sophisticated logging is used in the future. // Currently, tensorpipe is meant be used as shared library and to use // exceptions for error handling, so the need for logging in // the library is reduced. 
namespace tensorpipe {

// RAII helper that builds up one log line in an in-memory buffer and emits it
// to stderr on destruction, so that a whole statement's worth of << operators
// produces a single write (see the comment in the destructor).
class LogEntry final {
 public:
  // `type` is the glog-style severity letter (V/I/W/E) that prefixes the line.
  explicit LogEntry(char type) {
    oss_ << type;
    // In C++17 use std::timespec.
    struct timeval tv;
    // In C++17 use std::timespec_get.
    gettimeofday(&tv, nullptr);
    struct std::tm tm;
    // Need to use localtime_r as std::localtime may not be thread-safe.
    localtime_r(&tv.tv_sec, &tm);
    oss_ << std::setfill('0') << std::setw(2) << 1 + tm.tm_mon << std::setw(2)
         << tm.tm_mday << ' ' << std::setw(2) << tm.tm_hour << ':'
         << std::setw(2) << tm.tm_min << ':' << std::setw(2) << tm.tm_sec
         << '.' << std::setw(6) << tv.tv_usec;
    // The glog format uses the thread ID but it's painful to get (there is a
    // gettid syscall, but it's not exposed in glibc) so we use the PID instead.
    oss_ << ' ' << std::setfill(' ') << std::setw(5) << getpid();
  }

  ~LogEntry() noexcept {
    // Multiple threads or processes writing to the same log (e.g., stderr)
    // might lead to interleaved text and thus garbled output. It seems that a
    // single write syscall is "rather" atomic so instead of issuing a separate
    // write for the trailing newline we append it to the message and write
    // them together.
    oss_ << std::endl;
    std::cerr << oss_.str();
  }

  // Stream to which the user's message is appended.
  std::ostream& getStream() {
    return oss_;
  }

 protected:
  std::ostringstream oss_;
};

} // namespace tensorpipe

#define TP_LOG_DEBUG() \
  ::tensorpipe::LogEntry('V').getStream() << ' ' << TP_LOG_LOC << "] "
#define TP_LOG_INFO() \
  ::tensorpipe::LogEntry('I').getStream() << ' ' << TP_LOG_LOC << "] "
#define TP_LOG_WARNING() \
  ::tensorpipe::LogEntry('W').getStream() << ' ' << TP_LOG_LOC << "] "
#define TP_LOG_ERROR() \
  ::tensorpipe::LogEntry('E').getStream() << ' ' << TP_LOG_LOC << "] "

#define TP_LOG_DEBUG_IF(cond) \
  if (unlikely(cond))         \
  TP_LOG_DEBUG()
#define TP_LOG_INFO_IF(cond) \
  if (unlikely(cond))        \
  TP_LOG_INFO()
#define TP_LOG_WARNING_IF(cond) \
  if (unlikely(cond))           \
  TP_LOG_WARNING()
#define TP_LOG_ERROR_IF(cond) \
  if (unlikely(cond))         \
  TP_LOG_ERROR()

// Renders an operand as `name(value)`, e.g. `x(42)`.
#define __TP_EXPAND_OPD(opd) TP_STRINGIFY(opd) << "(" << (opd) << ")"

//
// Debug checks.
// Note that non-debug checks are not provided because developers
// must handle all errors explicitly.
//

#define __TP_DCHECK(a) \
  if (unlikely(!((a)))) \
  TP_THROW_ASSERT() << "Expected true for " << __TP_EXPAND_OPD(a)

#define __TP_DCHECK_CMP(a, b, op)                        \
  if (unlikely(!((a)op(b))))                             \
  TP_THROW_ASSERT() << "Expected " << __TP_EXPAND_OPD(a) \
                    << " " TP_STRINGIFY(op) << " " << __TP_EXPAND_OPD(b)

// Expand macro only in debug mode.
#ifdef NDEBUG

#define _TP_DLOG() \
  while (false)    \
  TP_LOG_DEBUG()

#define _TP_DCHECK(a) \
  while (false)       \
  __TP_DCHECK(a)

#define _TP_DCHECK_CMP(a, b, op) \
  while (false)                  \
  __TP_DCHECK_CMP(a, b, op)

#else

#define _TP_DLOG() TP_LOG_DEBUG()

#define _TP_DCHECK(a) __TP_DCHECK(a)

#define _TP_DCHECK_CMP(a, b, op) __TP_DCHECK_CMP(a, b, op)

#endif

// Public API for debug logging.
#define TP_DLOG() _TP_DLOG()

// Public API for debug checks.
#define TP_DCHECK(a) _TP_DCHECK(a)
#define TP_DCHECK_EQ(a, b) _TP_DCHECK_CMP(a, b, ==)
#define TP_DCHECK_NE(a, b) _TP_DCHECK_CMP(a, b, !=)
#define TP_DCHECK_LT(a, b) _TP_DCHECK_CMP(a, b, <)
#define TP_DCHECK_LE(a, b) _TP_DCHECK_CMP(a, b, <=)
#define TP_DCHECK_GT(a, b) _TP_DCHECK_CMP(a, b, >)
#define TP_DCHECK_GE(a, b) _TP_DCHECK_CMP(a, b, >=)

//
// Verbose logging.
// Some logging is helpful to diagnose tricky production issues but is too
// verbose to keep on all the time. It also should not be controlled by the
// debug flags, as we want to allow it to be enabled in production builds.
//
// The level of each TP_VLOG call should reflect where the object issuing it is
// located in the stack, and whether it's a call that involves handling
// requests from objects higher up, or issuing requests to objects lower down.
// This brings us to the following classification:
// - level 1 is for requests that core classes receive from the user
// - level 2 is for generic core classes stuff
// - level 3 is for requests that core classes issue to channels/transports
// - level 4 is for requests that channels receive from core classes
// - level 5 is for generic channels stuff
// - level 6 is for requests that channels issue to transports
// - level 7 is for requests that transports receive from core classes/channels
// - level 8 is for generic transports stuff
// - level 9 is for how transports deal with system resources

namespace tensorpipe {

// Parses the TP_VERBOSE_LOGGING environment variable (0 when unset).
inline unsigned long getVerbosityLevelInternal() {
  char* levelStr = std::getenv("TP_VERBOSE_LOGGING");
  if (levelStr == nullptr) {
    return 0;
  }
  return std::strtoul(levelStr, /*str_end=*/nullptr, /*base=*/10);
}

// The environment variable is read once and cached for the process lifetime.
inline unsigned long getVerbosityLevel() {
  static unsigned long level = getVerbosityLevelInternal();
  return level;
}

} // namespace tensorpipe

#define TP_VLOG(level) TP_LOG_DEBUG_IF(level <= getVerbosityLevel())

//
// Argument checks
//

#define TP_ARG_CHECK(a) \
  if (unlikely(!((a)))) \
  TP_THROW_EINVAL() << "Expected argument to be true: " << __TP_EXPAND_OPD(a)

// FIX: this used to stringify the literal token `_op_` instead of the macro
// parameter `op`, so error messages printed "_op_" rather than the actual
// comparison operator (cf. the correct __TP_DCHECK_CMP above).
#define _TP_ARG_CMP(a, b, op)                             \
  if (unlikely(!((a)op(b))))                              \
  TP_THROW_EINVAL() << "Expected argument " << __TP_EXPAND_OPD(a) \
                    << " " TP_STRINGIFY(op) << " " << __TP_EXPAND_OPD(b)

#define TP_ARG_CHECK_EQ(a, b) _TP_ARG_CMP(a, b, ==)
#define TP_ARG_CHECK_NE(a, b) _TP_ARG_CMP(a, b, !=)
#define TP_ARG_CHECK_LT(a, b) _TP_ARG_CMP(a, b, <)
#define TP_ARG_CHECK_LE(a, b) _TP_ARG_CMP(a, b, <=)
#define TP_ARG_CHECK_GT(a, b) _TP_ARG_CMP(a, b, >)
#define TP_ARG_CHECK_GE(a, b) _TP_ARG_CMP(a, b, >=)

// Define DEXCEPT macro that is noexcept only in debug mode.
#ifdef NDEBUG
#define DEXCEPT noexcept(true)
#else
#define DEXCEPT noexcept(false)
#endif

#define TP_LOG_EXCEPTION(e)                          \
  TP_LOG_ERROR() << "Exception in " << __FUNCTION__ \
                 << " . Message: " << e.what()

// ============================================================
// FILE: tensorpipe/common/device.h
// ============================================================

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

// NOTE(review): header names were stripped by extraction; restored to the set
// this file actually needs.
#include <cstddef>
#include <functional>
#include <sstream>
#include <string>
#include <utility>

namespace tensorpipe {

const std::string kCpuDeviceType{"cpu"};
const std::string kCudaDeviceType{"cuda"};

// Identifies a device as a (type, index) pair, e.g. ("cuda", 0).
struct Device {
  std::string type;
  int index;

  // This pointless constructor is needed to work around a bug in GCC 5.5 (and
  // possibly other versions). It appears to be needed in the nop types that
  // are used inside nop::Optional.
  Device() {}

  Device(std::string type, int index) : type(std::move(type)), index(index) {}

  // Renders the device as "<type>:<index>", e.g. "cuda:0".
  std::string toString() const {
    std::stringstream ss;
    ss << type << ":" << index;
    return ss.str();
  }

  bool operator==(const Device& other) const {
    return type == other.type && index == other.index;
  }
};

} // namespace tensorpipe

namespace std {

template <>
struct hash<::tensorpipe::Device> {
  size_t operator()(const ::tensorpipe::Device& device) const noexcept {
    // Delegate to the string hash of the canonical "<type>:<index>" form.
    return std::hash<std::string>{}(device.toString());
  }
};

template <>
struct hash<std::pair<::tensorpipe::Device, ::tensorpipe::Device>> {
  size_t operator()(const std::pair<::tensorpipe::Device, ::tensorpipe::Device>&
                        p) const noexcept {
    size_t h1 = std::hash<::tensorpipe::Device>{}(p.first);
    size_t h2 = std::hash<::tensorpipe::Device>{}(p.second);
    // Shifting one hash to avoid collisions between (a, b) and (b, a).
    return h1 ^ (h2 << 1);
  }
};

} // namespace std

// ============================================================
// FILE: tensorpipe/common/dl.h
// ============================================================

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */
/* End of the dl.h file header (BSD-style license, continued above). */

#pragma once

// NOTE(review): the header names of the following #include directives were
// lost when this extract was generated; the directives are kept verbatim.
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

namespace tensorpipe {

// Error subtype carrying the message returned by dlerror(3).
class DlError final : public BaseError {
 public:
  explicit DlError(char* error) : error_(error) {}

  std::string what() const override {
    return error_;
  }

 private:
  std::string error_;
};

// RAII wrapper around a dlopen(3) handle. A default-constructed instance
// holds no handle; an owned handle is released with dlclose(3) when the last
// copy goes away (ownership is shared through the internal smart pointer).
class DynamicLibraryHandle {
 public:
  DynamicLibraryHandle() = default;

  // Opens the given shared object with dlopen(3). On failure the returned
  // handle is empty and the error wraps the dlerror(3) message.
  static std::tuple create(
      const char* filename,
      int flags) {
    void* ptr = ::dlopen(filename, flags);
    if (ptr == nullptr) {
      return std::make_tuple(
          TP_CREATE_ERROR(DlError, ::dlerror()), DynamicLibraryHandle());
    }
    return std::make_tuple(Error::kSuccess, DynamicLibraryHandle(ptr));
  }

  // Whether this object currently wraps an open library handle.
  bool hasValue() const {
    return ptr_ != nullptr;
  }

  // Looks up a symbol in the library with dlsym(3).
  std::tuple loadSymbol(const char* name) {
    // Since dlsym doesn't return a specific value to signal errors (because
    // NULL is a valid return value), we need to detect errors by calling
    // dlerror and checking whether it returns a string or not (i.e., NULL).
    // But in order to do so, we must first reset the error, in case one was
    // already recorded.
    ::dlerror();
    void* ptr = ::dlsym(ptr_.get(), name);
    char* err = ::dlerror();
    if (err != nullptr) {
      return std::make_tuple(TP_CREATE_ERROR(DlError, err), nullptr);
    }
    return std::make_tuple(Error::kSuccess, ptr);
  }

  // Returns the canonicalized filesystem path of the loaded library, obtained
  // through dlinfo(3)'s RTLD_DI_LINKMAP request plus realpath(3).
  std::tuple getFilename() {
    struct link_map* linkMap;
    int rv = ::dlinfo(ptr_.get(), RTLD_DI_LINKMAP, &linkMap);
    if (rv < 0) {
      return std::make_tuple(
          TP_CREATE_ERROR(DlError, ::dlerror()), std::string());
    }
    std::array path;
    char* resolvedPath = ::realpath(linkMap->l_name, path.data());
    if (resolvedPath == nullptr) {
      return std::make_tuple(
          TP_CREATE_ERROR(SystemError, "realpath", errno), std::string());
    }
    TP_DCHECK(resolvedPath == path.data());
    return std::make_tuple(Error::kSuccess, std::string(path.data()));
  }

 private:
  // Deleter for the smart pointer: closes the library, asserting on failure.
  struct Deleter {
    void operator()(void* ptr) {
      int res = ::dlclose(ptr);
      TP_THROW_ASSERT_IF(res != 0) << "dlclose() failed: " << ::dlerror();
    }
  };

  DynamicLibraryHandle(void* ptr) : ptr_(ptr, Deleter{}) {}

  std::unique_ptr ptr_;
};

} // namespace tensorpipe

// ============================================================
// FILE: tensorpipe/common/epoll_loop.cc
// ============================================================

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

// NOTE(review): #include header names stripped by extraction.
#include
#include
#include

namespace tensorpipe {

// Creates the epoll fd and the wakeup eventfd, registers the eventfd with
// epoll under the reserved record 0, and starts the dedicated epoll thread.
EpollLoop::EpollLoop(DeferredExecutor& deferredExecutor)
    : deferredExecutor_(deferredExecutor) {
  {
    auto rv = ::epoll_create(1);
    TP_THROW_SYSTEM_IF(rv == -1, errno);
    epollFd_ = Fd(rv);
  }
  {
    auto rv = ::eventfd(0, EFD_NONBLOCK);
    TP_THROW_SYSTEM_IF(rv == -1, errno);
    eventFd_ = Fd(rv);
  }

  // Register the eventfd with epoll.
  {
    struct epoll_event ev;
    ev.events = EPOLLIN;
    ev.data.u64 = 0;
    auto rv = ::epoll_ctl(epollFd_.fd(), EPOLL_CTL_ADD, eventFd_.fd(), &ev);
    TP_THROW_SYSTEM_IF(rv == -1, errno);
  }

  // Start epoll(2) thread.
  thread_ = std::thread(&EpollLoop::loop, this);
}

// Idempotent: only the first call flips closed_ and wakes the loop.
void EpollLoop::close() {
  if (!closed_.exchange(true)) {
    wakeup();
  }
}

// Idempotent: closes the loop and joins the epoll thread exactly once.
void EpollLoop::join() {
  close();

  if (!joined_.exchange(true)) {
    thread_.join();
  }
}

EpollLoop::~EpollLoop() {
  join();

  // Unregister the eventfd with epoll.
  {
    auto rv = ::epoll_ctl(epollFd_.fd(), EPOLL_CTL_DEL, eventFd_.fd(), nullptr);
    TP_THROW_SYSTEM_IF(rv == -1, errno);
  }
}

void EpollLoop::registerDescriptor(
    int fd,
    int events,
    std::shared_ptr h) {
  TP_DCHECK(deferredExecutor_.inLoop());
  std::lock_guard lock(handlersMutex_);

  // Each (re-)registration gets a fresh record so that stale events reported
  // for an earlier registration of the same fd can be recognized and dropped;
  // see the long comment in epoll_loop.h.
  uint64_t record = nextRecord_++;

  struct epoll_event ev;
  ev.events = events;
  ev.data.u64 = record;

  auto fdIter = fdToRecord_.find(fd);
  if (fdIter == fdToRecord_.end()) {
    // First registration of this fd.
    fdToRecord_.emplace(fd, record);
    recordToHandler_.emplace(record, h);
    auto rv = ::epoll_ctl(epollFd_.fd(), EPOLL_CTL_ADD, fd, &ev);
    TP_THROW_SYSTEM_IF(rv == -1, errno);
  } else {
    // Re-registration: swap in the new record and handler, modify in place.
    uint64_t oldRecord = fdIter->second;
    fdIter->second = record;
    recordToHandler_.erase(oldRecord);
    recordToHandler_.emplace(record, h);
    auto rv = ::epoll_ctl(epollFd_.fd(), EPOLL_CTL_MOD, fd, &ev);
    TP_THROW_SYSTEM_IF(rv == -1, errno);
  }
}

void EpollLoop::unregisterDescriptor(int fd) {
  TP_DCHECK(deferredExecutor_.inLoop());
  std::lock_guard lock(handlersMutex_);

  auto fdIter = fdToRecord_.find(fd);
  TP_DCHECK(fdIter != fdToRecord_.end());
  uint64_t oldRecord = fdIter->second;
  fdToRecord_.erase(fdIter);
  recordToHandler_.erase(oldRecord);
  auto rv = ::epoll_ctl(epollFd_.fd(), EPOLL_CTL_DEL, fd, nullptr);
  TP_THROW_SYSTEM_IF(rv == -1, errno);

  // Maybe we're done and the event loop is waiting for the last handlers to
  // be unregistered before terminating, so just in case we wake it up.
  if (fdToRecord_.empty()) {
    wakeup();
  }
}

void EpollLoop::wakeup() {
  // Perform a write to eventfd to wake up epoll_wait(2).
  eventFd_.writeOrThrow(1);
}

bool EpollLoop::hasRegisteredHandlers() {
  std::lock_guard lock(handlersMutex_);
  // The two maps are kept in lockstep; their sizes must always agree.
  TP_DCHECK_EQ(fdToRecord_.size(), recordToHandler_.size());
  return !fdToRecord_.empty();
}

// Body of the dedicated epoll thread.
void EpollLoop::loop() {
  setThreadName("TP_IBV_loop");

  // Stop when another thread has asked the loop the close and when all
  // handlers have been unregistered except for the wakeup eventfd one.
  while (!closed_ || hasRegisteredHandlers()) {
    // Use fixed epoll_event capacity for every call.
    std::vector epollEvents(kCapacity);

    // Block waiting for something to happen...
    auto nfds =
        ::epoll_wait(epollFd_.fd(), epollEvents.data(), epollEvents.size(), -1);
    if (nfds == -1) {
      if (errno == EINTR) {
        continue;
      }
      TP_THROW_SYSTEM(errno);
    }

    // Always immediately read from the eventfd so that it is no longer
    // readable on the next call to epoll_wait(2). As it's opened in
    // non-blocking mode, reading from it if its value is zero just return
    // EAGAIN. Reset it before invoking any of the callbacks, so that if they
    // perform a wakeup they will wake up the next iteration of epoll_wait(2).
    {
      uint64_t val;
      auto rv = eventFd_.read(reinterpret_cast(&val), sizeof(val));
      TP_DCHECK(
          (rv == -1 && errno == EAGAIN) || (rv == sizeof(val) && val > 0));
    }

    // Resize based on actual number of events.
    epollEvents.resize(nfds);

    // Defer handling to reactor and wait for it to process these events.
    deferredExecutor_.runInLoop(
        [this, epollEvents{std::move(epollEvents)}]() mutable {
          handleEpollEventsFromLoop(std::move(epollEvents));
        });
  }
}

// Runs on the reactor thread; dispatches each event to its handler, skipping
// events whose record has become stale in the meantime.
void EpollLoop::handleEpollEventsFromLoop(
    std::vector epollEvents) {
  TP_DCHECK(deferredExecutor_.inLoop());

  // Process events returned by epoll_wait(2).
  for (const auto& event : epollEvents) {
    const uint64_t record = event.data.u64;
    // Make a copy so that if the handler unregisters itself as it runs it
    // will still be kept alive by our copy of the shared_ptr.
    std::shared_ptr handler;
    {
      std::unique_lock handlersLock(handlersMutex_);
      const auto recordIter = recordToHandler_.find(record);
      if (recordIter == recordToHandler_.end()) {
        // Stale record: the fd was unregistered or re-registered after
        // epoll_wait returned; drop the event.
        continue;
      }
      handler = recordIter->second;
    }
    handler->handleEventsFromLoop(event.events);
  }
}

// Renders an epoll event mask as a human-readable string, e.g. "IN | OUT";
// unrecognized leftover bits are appended as a decimal number.
std::string EpollLoop::formatEpollEvents(uint32_t events) {
  std::string res;
  if (events & EPOLLIN) {
    res = res.empty() ? "IN" : res + " | IN";
    events &= ~EPOLLIN;
  }
  if (events & EPOLLOUT) {
    res = res.empty() ? "OUT" : res + " | OUT";
    events &= ~EPOLLOUT;
  }
  if (events & EPOLLERR) {
    res = res.empty() ? "ERR" : res + " | ERR";
    events &= ~EPOLLERR;
  }
  if (events & EPOLLHUP) {
    res = res.empty() ? "HUP" : res + " | HUP";
    events &= ~EPOLLHUP;
  }
  if (events > 0) {
    std::string eventsStr = std::to_string(events);
    res = res.empty() ? eventsStr : res + " | " + eventsStr;
  }
  return res;
}

} // namespace tensorpipe

// ============================================================
// FILE: tensorpipe/common/epoll_loop.h
// ============================================================

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

// NOTE(review): #include header names stripped by extraction.
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

namespace tensorpipe {

// An epoll(7)-based event loop: a dedicated thread blocks in epoll_wait(2)
// and defers the actual event handling to the given DeferredExecutor.
class EpollLoop final {
 public:
  // Abstract base class called by the epoll(2) event loop.
  //
  // Dispatch to multiple types is needed because we must deal with a
  // few listening sockets and an eventfd(2) per connection.
  //
  class EventHandler {
   public:
    virtual ~EventHandler() = default;

    virtual void handleEventsFromLoop(int events) = 0;
  };

  explicit EpollLoop(DeferredExecutor& deferredExecutor);

  // Register file descriptor with event loop.
  //
  // Trigger the handler if any of the epoll events in the `events`
  // mask occurs. If an event is triggered, the loop first acquires a
  // copy of the shared_ptr to the handler before calling into its
  // handler function. This ensures that the handler is alive for the
  // duration of this function.
  //
  void registerDescriptor(int fd, int events, std::shared_ptr h);

  // Unregister file descriptor from event loop.
  //
  // This resets the shared_ptr to the event handler that was registered
  // in `registerDescriptor`. Upon returning, the handler can no
  // longer be called, even if there were pending events for the file
  // descriptor. Only if the loop had acquired a shared_ptr to the
  // handler prior to this function being called, can the handler
  // function still be called.
  //
  void unregisterDescriptor(int fd);

  void close();

  // Tell loop to terminate when no more handlers remain.
  void join();

  ~EpollLoop();

  static std::string formatEpollEvents(uint32_t events);

 private:
  // Number of epoll_event slots passed to each epoll_wait(2) call.
  static constexpr auto kCapacity = 64;

  // The reactor is used to process events for this loop.
  DeferredExecutor& deferredExecutor_;

  // Wake up the event loop.
  void wakeup();

  // Main loop function.
  void loop();

  // Check whether some handlers are currently registered.
  bool hasRegisteredHandlers();

  Fd epollFd_;
  Fd eventFd_;
  std::atomic closed_{false};
  std::atomic joined_{false};
  std::thread thread_;

  // Interaction with epoll(7).
  //
  // A dedicated thread runs epoll_wait(2) in a loop and, every time it
  // returns, it defers a function to the reactor which is responsible for
  // processing the epoll events and executing the handlers, and then notify
  // the epoll thread that it is done, for it to start another iteration. This
  // back-and-forth between these threads is done to ensure that all epoll
  // handlers are run from the reactor thread, just like everything else.
  // Doing so makes it easier to reason about how certain events are
  // sequenced. For example, if another processes first makes a write to a
  // connection and then closes the accompanying Unix domain socket, we know
  // for a fact that the reactor will first react to the write, and then react
  // to the epoll event caused by closing the socket. If we didn't force
  // serialization onto the reactor, we would not have this guarantee.
  //
  // It's safe to call epoll_ctl from one thread while another thread is
  // blocked on an epoll_wait call. This means that the kernel internally
  // serializes the operations on a single epoll fd. However, we have no way
  // to control whether a modification of the set of file descriptors
  // monitored by epoll occurred just before or just after the return from the
  // epoll_wait. This means that when we start processing the result of
  // epoll_wait we can't know what set of file descriptors it operated on.
  // This becomes a problem if, for example, in between the moment epoll_wait
  // returns and the moment we process the results a file descriptor is
  // unregistered and closed and another one with the same value is opened and
  // registered: we'd end up calling the handler of the new fd for the events
  // of the old one (which probably include errors).
  //
  // However, epoll offers a way to address this: epoll_wait returns, for each
  // event, the piece of extra data that was provided by the *last* call on
  // epoll_ctl for that fd. This allows us to detect whether epoll_wait had
  // taken into account an update to the set of fds or not. We do so by giving
  // each update a unique identifier, called "record". Each update to a fd
  // will associate a new record to it. The handlers are associated to records
  // (and not to fds), and for each fd we know which handler is the one
  // currently installed. This way when processing an event we can detect
  // whether the record for that event is still valid or whether it is stale,
  // in which case we disregard the event, and wait for it to fire again at
  // the next epoll iteration, with the up-to-date handler.
  std::unordered_map fdToRecord_;
  std::unordered_map> recordToHandler_;
  uint64_t nextRecord_{1}; // Reserve record 0 for the eventfd
  std::mutex handlersMutex_;

  // Deferred to the reactor to handle the events received by epoll_wait(2).
  void handleEpollEventsFromLoop(std::vector epollEvents);
};

} // namespace tensorpipe

// ============================================================
// FILE: tensorpipe/common/error.cc
// ============================================================

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

// NOTE(review): #include header names stripped by extraction.
#include
#include
#include
#include

namespace tensorpipe {

const Error Error::kSuccess = Error();

// Appends the origin (file:line) to the wrapped error's message.
std::string Error::what() const {
  TP_DCHECK(error_);
  std::ostringstream ss;
  ss << error_->what() << " (this error originated at " << file_ << ":"
     << line_ << ")";
  return ss.str();
}

// Formats as "<syscall>: <strerror message>".
std::string SystemError::what() const {
  std::ostringstream ss;
  ss << syscall_ << ": " << strerror(error_);
  return ss.str();
}

int SystemError::errorCode() const {
  return error_;
}

std::string ShortReadError::what() const {
  std::ostringstream ss;
  ss << "short read: got " << actual_ << " bytes while expecting to read "
     << expected_ << " bytes";
  return ss.str();
}

std::string ShortWriteError::what() const {
  std::ostringstream ss;
  ss << "short write: wrote " << actual_ << " bytes while expecting to write "
     << expected_ << " bytes";
  return ss.str();
}

std::string EOFError::what() const {
  return "eof";
}

} // namespace tensorpipe

// ============================================================
// FILE: tensorpipe/common/error.h
// ============================================================

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */
Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include namespace tensorpipe { // Base class for actual errors. class BaseError { public: virtual ~BaseError() = default; // Returns an explanatory string. // Like `std::exception` but returns a `std::string`. virtual std::string what() const = 0; }; // Wrapper class for errors. // // Background: we wish to not use exceptions yet need an error // representation that can propagate across function and thread // boundaries. This representation must be copyable (so we can store // and return it at a later point in time) and retain downstream type // information. This implies a heap allocation because it's the // easiest way to deal with variable size objects (barring a union of // all downstream error classes and a lot of custom code). Instead of // passing a shared_ptr around directly, we use this wrapper class to // keep implementation details hidden from calling code. // class Error final { public: // Constant instance that indicates success. static const Error kSuccess; // Default constructor for error that is not an error. Error() {} Error(std::shared_ptr error, std::string file, int line) : error_(std::move(error)), file_(std::move(file)), line_(line) {} ~Error() = default; // Converting to boolean means checking if there is an error. This // means we don't need to use an `std::optional` and allows for a // snippet like the following: // // if (error) { // // Deal with it. // } // operator bool() const { return static_cast(error_); } template std::shared_ptr castToType() const { return std::dynamic_pointer_cast(error_); } template bool isOfType() const { return castToType() != nullptr; } // Like `std::exception` but returns a `std::string`. 
std::string what() const; private: std::shared_ptr error_; std::string file_; int line_; }; class SystemError final : public BaseError { public: explicit SystemError(const char* syscall, int error) : syscall_(syscall), error_(error) {} std::string what() const override; int errorCode() const; private: const char* syscall_; const int error_; }; class ShortReadError final : public BaseError { public: ShortReadError(ssize_t expected, ssize_t actual) : expected_(expected), actual_(actual) {} std::string what() const override; private: const ssize_t expected_; const ssize_t actual_; }; class ShortWriteError final : public BaseError { public: ShortWriteError(ssize_t expected, ssize_t actual) : expected_(expected), actual_(actual) {} std::string what() const override; private: const ssize_t expected_; const ssize_t actual_; }; class EOFError final : public BaseError { public: EOFError() {} std::string what() const override; }; } // namespace tensorpipe ================================================ FILE: tensorpipe/common/error_macros.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #define TP_CREATE_ERROR(typ, ...) \ (Error( \ std::make_shared(__VA_ARGS__), \ TP_TRIM_FILENAME(__FILE__), \ __LINE__)) ================================================ FILE: tensorpipe/common/fd.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #include #include namespace tensorpipe { ssize_t Fd::read(void* buf, size_t count) { ssize_t rv = -1; for (;;) { rv = ::read(fd_, buf, count); if (rv == -1 && errno == EINTR) { continue; } break; } return rv; } // Proxy to write(2) with EINTR retry. ssize_t Fd::write(const void* buf, size_t count) { ssize_t rv = -1; for (;;) { rv = ::write(fd_, buf, count); if (rv == -1 && errno == EINTR) { continue; } break; } return rv; } // Call read and throw if it doesn't complete. Error Fd::readFull(void* buf, size_t count) { auto rv = read(buf, count); if (rv == -1) { return TP_CREATE_ERROR(SystemError, "read", errno); } if (rv != count) { return TP_CREATE_ERROR(ShortReadError, count, rv); } return Error::kSuccess; } // Call write and throw if it doesn't complete. Error Fd::writeFull(const void* buf, size_t count) { auto rv = write(buf, count); if (rv == -1) { return TP_CREATE_ERROR(SystemError, "write", errno); } if (rv != count) { return TP_CREATE_ERROR(ShortWriteError, count, rv); } return Error::kSuccess; } } // namespace tensorpipe ================================================ FILE: tensorpipe/common/fd.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include namespace tensorpipe { class Fd { public: Fd() = default; explicit Fd(int fd) : fd_(fd) {} virtual ~Fd() { reset(); } // Disable copy constructor. Fd(const Fd&) = delete; // Disable copy assignment. Fd& operator=(const Fd&) = delete; // Custom move constructor. Fd(Fd&& other) noexcept { std::swap(fd_, other.fd_); } // Custom move assignment. Fd& operator=(Fd&& other) noexcept { std::swap(fd_, other.fd_); return *this; } // Return underlying file descriptor. 
int fd() const { return fd_; } bool hasValue() const { return fd_ >= 0; } void reset() { if (hasValue()) { ::close(fd_); fd_ = -1; } } // Proxy to read(2) with EINTR retry. ssize_t read(void* buf, size_t count); // Proxy to write(2) with EINTR retry. ssize_t write(const void* buf, size_t count); // Call read and return error if it doesn't exactly read `count` bytes. Error readFull(void* buf, size_t count); // Call write and return error if it doesn't exactly write `count` bytes. Error writeFull(const void* buf, size_t count); // Call `readFull` with trivially copyable type. Throws on errors. template T readOrThrow() { T tmp; static_assert(std::is_trivially_copyable::value, "!"); auto err = readFull(&tmp, sizeof(T)); if (err) { throw std::runtime_error(err.what()); } return tmp; } // Call `writeFull` with trivially copyable type. Throws on errors. template void writeOrThrow(const T& t) { static_assert(std::is_trivially_copyable::value, "!"); auto err = writeFull(&t, sizeof(T)); if (err) { throw std::runtime_error(err.what()); } } // Call `readFull` with trivially copyable type. template Error read(T* t) { static_assert(std::is_trivially_copyable::value, "!"); return readFull(t, sizeof(T)); } // Call `writeFull` with trivially copyable type. template Error write(const T& t) { static_assert(std::is_trivially_copyable::value, "!"); return writeFull(&t, sizeof(T)); } protected: int fd_{-1}; }; } // namespace tensorpipe ================================================ FILE: tensorpipe/common/ibv.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
/* End of the ibv.cc file header (BSD-style license, continued above). */

// NOTE(review): #include header names stripped by extraction.
#include
#include
#include

namespace tensorpipe {

// Maps an ibverbs work-completion opcode to its name for logging; unknown
// values are rendered as "UNKNOWN (<number>)".
std::string ibvWorkCompletionOpcodeToStr(IbvLib::wc_opcode opcode) {
  switch (opcode) {
    case IbvLib::WC_SEND:
      return "SEND";
    case IbvLib::WC_RDMA_WRITE:
      return "RDMA_WRITE";
    case IbvLib::WC_RDMA_READ:
      return "RDMA_READ";
    case IbvLib::WC_COMP_SWAP:
      return "COMP_SWAP";
    case IbvLib::WC_FETCH_ADD:
      return "FETCH_ADD";
    case IbvLib::WC_BIND_MW:
      return "BIND_MW";
    case IbvLib::WC_RECV:
      return "RECV";
    case IbvLib::WC_RECV_RDMA_WITH_IMM:
      return "RECV_RDMA_WITH_IMM";
    default:
      return "UNKNOWN (" + std::to_string(opcode) + ")";
  }
}

// Builds an IbvAddress for the given device context/port/GID index by
// querying the port attributes (LID, MTU, max message size) and the GID.
struct IbvAddress makeIbvAddress(
    const IbvLib& ibvLib,
    const IbvContext& context,
    uint8_t portNum,
    uint8_t globalIdentifierIndex) {
  struct IbvAddress addr;
  std::memset(&addr, 0, sizeof(addr));

  addr.portNum = portNum;
  addr.globalIdentifierIndex = globalIdentifierIndex;

  IbvLib::port_attr portAttr;
  std::memset(&portAttr, 0, sizeof(portAttr));
  TP_CHECK_IBV_INT(ibvLib.query_port(context.get(), portNum, &portAttr));
  addr.localIdentifier = portAttr.lid;
  addr.maximumTransmissionUnit = portAttr.active_mtu;
  addr.maximumMessageSize = portAttr.max_msg_sz;

  TP_CHECK_IBV_INT(ibvLib.query_gid(
      context.get(), portNum, globalIdentifierIndex, &addr.globalIdentifier));

  return addr;
}

// Gathers the local address/queue-pair information that must be exchanged
// with the remote side to connect the two queue pairs.
struct IbvSetupInformation makeIbvSetupInformation(
    const IbvAddress& addr,
    const IbvQueuePair& qp) {
  struct IbvSetupInformation info;
  std::memset(&info, 0, sizeof(info));

  info.localIdentifier = addr.localIdentifier;
  info.globalIdentifier = addr.globalIdentifier;
  info.queuePairNumber = qp->qp_num;
  info.maximumTransmissionUnit = addr.maximumTransmissionUnit;
  info.maximumMessageSize = addr.maximumMessageSize;

  return info;
}

// Moves the queue pair from RESET to the INIT state.
void transitionIbvQueuePairToInit(
    const IbvLib& ibvLib,
    IbvQueuePair& qp,
    const IbvAddress& selfAddr) {
  IbvLib::qp_attr attr;
  std::memset(&attr, 0, sizeof(attr));
  int attrMask = 0;

  attrMask |= IbvLib::QP_STATE;
  attr.qp_state = IbvLib::QPS_INIT;

  // Hardcode the use of the first entry of the partition key table, as it
  // will always be valid.
  // FIXME: Make this configurable similarly to the port number.
  attrMask |= IbvLib::QP_PKEY_INDEX;
  attr.pkey_index = 0;

  attrMask |= IbvLib::QP_PORT;
  attr.port_num = selfAddr.portNum;

  attrMask |= IbvLib::QP_ACCESS_FLAGS;
  attr.qp_access_flags =
      IbvLib::ACCESS_LOCAL_WRITE | IbvLib::ACCESS_REMOTE_WRITE;

  TP_CHECK_IBV_INT(ibvLib.modify_qp(qp.get(), &attr, attrMask));
}

// Moves the queue pair from INIT to the ready-to-receive (RTR) state, wiring
// in the remote side's address and queue pair number.
void transitionIbvQueuePairToReadyToReceive(
    const IbvLib& ibvLib,
    IbvQueuePair& qp,
    const IbvAddress& selfAddr,
    const IbvSetupInformation& destinationInfo) {
  IbvLib::qp_attr attr;
  std::memset(&attr, 0, sizeof(attr));
  int attrMask = 0;

  attrMask |= IbvLib::QP_STATE;
  attr.qp_state = IbvLib::QPS_RTR;

  // Global routing is only set up as far as needed to support RoCE.
  // A zero local identifier indicates RoCE, where routing must go through
  // the global (GID-based) path instead of the LID.
  attrMask |= IbvLib::QP_AV;
  if (destinationInfo.localIdentifier != 0) {
    attr.ah_attr.is_global = 0;
    attr.ah_attr.dlid = destinationInfo.localIdentifier;
  } else {
    attr.ah_attr.is_global = 1;
    attr.ah_attr.grh.dgid = destinationInfo.globalIdentifier;
    attr.ah_attr.grh.sgid_index = selfAddr.globalIdentifierIndex;
    attr.ah_attr.grh.hop_limit = 1;
  }
  attr.ah_attr.port_num = selfAddr.portNum;

  // Use the smaller of the two sides' MTUs.
  attrMask |= IbvLib::QP_PATH_MTU;
  attr.path_mtu = std::min(
      selfAddr.maximumTransmissionUnit,
      destinationInfo.maximumTransmissionUnit);

  attrMask |= IbvLib::QP_DEST_QPN;
  attr.dest_qp_num = destinationInfo.queuePairNumber;

  // The packet sequence numbers of the local send and of the remote receive
  // queues (and vice versa) only need to match. Thus we set them all to zero.
  attrMask |= IbvLib::QP_RQ_PSN;
  attr.rq_psn = 0;

  attrMask |= IbvLib::QP_MAX_DEST_RD_ATOMIC;
  attr.max_dest_rd_atomic = 1;

  attrMask |= IbvLib::QP_MIN_RNR_TIMER;
  attr.min_rnr_timer = 20; // 10.24 milliseconds

  TP_CHECK_IBV_INT(ibvLib.modify_qp(qp.get(), &attr, attrMask));
}

// Moves the queue pair from RTR to the ready-to-send (RTS) state.
void transitionIbvQueuePairToReadyToSend(
    const IbvLib& ibvLib,
    IbvQueuePair& qp) {
  IbvLib::qp_attr attr;
  std::memset(&attr, 0, sizeof(attr));
  int attrMask = 0;

  attrMask |= IbvLib::QP_STATE;
  attr.qp_state = IbvLib::QPS_RTS;

  // The packet sequence numbers of the local send and of the remote receive
  // queues (and vice versa) only need to match. Thus we set them all to zero.
  attrMask |= IbvLib::QP_SQ_PSN;
  attr.sq_psn = 0;

  attrMask |= IbvLib::QP_TIMEOUT;
  attr.timeout = 14; // 67.1 milliseconds

  attrMask |= IbvLib::QP_RETRY_CNT;
  attr.retry_cnt = 7;

  attrMask |= IbvLib::QP_RNR_RETRY;
  attr.rnr_retry = 7; // infinite

  attrMask |= IbvLib::QP_MAX_QP_RD_ATOMIC;
  attr.max_rd_atomic = 1;

  TP_CHECK_IBV_INT(ibvLib.modify_qp(qp.get(), &attr, attrMask));
}

// Forces the queue pair into the error state (e.g. to flush outstanding
// work requests during shutdown).
void transitionIbvQueuePairToError(const IbvLib& ibvLib, IbvQueuePair& qp) {
  IbvLib::qp_attr attr;
  std::memset(&attr, 0, sizeof(attr));
  int attrMask = 0;

  attrMask |= IbvLib::QP_STATE;
  attr.qp_state = IbvLib::QPS_ERR;

  TP_CHECK_IBV_INT(ibvLib.modify_qp(qp.get(), &attr, attrMask));
}

} // namespace tensorpipe

// ============================================================
// FILE: tensorpipe/common/ibv.h
// ============================================================

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
*/ #pragma once #include #include #include namespace tensorpipe { // Error checking macros #define TP_CHECK_IBV_PTR(op) \ [&]() { \ auto ptr = op; \ TP_THROW_SYSTEM_IF(ptr == nullptr, errno); \ return ptr; \ }() #define TP_CHECK_IBV_INT(op) \ { \ int rv = op; \ TP_THROW_SYSTEM_IF(rv < 0, errno); \ } #define TP_CHECK_IBV_VOID(op) op; // Logging helpers std::string ibvWorkCompletionOpcodeToStr(IbvLib::wc_opcode opcode); // RAII wrappers class IbvDeviceList { private: IbvDeviceList(const IbvLib& ibvLib, IbvLib::device** ptr, int size) : deviceList_(ptr, Deleter{&ibvLib}), size_(size) {} public: IbvDeviceList() = default; static std::tuple create(const IbvLib& ibvLib) { int size; IbvLib::device** ptr = ibvLib.get_device_list(&size); if (ptr == nullptr) { // Earlier versions of libibverbs had a bug where errno would be set to // *negative* ENOSYS when the module wasn't found. This got fixed in // https://github.com/linux-rdma/rdma-core/commit/062bf1a72badaf6ad2d51ebe4c8c8bdccfc376e2 // However, to support those versions, we manually flip it in case. return std::make_tuple( TP_CREATE_ERROR( SystemError, "ibv_get_device_list", errno == -ENOSYS ? ENOSYS : errno), IbvDeviceList()); } return std::make_tuple(Error::kSuccess, IbvDeviceList(ibvLib, ptr, size)); } int size() { return size_; } IbvLib::device& operator[](int i) { return *deviceList_.get()[i]; } void reset() { deviceList_.reset(); } // FIXME Can we support a "range" API (i.e., a begin() and end() method) so // that this can be used in a for (auto& dev : deviceList) expression? 
private: struct Deleter { void operator()(IbvLib::device** ptr) { TP_CHECK_IBV_VOID(ibvLib->free_device_list(ptr)); } const IbvLib* ibvLib; }; std::unique_ptr deviceList_; int size_; }; struct IbvContextDeleter { void operator()(IbvLib::context* ptr) { TP_CHECK_IBV_INT(ibvLib->close_device(ptr)); } const IbvLib* ibvLib; }; using IbvContext = std::unique_ptr; inline IbvContext createIbvContext( const IbvLib& ibvLib, IbvLib::device& device) { return IbvContext( TP_CHECK_IBV_PTR(ibvLib.open_device(&device)), IbvContextDeleter{&ibvLib}); } struct IbvProtectionDomainDeleter { void operator()(IbvLib::pd* ptr) { TP_CHECK_IBV_INT(ibvLib->dealloc_pd(ptr)); } const IbvLib* ibvLib; }; using IbvProtectionDomain = std::unique_ptr; inline IbvProtectionDomain createIbvProtectionDomain( const IbvLib& ibvLib, IbvContext& context) { return IbvProtectionDomain( TP_CHECK_IBV_PTR(ibvLib.alloc_pd(context.get())), IbvProtectionDomainDeleter{&ibvLib}); } struct IbvCompletionQueueDeleter { void operator()(IbvLib::cq* ptr) { TP_CHECK_IBV_INT(ibvLib->destroy_cq(ptr)); } const IbvLib* ibvLib; }; using IbvCompletionQueue = std::unique_ptr; inline IbvCompletionQueue createIbvCompletionQueue( const IbvLib& ibvLib, IbvContext& context, int cqe, void* cq_context, IbvLib::comp_channel* channel, int comp_vector) { return IbvCompletionQueue( TP_CHECK_IBV_PTR(ibvLib.create_cq( context.get(), cqe, cq_context, channel, comp_vector)), IbvCompletionQueueDeleter{&ibvLib}); } struct IbvSharedReceiveQueueDeleter { void operator()(IbvLib::srq* ptr) { TP_CHECK_IBV_INT(ibvLib->destroy_srq(ptr)); } const IbvLib* ibvLib; }; using IbvSharedReceiveQueue = std::unique_ptr; inline IbvSharedReceiveQueue createIbvSharedReceiveQueue( const IbvLib& ibvLib, IbvProtectionDomain& pd, IbvLib::srq_init_attr& initAttr) { return IbvSharedReceiveQueue( TP_CHECK_IBV_PTR(ibvLib.create_srq(pd.get(), &initAttr)), IbvSharedReceiveQueueDeleter{&ibvLib}); } struct IbvMemoryRegionDeleter { void operator()(IbvLib::mr* ptr) { 
TP_CHECK_IBV_INT(ibvLib->dereg_mr(ptr)); } const IbvLib* ibvLib; }; using IbvMemoryRegion = std::unique_ptr; inline IbvMemoryRegion createIbvMemoryRegion( const IbvLib& ibvLib, IbvProtectionDomain& pd, void* addr, size_t length, int accessFlags) { return IbvMemoryRegion( TP_CHECK_IBV_PTR(ibvLib.reg_mr(pd.get(), addr, length, accessFlags)), IbvMemoryRegionDeleter{&ibvLib}); } struct IbvQueuePairDeleter { void operator()(IbvLib::qp* ptr) { TP_CHECK_IBV_INT(ibvLib->destroy_qp(ptr)); } const IbvLib* ibvLib; }; using IbvQueuePair = std::unique_ptr; inline IbvQueuePair createIbvQueuePair( const IbvLib& ibvLib, IbvProtectionDomain& pd, IbvLib::qp_init_attr& initAttr) { return IbvQueuePair( TP_CHECK_IBV_PTR(ibvLib.create_qp(pd.get(), &initAttr)), IbvQueuePairDeleter{&ibvLib}); } // Helpers struct IbvAddress { uint8_t portNum; uint8_t globalIdentifierIndex; // The already-resolved LID of the above device+port pair. uint32_t localIdentifier; // The already-resolved GID of the above device+port+index combination. 
IbvLib::gid globalIdentifier; IbvLib::mtu maximumTransmissionUnit; uint32_t maximumMessageSize; }; struct IbvSetupInformation { uint32_t localIdentifier; IbvLib::gid globalIdentifier; uint32_t queuePairNumber; IbvLib::mtu maximumTransmissionUnit; uint32_t maximumMessageSize; }; struct IbvAddress makeIbvAddress( const IbvLib& ibvLib, const IbvContext& context, uint8_t portNum, uint8_t globalIdentifierIndex); struct IbvSetupInformation makeIbvSetupInformation( const IbvAddress& addr, const IbvQueuePair& qp); void transitionIbvQueuePairToInit( const IbvLib& ibvLib, IbvQueuePair& qp, const IbvAddress& selfAddr); void transitionIbvQueuePairToReadyToReceive( const IbvLib& ibvLib, IbvQueuePair& qp, const IbvAddress& selfAddr, const IbvSetupInformation& destinationInfo); void transitionIbvQueuePairToReadyToSend( const IbvLib& ibvLib, IbvQueuePair& qp); void transitionIbvQueuePairToError(const IbvLib& ibvLib, IbvQueuePair& qp); } // namespace tensorpipe ================================================ FILE: tensorpipe/common/ibv_lib.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include namespace tensorpipe { // Master list of all symbols we care about from libibverbs. 
// X-macro over every function that is dlsym'd from libibverbs.so.1: each
// entry is _(symbol name without the "ibv_" prefix, return type, argument
// types). It is expanded below to declare the function-pointer fields, the
// forwarding call wrappers, and the symbol-loading code in a single place.
#define TP_FORALL_IBV_SYMBOLS(_)                                      \
  _(ack_async_event, void, (IbvLib::async_event*))                    \
  _(alloc_pd, IbvLib::pd*, (IbvLib::context*))                        \
  _(close_device, int, (IbvLib::context*))                            \
  _(create_cq,                                                        \
    IbvLib::cq*,                                                      \
    (IbvLib::context*, int, void*, IbvLib::comp_channel*, int))       \
  _(create_qp, IbvLib::qp*, (IbvLib::pd*, IbvLib::qp_init_attr*))     \
  _(create_srq, IbvLib::srq*, (IbvLib::pd*, IbvLib::srq_init_attr*))  \
  _(dealloc_pd, int, (IbvLib::pd*))                                   \
  _(dereg_mr, int, (IbvLib::mr*))                                     \
  _(destroy_cq, int, (IbvLib::cq*))                                   \
  _(destroy_qp, int, (IbvLib::qp*))                                   \
  _(destroy_srq, int, (IbvLib::srq*))                                 \
  _(event_type_str, const char*, (IbvLib::event_type))                \
  _(free_device_list, void, (IbvLib::device**))                       \
  _(get_async_event, int, (IbvLib::context*, IbvLib::async_event*))   \
  _(get_device_list, IbvLib::device**, (int*))                        \
  _(get_device_name, const char*, (IbvLib::device*))                  \
  _(modify_qp, int, (IbvLib::qp*, IbvLib::qp_attr*, int))             \
  _(open_device, IbvLib::context*, (IbvLib::device*))                 \
  _(query_gid, int, (IbvLib::context*, uint8_t, int, IbvLib::gid*))   \
  _(query_port, int, (IbvLib::context*, uint8_t, IbvLib::port_attr*)) \
  _(reg_mr, IbvLib::mr*, (IbvLib::pd*, void*, size_t, int))           \
  _(wc_status_str, const char*, (IbvLib::wc_status))

// Wrapper for libibverbs.
class IbvLib { public: // Constants enum { SYSFS_NAME_MAX = 64, SYSFS_PATH_MAX = 256 }; enum { WC_IP_CSUM_OK_SHIFT = 2 }; // Enums enum access_flags { ACCESS_LOCAL_WRITE = 1, ACCESS_REMOTE_WRITE = (1 << 1), ACCESS_REMOTE_READ = (1 << 2), ACCESS_REMOTE_ATOMIC = (1 << 3), ACCESS_MW_BIND = (1 << 4), ACCESS_ZERO_BASED = (1 << 5), ACCESS_ON_DEMAND = (1 << 6), ACCESS_HUGETLB = (1 << 7), ACCESS_RELAXED_ORDERING = (1 << 20), }; enum event_type { EVENT_CQ_ERR, EVENT_QP_FATAL, EVENT_QP_REQ_ERR, EVENT_QP_ACCESS_ERR, EVENT_COMM_EST, EVENT_SQ_DRAINED, EVENT_PATH_MIG, EVENT_PATH_MIG_ERR, EVENT_DEVICE_FATAL, EVENT_PORT_ACTIVE, EVENT_PORT_ERR, EVENT_LID_CHANGE, EVENT_PKEY_CHANGE, EVENT_SM_CHANGE, EVENT_SRQ_ERR, EVENT_SRQ_LIMIT_REACHED, EVENT_QP_LAST_WQE_REACHED, EVENT_CLIENT_REREGISTER, EVENT_GID_CHANGE, EVENT_WQ_FATAL, }; enum mig_state { MIG_MIGRATED, MIG_REARM, MIG_ARMED }; enum mtu { MTU_256 = 1, MTU_512 = 2, MTU_1024 = 3, MTU_2048 = 4, MTU_4096 = 5 }; enum mw_type { MW_TYPE_1 = 1, MW_TYPE_2 = 2 }; enum node_type { NODE_UNKNOWN = -1, NODE_CA = 1, NODE_SWITCH, NODE_ROUTER, NODE_RNIC, NODE_USNIC, NODE_USNIC_UDP, NODE_UNSPECIFIED, }; enum port_state { PORT_NOP = 0, PORT_DOWN = 1, PORT_INIT = 2, PORT_ARMED = 3, PORT_ACTIVE = 4, PORT_ACTIVE_DEFER = 5 }; enum qp_attr_mask { QP_STATE = 1 << 0, QP_CUR_STATE = 1 << 1, QP_EN_SQD_ASYNC_NOTIFY = 1 << 2, QP_ACCESS_FLAGS = 1 << 3, QP_PKEY_INDEX = 1 << 4, QP_PORT = 1 << 5, QP_QKEY = 1 << 6, QP_AV = 1 << 7, QP_PATH_MTU = 1 << 8, QP_TIMEOUT = 1 << 9, QP_RETRY_CNT = 1 << 10, QP_RNR_RETRY = 1 << 11, QP_RQ_PSN = 1 << 12, QP_MAX_QP_RD_ATOMIC = 1 << 13, QP_ALT_PATH = 1 << 14, QP_MIN_RNR_TIMER = 1 << 15, QP_SQ_PSN = 1 << 16, QP_MAX_DEST_RD_ATOMIC = 1 << 17, QP_PATH_MIG_STATE = 1 << 18, QP_CAP = 1 << 19, QP_DEST_QPN = 1 << 20, QP_RATE_LIMIT = 1 << 25, }; enum qp_state { QPS_RESET, QPS_INIT, QPS_RTR, QPS_RTS, QPS_SQD, QPS_SQE, QPS_ERR, QPS_UNKNOWN }; enum qp_type { QPT_RC = 2, QPT_UC, QPT_UD, QPT_RAW_PACKET = 8, QPT_XRC_SEND = 9, QPT_XRC_RECV, 
QPT_DRIVER = 0xff, }; enum transport_type { TRANSPORT_UNKNOWN = -1, TRANSPORT_IB = 0, TRANSPORT_IWARP, TRANSPORT_USNIC, TRANSPORT_USNIC_UDP, TRANSPORT_UNSPECIFIED, }; enum wc_flags { WC_GRH = 1 << 0, WC_WITH_IMM = 1 << 1, WC_IP_CSUM_OK = 1 << WC_IP_CSUM_OK_SHIFT, WC_WITH_INV = 1 << 3, WC_TM_SYNC_REQ = 1 << 4, WC_TM_MATCH = 1 << 5, WC_TM_DATA_VALID = 1 << 6, }; enum wc_opcode { WC_SEND, WC_RDMA_WRITE, WC_RDMA_READ, WC_COMP_SWAP, WC_FETCH_ADD, WC_BIND_MW, WC_LOCAL_INV, WC_TSO, WC_RECV = 1 << 7, WC_RECV_RDMA_WITH_IMM, WC_TM_ADD, WC_TM_DEL, WC_TM_SYNC, WC_TM_RECV, WC_TM_NO_TAG, WC_DRIVER1, }; enum wc_status { WC_SUCCESS, WC_LOC_LEN_ERR, WC_LOC_QP_OP_ERR, WC_LOC_EEC_OP_ERR, WC_LOC_PROT_ERR, WC_WR_FLUSH_ERR, WC_MW_BIND_ERR, WC_BAD_RESP_ERR, WC_LOC_ACCESS_ERR, WC_REM_INV_REQ_ERR, WC_REM_ACCESS_ERR, WC_REM_OP_ERR, WC_RETRY_EXC_ERR, WC_RNR_RETRY_EXC_ERR, WC_LOC_RDD_VIOL_ERR, WC_REM_INV_RD_REQ_ERR, WC_REM_ABORT_ERR, WC_INV_EECN_ERR, WC_INV_EEC_STATE_ERR, WC_FATAL_ERR, WC_RESP_TIMEOUT_ERR, WC_GENERAL_ERR, WC_TM_ERR, WC_TM_RNDV_INCOMPLETE, }; enum wr_opcode { WR_RDMA_WRITE, WR_RDMA_WRITE_WITH_IMM, WR_SEND, WR_SEND_WITH_IMM, WR_RDMA_READ, WR_ATOMIC_CMP_AND_SWP, WR_ATOMIC_FETCH_AND_ADD, WR_LOCAL_INV, WR_BIND_MW, WR_SEND_WITH_INV, WR_TSO, WR_DRIVER1, }; // Structs and unions // Forward declarations struct _compat_port_attr; struct ah; struct context; struct cq; struct device; struct mr; struct mw_bind; struct mw; struct pd; struct qp; struct srq; struct wq; // Attributes struct port_attr { IbvLib::port_state state; IbvLib::mtu max_mtu; IbvLib::mtu active_mtu; int gid_tbl_len; uint32_t port_cap_flags; uint32_t max_msg_sz; uint32_t bad_pkey_cntr; uint32_t qkey_viol_cntr; uint16_t pkey_tbl_len; uint16_t lid; uint16_t sm_lid; uint8_t lmc; uint8_t max_vl_num; uint8_t sm_sl; uint8_t subnet_timeout; uint8_t init_type_reply; uint8_t active_width; uint8_t active_speed; uint8_t phys_state; uint8_t link_layer; uint8_t flags; uint16_t port_cap_flags2; }; struct qp_cap { uint32_t max_send_wr; 
uint32_t max_recv_wr; uint32_t max_send_sge; uint32_t max_recv_sge; uint32_t max_inline_data; }; union gid { uint8_t raw[16]; struct { uint64_t subnet_prefix; uint64_t interface_id; } global; }; struct global_route { IbvLib::gid dgid; uint32_t flow_label; uint8_t sgid_index; uint8_t hop_limit; uint8_t traffic_class; }; struct ah_attr { IbvLib::global_route grh; uint16_t dlid; uint8_t sl; uint8_t src_path_bits; uint8_t static_rate; uint8_t is_global; uint8_t port_num; }; struct qp_attr { IbvLib::qp_state qp_state; IbvLib::qp_state cur_qp_state; IbvLib::mtu path_mtu; IbvLib::mig_state path_mig_state; uint32_t qkey; uint32_t rq_psn; uint32_t sq_psn; uint32_t dest_qp_num; unsigned int qp_access_flags; IbvLib::qp_cap cap; IbvLib::ah_attr ah_attr; IbvLib::ah_attr alt_ah_attr; uint16_t pkey_index; uint16_t alt_pkey_index; uint8_t en_sqd_async_notify; uint8_t sq_draining; uint8_t max_rd_atomic; uint8_t max_dest_rd_atomic; uint8_t min_rnr_timer; uint8_t port_num; uint8_t timeout; uint8_t retry_cnt; uint8_t rnr_retry; uint8_t alt_port_num; uint8_t alt_timeout; uint32_t rate_limit; }; struct qp_init_attr { void* qp_context; IbvLib::cq* send_cq; IbvLib::cq* recv_cq; IbvLib::srq* srq; IbvLib::qp_cap cap; IbvLib::qp_type qp_type; int sq_sig_all; }; struct srq_attr { uint32_t max_wr; uint32_t max_sge; uint32_t srq_limit; }; struct srq_init_attr { void* srq_context; IbvLib::srq_attr attr; }; // Work requests and completions struct sge { uint64_t addr; uint32_t length; uint32_t lkey; }; struct recv_wr { uint64_t wr_id; IbvLib::recv_wr* next; IbvLib::sge* sg_list; int num_sge; }; struct mw_bind_info { IbvLib::mr* mr; uint64_t addr; uint64_t length; unsigned int mw_access_flags; }; struct send_wr { uint64_t wr_id; IbvLib::send_wr* next; IbvLib::sge* sg_list; int num_sge; IbvLib::wr_opcode opcode; unsigned int send_flags; union { uint32_t imm_data; uint32_t invalidate_rkey; }; union { struct { uint64_t remote_addr; uint32_t rkey; } rdma; struct { uint64_t remote_addr; uint64_t 
compare_add; uint64_t swap; uint32_t rkey; } atomic; struct { IbvLib::ah* ah; uint32_t remote_qpn; uint32_t remote_qkey; } ud; } wr; union { struct { uint32_t remote_srqn; } xrc; } qp_type; union { struct { IbvLib::mw* mw; uint32_t rkey; IbvLib::mw_bind_info bind_info; } bind_mw; struct { void* hdr; uint16_t hdr_sz; uint16_t mss; } tso; }; }; struct wc { uint64_t wr_id; IbvLib::wc_status status; IbvLib::wc_opcode opcode; uint32_t vendor_err; uint32_t byte_len; union { uint32_t imm_data; uint32_t invalidated_rkey; }; uint32_t qp_num; uint32_t src_qp; unsigned int wc_flags; uint16_t pkey_index; uint16_t slid; uint8_t sl; uint8_t dlid_path_bits; }; // Main structs struct async_event { union { IbvLib::cq* cq; IbvLib::qp* qp; IbvLib::srq* srq; IbvLib::wq* wq; int port_num; } element; IbvLib::event_type event_type; }; struct comp_channel { IbvLib::context* context; int fd; int refcnt; }; struct context_ops { void* (*_compat_query_device)(void); int (*_compat_query_port)( IbvLib::context* context, uint8_t port_num, struct IbvLib::_compat_port_attr* port_attr); void* (*_compat_alloc_pd)(void); void* (*_compat_dealloc_pd)(void); void* (*_compat_reg_mr)(void); void* (*_compat_rereg_mr)(void); void* (*_compat_dereg_mr)(void); IbvLib::mw* (*alloc_mw)(IbvLib::pd* pd, IbvLib::mw_type type); int (*bind_mw)(IbvLib::qp* qp, IbvLib::mw* mw, IbvLib::mw_bind* mw_bind); int (*dealloc_mw)(IbvLib::mw* mw); void* (*_compat_create_cq)(void); int (*poll_cq)(IbvLib::cq* cq, int num_entries, IbvLib::wc* wc); int (*req_notify_cq)(IbvLib::cq* cq, int solicited_only); void* (*_compat_cq_event)(void); void* (*_compat_resize_cq)(void); void* (*_compat_destroy_cq)(void); void* (*_compat_create_srq)(void); void* (*_compat_modify_srq)(void); void* (*_compat_query_srq)(void); void* (*_compat_destroy_srq)(void); int (*post_srq_recv)( IbvLib::srq* srq, IbvLib::recv_wr* recv_wr, IbvLib::recv_wr** bad_recv_wr); void* (*_compat_create_qp)(void); void* (*_compat_query_qp)(void); void* 
(*_compat_modify_qp)(void); void* (*_compat_destroy_qp)(void); int (*post_send)( IbvLib::qp* qp, IbvLib::send_wr* wr, IbvLib::send_wr** bad_wr); int (*post_recv)( IbvLib::qp* qp, IbvLib::recv_wr* wr, IbvLib::recv_wr** bad_wr); void* (*_compat_create_ah)(void); void* (*_compat_destroy_ah)(void); void* (*_compat_attach_mcast)(void); void* (*_compat_detach_mcast)(void); void* (*_compat_async_event)(void); }; struct context { IbvLib::device* device; IbvLib::context_ops ops; int cmd_fd; int async_fd; int num_comp_vectors; pthread_mutex_t mutex; void* abi_compat; }; struct cq { IbvLib::context* context; IbvLib::comp_channel* channel; void* cq_context; uint32_t handle; int cqe; pthread_mutex_t mutex; pthread_cond_t cond; uint32_t comp_events_completed; uint32_t async_events_completed; }; struct _device_ops { IbvLib::context* (*_dummy1)(IbvLib::device* device, int cmd_fd); void (*_dummy2)(IbvLib::context* context); }; struct device { IbvLib::_device_ops _ops; IbvLib::node_type node_type; IbvLib::transport_type transport_type; char name[IbvLib::SYSFS_NAME_MAX]; char dev_name[IbvLib::SYSFS_NAME_MAX]; char dev_path[IbvLib::SYSFS_PATH_MAX]; char ibdev_path[IbvLib::SYSFS_PATH_MAX]; }; struct mr { IbvLib::context* context; IbvLib::pd* pd; void* addr; size_t length; uint32_t handle; uint32_t lkey; uint32_t rkey; }; struct pd { IbvLib::context* context; uint32_t handle; }; struct qp { IbvLib::context* context; void* qp_context; IbvLib::pd* pd; IbvLib::cq* send_cq; IbvLib::cq* recv_cq; IbvLib::srq* srq; uint32_t handle; uint32_t qp_num; IbvLib::qp_state state; IbvLib::qp_type qp_type; pthread_mutex_t mutex; pthread_cond_t cond; uint32_t events_completed; }; struct srq { IbvLib::context* context; void* srq_context; IbvLib::pd* pd; uint32_t handle; pthread_mutex_t mutex; pthread_cond_t cond; uint32_t events_completed; }; private: explicit IbvLib(DynamicLibraryHandle dlhandle) : dlhandle_(std::move(dlhandle)) {} DynamicLibraryHandle dlhandle_; #define TP_DECLARE_FIELD(function_name, 
return_type, args_types) \ return_type(*function_name##_ptr_) args_types = nullptr; TP_FORALL_IBV_SYMBOLS(TP_DECLARE_FIELD) #undef TP_DECLARE_FIELD public: IbvLib() = default; #define TP_FORWARD_CALL(function_name, return_type, args_types) \ template \ auto function_name(Args&&... args) const { \ return (*function_name##_ptr_)(std::forward(args)...); \ } TP_FORALL_IBV_SYMBOLS(TP_FORWARD_CALL) #undef TP_FORWARD_CALL static std::tuple create() { Error error; DynamicLibraryHandle dlhandle; // To keep things "neat" and contained, we open in "local" mode (as opposed // to global) so that the ibverbs symbols can only be resolved through this // handle and are not exposed (a.k.a., "leaded") to other shared objects. std::tie(error, dlhandle) = DynamicLibraryHandle::create("libibverbs.so.1", RTLD_LOCAL | RTLD_LAZY); if (error) { return std::make_tuple(std::move(error), IbvLib()); } // Log at level 9 as we can't know whether this will be used in a transport // or channel, thus err on the side of this being as low-level as possible // because we don't expect this to be of interest that often. 
TP_VLOG(9) << [&]() -> std::string { std::string filename; std::tie(error, filename) = dlhandle.getFilename(); if (error) { return "Couldn't determine location of shared library libibverbs.so.1: " + error.what(); } return "Found shared library libibverbs.so.1 at " + filename; }(); IbvLib lib(std::move(dlhandle)); #define TP_LOAD_SYMBOL(function_name, return_type, args_types) \ { \ void* ptr; \ std::tie(error, ptr) = lib.dlhandle_.loadSymbol("ibv_" #function_name); \ if (error) { \ return std::make_tuple(std::move(error), IbvLib()); \ } \ TP_THROW_ASSERT_IF(ptr == nullptr); \ lib.function_name##_ptr_ = \ reinterpret_cast(ptr); \ } TP_FORALL_IBV_SYMBOLS(TP_LOAD_SYMBOL) #undef TP_LOAD_SYMBOL return std::make_tuple(Error::kSuccess, std::move(lib)); } // These functions (which, it would seem, are the ones that are used in the // critical control path, and which thus must have the lowest latency and // avoid any syscall/kernel overhead) are not exposed as symbols of // libibverbs.so: they are defined inline in the header and, in fact, they // access a function pointer stored on the ibv_context and execute it. int poll_cq(IbvLib::cq* cq, int num_entries, IbvLib::wc* wc) const { return cq->context->ops.poll_cq(cq, num_entries, wc); } int post_send(IbvLib::qp* qp, IbvLib::send_wr* wr, IbvLib::send_wr** bad_wr) const { return qp->context->ops.post_send(qp, wr, bad_wr); } int post_recv(IbvLib::qp* qp, IbvLib::recv_wr* wr, IbvLib::recv_wr** bad_wr) const { return qp->context->ops.post_recv(qp, wr, bad_wr); } int post_srq_recv( IbvLib::srq* srq, IbvLib::recv_wr* recv_wr, IbvLib::recv_wr** bad_recv_wr) const { return srq->context->ops.post_srq_recv(srq, recv_wr, bad_recv_wr); } }; #undef TP_FORALL_IBV_SYMBOLS } // namespace tensorpipe ================================================ FILE: tensorpipe/common/memory.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include namespace tensorpipe { class MmappedPtr { MmappedPtr(uint8_t* ptr, size_t length) { ptr_ = decltype(ptr_)(ptr, Deleter{length}); } public: MmappedPtr() = default; static std::tuple create( size_t length, int prot, int flags, int fd) { void* ptr; ptr = ::mmap(nullptr, length, prot, flags, fd, 0); if (ptr == MAP_FAILED) { return std::make_tuple( TP_CREATE_ERROR(SystemError, "mmap", errno), MmappedPtr()); } return std::make_tuple( Error::kSuccess, MmappedPtr(reinterpret_cast(ptr), length)); } uint8_t* ptr() { return ptr_.get(); } const uint8_t* ptr() const { return ptr_.get(); } size_t getLength() const { return ptr_.get_deleter().length; } void reset() { ptr_.reset(); } private: struct Deleter { size_t length; void operator()(void* ptr) { int ret = ::munmap(ptr, length); TP_THROW_SYSTEM_IF(ret != 0, errno); } }; std::unique_ptr ptr_{nullptr, Deleter{}}; }; } // namespace tensorpipe ================================================ FILE: tensorpipe/common/nop.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include namespace tensorpipe { // Libnop makes heavy use of templates, whereas TensorPipe is designed around // polymorphism (abstract interfaces and concrete derived classes). The two // don't mix well: for example, one can't have virtual method templates. One // technique to get around this is type erasure, which is however tricky to get // right because the "fundamental" operation(s) of libnop, (de)serialization, // are simultaneously templated on two types: the reader/writer and the object. 
// Ideally we'd like for both these sets of types to be dynamically extensible,
// as we want to allow transports to provide their own specialized readers and
// writers, and channels could have their own custom objects that they want to
// (de)serialize. New transports and channels could be implemented by third
// parties and plugged in at runtime, so the sets of reader/writers and of
// objects that we must support can't be known in advance.
// We had originally found a solution to this pickle by doing two type erasures
// one after the other, first on the reader/writer, which deals with bytes and
// not objects and is thus not templated, and then on objects, leveraging the
// fact that there is one libnop (de)serializer that takes a *pointer* to a
// reader/writer giving us a "hook" on which to do polymorphism, by hardcoding a
// pointer to the base reader/writer class as template parameter, but then
// passing in an instance of a concrete subclass at runtime.
// However it turned out that this performed poorly, apparently due to the
// (de)serialization process consisting of many small calls to the reader/writer
// which each had to perform a vtable lookup. So, instead, we decided to not
// allow transports to utilize custom specialized readers/writers and to provide
// a single global reader/writer class that is able to cover the two main usage
// patterns we think are most likely to come up: reading/writing to a temporary
// contiguous buffer, and reading/writing to a ringbuffer.
// This reader and writer can operate either on one single buffer (ptr + len) or
// on two buffers: in the latter case, they first consume the first one and,
// when that fills up, they "spill over" into the second one. This is needed in
// order to support the "wrap around" point in ringbuffers.
class NopReader final { public: NopReader(const uint8_t* ptr, size_t len) : ptr1_(ptr), len1_(len) {} NopReader(const uint8_t* ptr1, size_t len1, const uint8_t* ptr2, size_t len2) : ptr1_(ptr1), len1_(len1), ptr2_(ptr2), len2_(len2) {} // NOLINTNEXTLINE(readability-identifier-naming) nop::Status Ensure(size_t size) { if (likely(size <= len1_ + len2_)) { return nop::ErrorStatus::None; } else { return nop::ErrorStatus::ReadLimitReached; } } // NOLINTNEXTLINE(readability-identifier-naming) nop::Status Read(uint8_t* byte) { if (unlikely(len1_ == 0)) { ptr1_ = ptr2_; len1_ = len2_; ptr2_ = nullptr; len2_ = 0; } *byte = *ptr1_; ptr1_++; len1_--; return nop::ErrorStatus::None; } // NOLINTNEXTLINE(readability-identifier-naming) nop::Status Read(void* begin, void* end) { size_t size = reinterpret_cast(end) - reinterpret_cast(begin); if (unlikely(len1_ < size)) { std::memcpy(begin, ptr1_, len1_); begin = reinterpret_cast(begin) + len1_; size -= len1_; ptr1_ = ptr2_; len1_ = len2_; ptr2_ = nullptr; len2_ = 0; } std::memcpy(begin, ptr1_, size); ptr1_ += size; len1_ -= size; return nop::ErrorStatus::None; } // NOLINTNEXTLINE(readability-identifier-naming) nop::Status Skip(size_t paddingBytes) { if (unlikely(len1_ < paddingBytes)) { paddingBytes -= len1_; ptr1_ = ptr2_; len1_ = len2_; ptr2_ = nullptr; len2_ = 0; } ptr1_ += paddingBytes; len1_ -= paddingBytes; return nop::ErrorStatus::None; } private: const uint8_t* ptr1_ = nullptr; size_t len1_ = 0; const uint8_t* ptr2_ = nullptr; size_t len2_ = 0; }; class NopWriter final { public: NopWriter(uint8_t* ptr, size_t len) : ptr1_(ptr), len1_(len) {} NopWriter(uint8_t* ptr1, size_t len1, uint8_t* ptr2, size_t len2) : ptr1_(ptr1), len1_(len1), ptr2_(ptr2), len2_(len2) {} // NOLINTNEXTLINE(readability-identifier-naming) nop::Status Prepare(size_t size) { if (likely(size <= len1_ + len2_)) { return nop::ErrorStatus::None; } else { return nop::ErrorStatus::WriteLimitReached; } } // NOLINTNEXTLINE(readability-identifier-naming) 
nop::Status Write(uint8_t byte) { if (unlikely(len1_ == 0)) { ptr1_ = ptr2_; len1_ = len2_; ptr2_ = nullptr; len2_ = 0; } *ptr1_ = byte; ptr1_++; len1_--; return nop::ErrorStatus::None; } // NOLINTNEXTLINE(readability-identifier-naming) nop::Status Write(const void* begin, const void* end) { size_t size = reinterpret_cast(end) - reinterpret_cast(begin); if (unlikely(len1_ < size)) { std::memcpy(ptr1_, begin, len1_); begin = reinterpret_cast(begin) + len1_; size -= len1_; ptr1_ = ptr2_; len1_ = len2_; ptr2_ = nullptr; len2_ = 0; } std::memcpy(ptr1_, begin, size); ptr1_ += size; len1_ -= size; return nop::ErrorStatus::None; } // NOLINTNEXTLINE(readability-identifier-naming) nop::Status Skip(size_t paddingBytes, uint8_t paddingValue) { if (unlikely(len1_ < paddingBytes)) { std::memset(ptr1_, paddingValue, paddingBytes); paddingBytes -= len1_; ptr1_ = ptr2_; len1_ = len2_; ptr2_ = nullptr; len2_ = 0; } std::memset(ptr1_, paddingValue, paddingBytes); ptr1_ += paddingBytes; len1_ -= paddingBytes; return nop::ErrorStatus::None; } private: uint8_t* ptr1_ = nullptr; size_t len1_ = 0; uint8_t* ptr2_ = nullptr; size_t len2_ = 0; }; // The helpers to perform type erasure of the object type: a untemplated base // class exposing the methods we need for (de)serialization, and then templated // subclasses allowing to create a holder for each concrete libnop type. 
class AbstractNopHolder { public: virtual size_t getSize() const = 0; virtual nop::Status write(NopWriter& writer) const = 0; virtual nop::Status read(NopReader& reader) = 0; virtual ~AbstractNopHolder() = default; }; template class NopHolder : public AbstractNopHolder { public: T& getObject() { return object_; } const T& getObject() const { return object_; } size_t getSize() const override { return nop::Encoding::Size(object_); } nop::Status write(NopWriter& writer) const override { return nop::Encoding::Write(object_, &writer); } nop::Status read(NopReader& reader) override { return nop::Encoding::Read(&object_, &reader); } private: T object_; }; } // namespace tensorpipe namespace nop { // The `nop::Encoding` specialization for `tensorpipe::optional` was inspired // by that of `nop::Optional`, available here: // https://github.com/google/libnop/blob/master/include/nop/base/optional.h template struct Encoding> : EncodingIO> { using Type = tensorpipe::optional; // NOLINTNEXTLINE(readability-identifier-naming) static constexpr EncodingByte Prefix(const Type& value) { return value ? Encoding::Prefix(value.value()) : EncodingByte::Nil; } // NOLINTNEXTLINE(readability-identifier-naming) static constexpr std::size_t Size(const Type& value) { return value ? 
Encoding::Size(value.value()) : BaseEncodingSize(EncodingByte::Nil); } // NOLINTNEXTLINE(readability-identifier-naming) static constexpr bool Match(EncodingByte prefix) { return prefix == EncodingByte::Nil || Encoding::Match(prefix); } template // NOLINTNEXTLINE(readability-identifier-naming) static constexpr Status WritePayload( EncodingByte prefix, const Type& value, Writer* writer) { if (value) { return Encoding::WritePayload(prefix, value.value(), writer); } else { return {}; } } template // NOLINTNEXTLINE(readability-identifier-naming) static constexpr Status ReadPayload( EncodingByte prefix, Type* value, Reader* reader) { if (prefix == EncodingByte::Nil) { value->reset(); } else { T temp; auto status = Encoding::ReadPayload(prefix, &temp, reader); if (!status) { return status; } *value = std::move(temp); } return {}; } }; } // namespace nop ================================================ FILE: tensorpipe/common/nvml_lib.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #define TP_NVML_CHECK(nvml_lib, a) \ do { \ nvmlReturn_t error = (a); \ if (error != NVML_SUCCESS) { \ const char* errorStr; \ errorStr = (nvml_lib).errorString(error); \ TP_THROW_ASSERT() << __TP_EXPAND_OPD(a) << " " << errorStr; \ } \ } while (false) namespace tensorpipe { // Master list of all symbols we care about from libnvidia-ml. 
#define TP_FORALL_NVML_SYMBOLS(_) \ _(deviceGetComputeRunningProcesses, \ nvmlDeviceGetComputeRunningProcesses, \ nvmlReturn_t, \ (nvmlDevice_t, unsigned int*, nvmlProcessInfo_t*)) \ _(deviceGetCount_v2, nvmlDeviceGetCount_v2, nvmlReturn_t, (unsigned int*)) \ _(deviceGetHandleByIndex_v2, \ nvmlDeviceGetHandleByIndex_v2, \ nvmlReturn_t, \ (unsigned int, nvmlDevice_t*)) \ _(deviceGetHandleByUUID, \ nvmlDeviceGetHandleByUUID, \ nvmlReturn_t, \ (const char*, nvmlDevice_t*)) \ _(deviceGetP2PStatus, \ nvmlDeviceGetP2PStatus, \ nvmlReturn_t, \ (nvmlDevice_t, nvmlDevice_t, nvmlGpuP2PCapsIndex_t, nvmlGpuP2PStatus_t*)) \ _(deviceGetUUID, \ nvmlDeviceGetUUID, \ nvmlReturn_t, \ (nvmlDevice_t, char*, unsigned int)) \ _(errorString, nvmlErrorString, const char*, (nvmlReturn_t)) \ _(init_v2, nvmlInit_v2, nvmlReturn_t, ()) \ _(shutdown, nvmlShutdown, nvmlReturn_t, ()) // Wrapper for libnvidia-ml. class NvmlLib { private: explicit NvmlLib(DynamicLibraryHandle dlhandle) : dlhandle_(std::move(dlhandle)) {} DynamicLibraryHandle dlhandle_; bool inited_ = false; #define TP_DECLARE_FIELD(method_name, function_name, return_type, args_types) \ return_type(*function_name##_ptr_) args_types = nullptr; TP_FORALL_NVML_SYMBOLS(TP_DECLARE_FIELD) #undef TP_DECLARE_FIELD public: NvmlLib() = default; // Implement another RAII layer (on top of the one of DynamicLibraryHandle) to // deal with nvmlInit_v2 and nvmlShutdown. The default move assignment would // fail to shutdown NVML when another instance is moved into it, and it would // cause the destructor to shutdown a moved-out instance. 
NvmlLib(const NvmlLib&) = delete; NvmlLib& operator=(const NvmlLib&) = delete; NvmlLib(NvmlLib&& other) { *this = std::move(other); } NvmlLib& operator=(NvmlLib&& other) { std::swap(dlhandle_, other.dlhandle_); std::swap(inited_, other.inited_); #define TP_SWAP_FIELD(method_name, function_name, return_type, args_types) \ std::swap(function_name##_ptr_, other.function_name##_ptr_); TP_FORALL_NVML_SYMBOLS(TP_SWAP_FIELD) #undef TP_SWAP_FIELD return *this; } #define TP_FORWARD_CALL(method_name, function_name, return_type, args_types) \ template \ auto method_name(Args&&... args) const { \ return (*function_name##_ptr_)(std::forward(args)...); \ } TP_FORALL_NVML_SYMBOLS(TP_FORWARD_CALL) #undef TP_FORWARD_CALL static std::tuple create() { Error error; DynamicLibraryHandle dlhandle; // To keep things "neat" and contained, we open in "local" mode (as // opposed to global) so that the cuda symbols can only be resolved // through this handle and are not exposed (a.k.a., "leaked") to other // shared objects. std::tie(error, dlhandle) = DynamicLibraryHandle::create( "libnvidia-ml.so.1", RTLD_LOCAL | RTLD_LAZY); if (error) { return std::make_tuple(std::move(error), NvmlLib()); } // Log at level 9 as we can't know whether this will be used in a transport // or channel, thus err on the side of this being as low-level as possible // because we don't expect this to be of interest that often. 
TP_VLOG(9) << [&]() -> std::string { std::string filename; std::tie(error, filename) = dlhandle.getFilename(); if (error) { return "Couldn't determine location of shared library libnvidia-ml.so.1: " + error.what(); } return "Found shared library libnvidia-ml.so.1 at " + filename; }(); NvmlLib lib(std::move(dlhandle)); #define TP_LOAD_SYMBOL(method_name, function_name, return_type, args_types) \ { \ void* ptr; \ std::tie(error, ptr) = lib.dlhandle_.loadSymbol(#function_name); \ if (error) { \ return std::make_tuple(std::move(error), NvmlLib()); \ } \ TP_THROW_ASSERT_IF(ptr == nullptr); \ lib.function_name##_ptr_ = \ reinterpret_cast(ptr); \ } TP_FORALL_NVML_SYMBOLS(TP_LOAD_SYMBOL) #undef TP_LOAD_SYMBOL TP_NVML_CHECK(lib, lib.init_v2()); lib.inited_ = true; return std::make_tuple(Error::kSuccess, std::move(lib)); } ~NvmlLib() { if (inited_) { TP_DCHECK(dlhandle_.hasValue()); TP_NVML_CHECK(*this, shutdown()); } } }; #undef TP_FORALL_NVML_SYMBOLS } // namespace tensorpipe ================================================ FILE: tensorpipe/common/optional.h ================================================ #pragma once #include namespace tensorpipe { using std::optional; using std::nullopt; } // namespace tensorpipe ================================================ FILE: tensorpipe/common/queue.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include #include namespace tensorpipe { template class Queue { public: explicit Queue(int capacity = 1) : capacity_(capacity) {} void push(T t) { std::unique_lock lock(mutex_); while (items_.size() >= capacity_) { cv_.wait(lock); } items_.push_back(std::move(t)); cv_.notify_all(); } T pop() { std::unique_lock lock(mutex_); while (items_.size() == 0) { cv_.wait(lock); } T t(std::move(items_.front())); items_.pop_front(); cv_.notify_all(); return t; } private: std::mutex mutex_; std::condition_variable cv_; const int capacity_; std::deque items_; }; } // namespace tensorpipe ================================================ FILE: tensorpipe/common/ringbuffer.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include /// /// C++17 implementation of shared-memory friendly perf_event style ringbuffer. /// It's designed to avoid parallel access and provide (almost) zero-copy /// /// /// A ringbuffer has a header and a data members that can be allocated /// independently from the ringbuffer object, allowing the ringbuffer object /// to be stored in process' exclusive memory while header and data /// could be in shared memory. /// /// Multiple ringbuffers can reference the same header + data. /// /// Multiple producers (or consumers) can reference the same ringbuffer. /// /// Synchronization between all producers/consumers of all ringbuffers that /// reference the same header + pair pairs is done using atomic operations /// care is taken to guarantee lock-free implementations, reduce the usage /// of LOCK prefixes and the access to non-exclusive cache lines by CPUs. 
/// /// Producers write data atomically at ringbuffer's head, while Consumers /// write data atomically at ringbuffer's tail. /// namespace tensorpipe { /// /// RingBufferHeader contains the head, tail and other control information /// of the RingBuffer. /// /// is the minimum byte size of the circular buffer. The actual /// size is the smallest power of 2 larger than kMinByteSize_. Enforcing the /// size to be a power of two avoids costly division/modulo operations. /// template class RingBufferHeader { public: static_assert(NumRoles > 0, ""); const uint64_t kDataPoolByteSize; const uint64_t kDataModMask; RingBufferHeader(const RingBufferHeader&) = delete; RingBufferHeader(RingBufferHeader&&) = delete; // Implementation uses power of 2 arithmetic to avoid costly modulo. // So build the largest RingBuffer with size of the smallest power of 2 >= // . explicit RingBufferHeader(uint64_t minDataByteSize) : kDataPoolByteSize{nextPow2(minDataByteSize)}, kDataModMask{kDataPoolByteSize - 1} { // Minimum size where implementation of bit shift arithmetic works. TP_DCHECK_GE(kDataPoolByteSize, 2) << "Minimum supported ringbuffer data size is 2 bytes"; TP_DCHECK(isPow2(kDataPoolByteSize)) << kDataPoolByteSize << " is not a power of 2"; TP_DCHECK_LE(kDataPoolByteSize, std::numeric_limits::max()) << "Logic piggy-backs read/write size on ints, to be safe forbid" " buffer to ever be larger than what an int can hold"; for (int roleIdx = 0; roleIdx < NumRoles; ++roleIdx) { inTx_[roleIdx].clear(); markers_[roleIdx] = 0; } } // Being in a transaction (either a read or a write one) gives a user of the // ringbuffer (either a consumer or a producer, respectively) the right to // read the head and tail and to modify the one they are responsible for (the // tail and the head, respectively). Accessing the head or tail outside of a // transaction could lead to races. 
This also means we need memory barriers // around a transaction, to make sure side-effects of other users are visible // upon entering and our side effects become visible to others upon exiting. // We also must prevent the compiler from reordering memory accesses. Failure // to do so may result in our reads of head/tail to look like they occurred // before we entered the transaction, and writes to them to look like they // occurred after we exited it. In order to get the desired behavior, we use // the acquire memory order when starting a transaction (which means no later // memory access can be moved before it) and the release memory order when // ending it (no earlier memory access can be moved after it). template [[nodiscard]] bool beginTransaction() { static_assert(0 <= RoleIdx && RoleIdx < NumRoles, ""); return inTx_[RoleIdx].test_and_set(std::memory_order_acquire); } template void endTransaction() { static_assert(0 <= RoleIdx && RoleIdx < NumRoles, ""); inTx_[RoleIdx].clear(std::memory_order_release); } // Reading the head and tail is what gives a user of the ringbuffer (either a // consumer or a producer) the right to access the buffer's contents: the // producer can write on [head, tail) (modulo the size), the consumer can read // from [tail, head). And, when the producer increases the head, or when the // consumer increases the tail, they give users of the opposite type the right // to access some of the memory that was previously under their control. Thus, // just like we do for the transactions, we need memory barriers around reads // and writes to the head and tail, with the same reasoning for memory orders. 
template uint64_t readMarker() const { static_assert(0 <= RoleIdx && RoleIdx < NumRoles, ""); return markers_[RoleIdx].load(std::memory_order_acquire); } template void incMarker(uint64_t inc) { static_assert(0 <= RoleIdx && RoleIdx < NumRoles, ""); markers_[RoleIdx].fetch_add(inc, std::memory_order_release); } protected: std::array inTx_; std::array, NumRoles> markers_; // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2007/n2427.html#atomics.lockfree // static_assert( // decltype(markers_)::value_type::is_always_lock_free, // "Only lock-free atomics are supported"); }; /// /// Process' view of a ring buffer. /// This cannot reside in shared memory since it has pointers. /// template class RingBuffer final { public: RingBuffer() = default; RingBuffer(RingBufferHeader* header, uint8_t* data) : header_(header), data_(data) { TP_THROW_IF_NULLPTR(header_) << "Header cannot be nullptr"; TP_THROW_IF_NULLPTR(data_) << "Data cannot be nullptr"; } const RingBufferHeader& getHeader() const { return *header_; } RingBufferHeader& getHeader() { return *header_; } const uint8_t* getData() const { return data_; } uint8_t* getData() { return data_; } protected: RingBufferHeader* header_ = nullptr; uint8_t* data_ = nullptr; }; } // namespace tensorpipe ================================================ FILE: tensorpipe/common/ringbuffer_read_write_ops.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include #include #include namespace tensorpipe { // Reads happen only if the user supplied a callback (and optionally // a destination buffer). The callback is run from the event loop // thread upon receiving a notification from our peer. 
// // The memory pointer argument to the callback is valid only for the // duration of the callback. If the memory contents must be // preserved for longer, it must be copied elsewhere. // class RingbufferReadOperation { enum Mode { READ_LENGTH, READ_PAYLOAD, }; public: using read_callback_fn = std::function; // Read into a user-provided buffer of known length. inline RingbufferReadOperation(void* ptr, size_t len, read_callback_fn fn); // Read into an auto-allocated buffer, whose length is read from the wire. explicit inline RingbufferReadOperation(read_callback_fn fn); // Read into a user-provided libnop object, read length from the wire. inline RingbufferReadOperation( AbstractNopHolder* nopObject, read_callback_fn fn); // Processes a pending read. template inline size_t handleRead(RingBufferRole& inbox); bool completed() const { return (mode_ == READ_PAYLOAD && bytesRead_ == len_); } inline void handleError(const Error& error); private: Mode mode_{READ_LENGTH}; void* ptr_{nullptr}; AbstractNopHolder* nopObject_{nullptr}; std::unique_ptr buf_; size_t len_{0}; size_t bytesRead_{0}; read_callback_fn fn_; // Use a separare flag, rather than checking if ptr_ == nullptr, to catch the // case of a user explicitly passing in a nullptr with length zero, in which // case we must check that the length matches the header we see on the wire. const bool ptrProvided_; template inline ssize_t readNopObject(RingBufferRole& inbox); }; // Writes happen only if the user supplied a memory pointer, the // number of bytes to write, and a callback to execute upon // completion of the write. // // The memory pointed to by the pointer may only be reused or freed // after the callback has been called. // class RingbufferWriteOperation { enum Mode { WRITE_LENGTH, WRITE_PAYLOAD, }; public: using write_callback_fn = std::function; // Write from a user-provided buffer of known length. 
inline RingbufferWriteOperation( const void* ptr, size_t len, write_callback_fn fn); // Write from a user-provided libnop object. inline RingbufferWriteOperation( const AbstractNopHolder* nopObject, write_callback_fn fn); template inline size_t handleWrite(RingBufferRole& outbox); bool completed() const { return (mode_ == WRITE_PAYLOAD && bytesWritten_ == len_); } inline void handleError(const Error& error); private: Mode mode_{WRITE_LENGTH}; const void* ptr_{nullptr}; const AbstractNopHolder* nopObject_{nullptr}; size_t len_{0}; size_t bytesWritten_{0}; write_callback_fn fn_; template inline ssize_t writeNopObject(RingBufferRole& outbox); }; RingbufferReadOperation::RingbufferReadOperation( void* ptr, size_t len, read_callback_fn fn) : ptr_(ptr), len_(len), fn_(std::move(fn)), ptrProvided_(true) {} RingbufferReadOperation::RingbufferReadOperation(read_callback_fn fn) : fn_(std::move(fn)), ptrProvided_(false) {} RingbufferReadOperation::RingbufferReadOperation( AbstractNopHolder* nopObject, read_callback_fn fn) : nopObject_(nopObject), fn_(std::move(fn)), ptrProvided_(false) {} template size_t RingbufferReadOperation::handleRead( RingBufferRole& inbox) { ssize_t ret; size_t bytesReadNow = 0; // Start read transaction. This end of the connection is the only consumer for // this ringbuffer, and all reads are done from the reactor thread, so there // cannot be another transaction already going on. Fail hard in case. 
ret = inbox.startTx(); TP_THROW_SYSTEM_IF(ret < 0, -ret); if (mode_ == READ_LENGTH) { uint32_t length; ret = inbox.template readInTx( &length, sizeof(length)); if (likely(ret >= 0)) { mode_ = READ_PAYLOAD; bytesReadNow += ret; if (nopObject_ != nullptr) { len_ = length; } else if (ptrProvided_) { TP_DCHECK_EQ(length, len_); } else { len_ = length; buf_ = std::make_unique(len_); ptr_ = buf_.get(); } } else if (unlikely(ret != -ENODATA)) { TP_THROW_SYSTEM(-ret); } } if (mode_ == READ_PAYLOAD) { if (nopObject_ != nullptr) { ret = readNopObject(inbox); } else { ret = inbox.template readInTx( reinterpret_cast(ptr_) + bytesRead_, len_ - bytesRead_); } if (likely(ret >= 0)) { bytesRead_ += ret; bytesReadNow += ret; } else if (unlikely(ret != -ENODATA)) { TP_THROW_SYSTEM(-ret); } } ret = inbox.commitTx(); TP_THROW_SYSTEM_IF(ret < 0, -ret); if (completed()) { fn_(Error::kSuccess, ptr_, len_); } return bytesReadNow; } template ssize_t RingbufferReadOperation::readNopObject( RingBufferRole& inbox) { TP_THROW_ASSERT_IF(len_ > inbox.getSize()); ssize_t numBuffers; std::array::Buffer, 2> buffers; std::tie(numBuffers, buffers) = inbox.template accessContiguousInTx(len_); if (unlikely(numBuffers < 0)) { return numBuffers; } NopReader reader( buffers[0].ptr, buffers[0].len, buffers[1].ptr, buffers[1].len); nop::Status status = nopObject_->read(reader); if (status.error() == nop::ErrorStatus::ReadLimitReached) { return -ENODATA; } else if (status.has_error()) { return -EINVAL; } return len_; } void RingbufferReadOperation::handleError(const Error& error) { fn_(error, nullptr, 0); } RingbufferWriteOperation::RingbufferWriteOperation( const void* ptr, size_t len, write_callback_fn fn) : ptr_(ptr), len_(len), fn_(std::move(fn)) {} RingbufferWriteOperation::RingbufferWriteOperation( const AbstractNopHolder* nopObject, write_callback_fn fn) : nopObject_(nopObject), len_(nopObject_->getSize()), fn_(std::move(fn)) {} template size_t RingbufferWriteOperation::handleWrite( RingBufferRole& 
outbox) { ssize_t ret; size_t bytesWrittenNow = 0; // Start write transaction. This end of the connection is the only producer // for this ringbuffer, and all writes are done from the reactor thread, so // there cannot be another transaction already going on. Fail hard in case. ret = outbox.startTx(); TP_THROW_SYSTEM_IF(ret < 0, -ret); if (mode_ == WRITE_LENGTH) { uint32_t length = len_; ret = outbox.template writeInTx( &length, sizeof(length)); if (likely(ret >= 0)) { mode_ = WRITE_PAYLOAD; bytesWrittenNow += ret; } else if (unlikely(ret != -ENODATA)) { TP_THROW_SYSTEM(-ret); } } if (mode_ == WRITE_PAYLOAD) { if (nopObject_ != nullptr) { ret = writeNopObject(outbox); } else { ret = outbox.template writeInTx( reinterpret_cast(ptr_) + bytesWritten_, len_ - bytesWritten_); } if (likely(ret >= 0)) { bytesWritten_ += ret; bytesWrittenNow += ret; } else if (unlikely(ret != -ENODATA)) { TP_THROW_SYSTEM(-ret); } } ret = outbox.commitTx(); TP_THROW_SYSTEM_IF(ret < 0, -ret); if (completed()) { fn_(Error::kSuccess); } return bytesWrittenNow; } template ssize_t RingbufferWriteOperation::writeNopObject( RingBufferRole& outbox) { TP_THROW_ASSERT_IF(len_ > outbox.getSize()); ssize_t numBuffers; std::array::Buffer, 2> buffers; std::tie(numBuffers, buffers) = outbox.template accessContiguousInTx(len_); if (unlikely(numBuffers < 0)) { return numBuffers; } NopWriter writer( buffers[0].ptr, buffers[0].len, buffers[1].ptr, buffers[1].len); nop::Status status = nopObject_->write(writer); if (status.error() == nop::ErrorStatus::WriteLimitReached) { return -ENODATA; } else if (status.has_error()) { return -EINVAL; } return len_; } void RingbufferWriteOperation::handleError(const Error& error) { fn_(error); } } // namespace tensorpipe ================================================ FILE: tensorpipe/common/ringbuffer_role.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include namespace tensorpipe { /// /// Role of a RingBuffer. /// /// Provides methods to read and write data into a ringbuffer. /// template class RingBufferRole { public: static_assert(0 <= RoleIdx && RoleIdx < NumRoles, ""); RingBufferRole() = delete; explicit RingBufferRole(RingBuffer& rb) : header_{rb.getHeader()}, data_{rb.getData()} { TP_THROW_IF_NULLPTR(data_); } RingBufferRole(const RingBufferRole&) = delete; RingBufferRole(RingBufferRole&&) = delete; RingBufferRole& operator=(const RingBufferRole&) = delete; RingBufferRole& operator=(RingBufferRole&&) = delete; ~RingBufferRole() noexcept { TP_THROW_ASSERT_IF(inTx()); } size_t getSize() const { return header_.kDataPoolByteSize; } // // Transaction based API. // // Only one instance of a role can have an active transaction at any time. // *InTx* operations that fail do not cancel transaction. // bool inTx() const noexcept { return inTx_; } [[nodiscard]] ssize_t startTx() noexcept { if (unlikely(inTx())) { return -EBUSY; } if (header_.template beginTransaction()) { return -EAGAIN; } inTx_ = true; TP_DCHECK_EQ(txSize_, 0); return 0; } [[nodiscard]] ssize_t commitTx() noexcept { if (unlikely(!inTx())) { return -EINVAL; } header_.template incMarker(txSize_); txSize_ = 0; inTx_ = false; header_.template endTransaction(); return 0; } [[nodiscard]] ssize_t cancelTx() noexcept { if (unlikely(!inTx())) { return -EINVAL; } txSize_ = 0; inTx_ = false; header_.template endTransaction(); return 0; } struct Buffer { uint8_t* ptr{nullptr}; size_t len{0}; }; // The first item is negative in case of error, otherwise it contains how many // elements of the array are valid (0, 1 or 2). 
The elements are ptr+len pairs // of contiguous areas of the ringbuffer that, chained together, represent a // slice of the requested size (or less if not enough data is available, and // AllowPartial is set to true). template [[nodiscard]] std::pair> accessContiguousInTx( size_t size) noexcept { std::array result; if (unlikely(!inTx())) { return {-EINVAL, result}; } if (unlikely(size == 0)) { return {0, result}; } const uint64_t tail = header_.template readMarker(); const uint64_t head = header_.template readMarker<(RoleIdx + 1) % NumRoles>() + (RoleIdx + 1 == NumRoles ? header_.kDataPoolByteSize : 0); TP_DCHECK_LE(head - tail, header_.kDataPoolByteSize); const size_t avail = head - tail - txSize_; TP_DCHECK_GE(avail, 0); if (!AllowPartial && avail < size) { return {-ENODATA, result}; } if (avail == 0) { return {0, result}; } size = std::min(size, avail); const uint64_t start = (tail + txSize_) & header_.kDataModMask; const uint64_t end = (start + size) & header_.kDataModMask; txSize_ += size; // end == 0 is the same as end == bufferSize, in which case it doesn't wrap. const bool wrap = (start >= end && end > 0); if (likely(!wrap)) { result[0] = {.ptr = data_ + start, .len = size}; return {1, result}; } else { result[0] = { .ptr = data_ + start, .len = header_.kDataPoolByteSize - start}; result[1] = {.ptr = data_, .len = end}; return {2, result}; } } // Increment our marker without doing anything, i.e., "skip" over the data. [[nodiscard]] ssize_t incMarkerInTx(size_t size) { // We could implement this from scratch but we'd rather re-use the logic // from accessContiguous as it's easy to get it wrong. ssize_t ret; std::array buffers; std::tie(ret, buffers) = accessContiguousInTx(size); return ret; } // Copy data from the ringbuffer into the provided buffer, up to the given // size (only copy less data if AllowPartial is set to true). 
template [[nodiscard]] ssize_t readInTx(void* buffer, const size_t size) noexcept { ssize_t numBuffers; std::array buffers; std::tie(numBuffers, buffers) = accessContiguousInTx(size); if (unlikely(numBuffers < 0)) { return numBuffers; } if (unlikely(numBuffers == 0)) { // Nothing to do. return 0; } else if (likely(numBuffers == 1)) { std::memcpy(buffer, buffers[0].ptr, buffers[0].len); return buffers[0].len; } else if (likely(numBuffers == 2)) { std::memcpy(buffer, buffers[0].ptr, buffers[0].len); std::memcpy( reinterpret_cast(buffer) + buffers[0].len, buffers[1].ptr, buffers[1].len); return buffers[0].len + buffers[1].len; } else { TP_THROW_ASSERT() << "Bad number of buffers: " << numBuffers; // Dummy return to make the compiler happy. return -EINVAL; } } // Copy data from the provided buffer into the ringbuffer, up to the given // size (only copy less data if AllowPartial is set to true). template [[nodiscard]] ssize_t writeInTx( const void* buffer, const size_t size) noexcept { ssize_t numBuffers; std::array buffers; std::tie(numBuffers, buffers) = accessContiguousInTx(size); if (unlikely(numBuffers < 0)) { return numBuffers; } if (unlikely(numBuffers == 0)) { // Nothing to do. return 0; } else if (likely(numBuffers == 1)) { std::memcpy(buffers[0].ptr, buffer, buffers[0].len); return buffers[0].len; } else if (likely(numBuffers == 2)) { std::memcpy(buffers[0].ptr, buffer, buffers[0].len); std::memcpy( buffers[1].ptr, reinterpret_cast(buffer) + buffers[0].len, buffers[1].len); return buffers[0].len + buffers[1].len; } else { TP_THROW_ASSERT() << "Bad number of buffers: " << numBuffers; // Dummy return to make the compiler happy. return -EINVAL; } } // // High-level atomic operations. // // Copy data from the ringbuffer into the provided buffer, exactly the given // size. Take care of opening and closing the transaction. 
[[nodiscard]] ssize_t read(void* buffer, const size_t size) noexcept { auto ret = startTx(); if (0 > ret) { return ret; } ret = readInTx(buffer, size); if (0 > ret) { auto r = cancelTx(); TP_DCHECK_EQ(r, 0); return ret; } TP_DCHECK_EQ(ret, size); ret = commitTx(); TP_DCHECK_EQ(ret, 0); return size; } // Copy data from the provided buffer into the ringbuffer, exactly the given // size. Take care of opening and closing the transaction. [[nodiscard]] ssize_t write(const void* buffer, size_t size) noexcept { auto ret = startTx(); if (0 > ret) { return ret; } ret = writeInTx(buffer, size); if (0 > ret) { auto r = cancelTx(); TP_DCHECK_EQ(r, 0); return ret; } TP_DCHECK_EQ(ret, size); ret = commitTx(); TP_DCHECK_EQ(ret, 0); return size; } private: RingBufferHeader& header_; uint8_t* const data_; unsigned txSize_ = 0; bool inTx_{false}; }; } // namespace tensorpipe ================================================ FILE: tensorpipe/common/shm_ringbuffer.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include namespace tensorpipe { /// Creates ringbuffer on shared memory. /// /// is the minimum size of the data section of the RingBuffer. 
/// template std::tuple> createShmRingBuffer(size_t minRbByteSize) { Error error; ShmSegment headerSegment; RingBufferHeader* header; std::tie(error, headerSegment, header) = ShmSegment::create>(minRbByteSize); if (error) { return std::make_tuple( std::move(error), ShmSegment(), ShmSegment(), RingBuffer()); } ShmSegment dataSegment; uint8_t* data; std::tie(error, dataSegment, data) = ShmSegment::create(header->kDataPoolByteSize); if (error) { return std::make_tuple( std::move(error), ShmSegment(), ShmSegment(), RingBuffer()); } // Note: cannot use implicit construction from initializer list on GCC 5.5: // "converting to XYZ from initializer list would use explicit constructor". return std::make_tuple( Error::kSuccess, std::move(headerSegment), std::move(dataSegment), RingBuffer(header, data)); } template std::tuple> loadShmRingBuffer(Fd headerFd, Fd dataFd) { Error error; ShmSegment headerSegment; RingBufferHeader* header; std::tie(error, headerSegment, header) = ShmSegment::load>(std::move(headerFd)); if (error) { return std::make_tuple( std::move(error), ShmSegment(), ShmSegment(), RingBuffer()); } constexpr auto kHeaderSize = sizeof(RingBufferHeader); if (unlikely(kHeaderSize != headerSegment.getSize())) { TP_THROW_SYSTEM(EPERM) << "Header segment of unexpected size"; } ShmSegment dataSegment; uint8_t* data; std::tie(error, dataSegment, data) = ShmSegment::load(std::move(dataFd)); if (error) { return std::make_tuple( std::move(error), ShmSegment(), ShmSegment(), RingBuffer()); } if (unlikely(header->kDataPoolByteSize != dataSegment.getSize())) { TP_THROW_SYSTEM(EPERM) << "Data segment of unexpected size"; } return std::make_tuple( Error::kSuccess, std::move(headerSegment), std::move(dataSegment), RingBuffer(header, data)); } } // namespace tensorpipe ================================================ FILE: tensorpipe/common/shm_segment.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace { // Our goal is to obtain a file descriptor that is backed by a region of memory. // (We need an fd so we can pass it over a UNIX domain socket). We support two // ways of doing so: // - The memfd_create syscall, which does exactly what we need. Unfortunately // it was added in a recent-ish kernel and an even more recent glibc version. // - As a fallback for older systems, we open a file in the /dev/shm directory, // which we expect to be a mountpoint of tmpfs type. We open it with O_TMPFILE // so it remains unnamed, which won't appear in the directory and can't thus // be opened by other processes and will be automatically cleaned up when we // exit. This method has some issues, as it depends on the availability of // /dev/shm and is capped to the size of that mountpoint (rather than the // total memory of the system), which are especially problematic in Docker. // FIXME O_TMPFILE is also not that old, and some users have reported issues due // to it. We could add a third method as a further fallback. // Name to give to the memfds. This is just displayed when inspecting the file // descriptor in /proc/self/fd to aid debugging, and doesn't have to be unique. constexpr const char* kMemfdName = "tensorpipe_shm"; std::tuple createMemfd() { // We don't want to use the ::memfd_create function directly as it's harder to // detect its availability (we'd need to perform a feature check in CMake and // inject the result as a preprocessor flag) and because it would cause us to // link against glibc 2.27. PyTorch aims to support the manylinux2014 platform // (one of the standard platforms defined by Python for PyPI/pip), which has // glibc 2.17. 
Thus instead we issue the syscall directly, skipping the glibc // wrapper. #ifdef SYS_memfd_create // We want to pass the MFD_CLOEXEC flag, but we can't rely on glibc exposing // it, thus we redefine its value if needed. #ifndef MFD_CLOEXEC // https://github.com/torvalds/linux/blob/master/include/uapi/linux/memfd.h #define MFD_CLOEXEC 0x0001U #endif int fd = static_cast(::syscall( SYS_memfd_create, static_cast(kMemfdName), static_cast(MFD_CLOEXEC))); if (fd < 0) { return std::make_tuple( TP_CREATE_ERROR(SystemError, "memfd_create", errno), Fd()); } return std::make_tuple(Error::kSuccess, Fd(fd)); #else // SYS_memfd_create return std::make_tuple( TP_CREATE_ERROR(SystemError, "memfd_create", ENOSYS), Fd()); #endif // SYS_memfd_create } // Default base path for all segments created. constexpr const char* kBasePath = "/dev/shm"; std::tuple openTmpfileInDevShm() { // Some users are compiling on old pre-3.11 kernels. We'd like our backends to // only depend on runtime capabilities, and not on compile-time ones, hence we // "polyfill" the flag so the build will pass and we'll get a runtime error. 
#ifndef O_TMPFILE // https://github.com/torvalds/linux/blob/master/include/uapi/asm-generic/fcntl.h #define O_TMPFILE (020000000 | 00200000) #endif int flags = O_TMPFILE | O_EXCL | O_RDWR | O_CLOEXEC; int fd = ::open(kBasePath, flags, 0); if (fd < 0) { return std::make_tuple(TP_CREATE_ERROR(SystemError, "open", errno), Fd()); } return std::make_tuple(Error::kSuccess, Fd(fd)); } std::tuple createShmFd() { Error error; Fd fd; std::tie(error, fd) = createMemfd(); if (error && error.isOfType() && error.castToType()->errorCode() == ENOSYS) { std::tie(error, fd) = openTmpfileInDevShm(); } return std::make_tuple(std::move(error), std::move(fd)); } std::tuple mmapShmFd(int fd, size_t byteSize) { int flags = MAP_SHARED; int prot = PROT_READ | PROT_WRITE; return MmappedPtr::create(byteSize, prot, flags, fd); } } // namespace ShmSegment::ShmSegment(Fd fd, MmappedPtr ptr) : fd_(std::move(fd)), ptr_(std::move(ptr)) {} std::tuple ShmSegment::alloc(size_t byteSize) { Error error; Fd fd; std::tie(error, fd) = createShmFd(); if (error) { return std::make_tuple(std::move(error), ShmSegment()); } // grow size to contain byte_size bytes. off_t len = static_cast(byteSize); int ret = ::fallocate(fd.fd(), 0, 0, len); if (ret < 0) { return std::make_tuple( TP_CREATE_ERROR(SystemError, "fallocate", errno), ShmSegment()); } MmappedPtr ptr; std::tie(error, ptr) = mmapShmFd(fd.fd(), byteSize); if (error) { return std::make_tuple(std::move(error), ShmSegment()); } return std::make_tuple( Error::kSuccess, ShmSegment(std::move(fd), std::move(ptr))); } std::tuple ShmSegment::access(Fd fd) { // Load whole file. Use fstat to obtain size. 
// ShmSegment::access() (continued): fstat(2) supplies the size of an
// already-created segment, which is then mapped in full via mmapShmFd().
// --- Begin header tensorpipe/common/shm_segment.h (its #include targets were
// stripped by the extraction) ---
// ShmSegment owns an Fd plus an MmappedPtr (both released by their own
// destructors). create<T>() allocates a segment of sizeof(T) bytes and
// placement-news a T into it; T must be trivially copyable since the bytes
// are shared across processes.
struct stat sb; int ret = ::fstat(fd.fd(), &sb); if (ret < 0) { return std::make_tuple( TP_CREATE_ERROR(SystemError, "fstat", errno), ShmSegment()); } size_t byteSize = static_cast(sb.st_size); Error error; MmappedPtr ptr; std::tie(error, ptr) = mmapShmFd(fd.fd(), byteSize); if (error) { return std::make_tuple(std::move(error), ShmSegment()); } return std::make_tuple( Error::kSuccess, ShmSegment(std::move(fd), std::move(ptr))); } } // namespace tensorpipe ================================================ FILE: tensorpipe/common/shm_segment.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include #include // // A C++17 version of shared memory segments handler inspired on boost // interprocess. // namespace tensorpipe { class ShmSegment { ShmSegment(Fd fd, MmappedPtr ptr); public: ShmSegment() = default; static std::tuple alloc(size_t byteSize); static std::tuple access(Fd fd); /// Allocate shared memory to contain an object of type T and construct it. /// /// The Segment object owns the memory and frees it when destructed. /// The raw pointer to the object provides a view into the Segment but doesn't /// own it and may thus become invalid if the Segment isn't kept alive. template < typename T, typename... Args, std::enable_if_t::value, int> = 0> static std::tuple create(Args&&... args) { static_assert( std::is_trivially_copyable::value, "Shared memory segments are restricted to only store objects that " "are trivially copyable (i.e.
// create<T>() (continued): placement-new cannot legally return an address
// other than the one it was given, so the TP_THROW_SYSTEM_IF is a hard
// sanity check rather than an expected failure path.
// NOTE(review): the static_assert message literal is missing its closing ")"
// and the throw message misspells "aligment" — cosmetic; cannot be touched in
// a comments-only pass since both are runtime strings.
// The array overload value-initializes TScalar[numElements] in place; only
// one-dimensional unbounded arrays (T = U[]) are accepted, enforced by the
// std::is_same check on T vs TScalar[].
no pointers and no heap allocation"); const auto byteSize = sizeof(T); Error error; ShmSegment segment; std::tie(error, segment) = ShmSegment::alloc(byteSize); if (error) { return std::make_tuple(std::move(error), ShmSegment(), nullptr); } TP_DCHECK_EQ(segment.getSize(), byteSize); // Initialize in place. Forward T's constructor arguments. T* ptr = new (segment.getPtr()) T(std::forward(args)...); TP_THROW_SYSTEM_IF(ptr != segment.getPtr(), EPERM) << "new's address cannot be different from segment.getPtr() " << "address. Some aligment assumption was incorrect"; return std::make_tuple(Error::kSuccess, std::move(segment), ptr); } /// One-dimensional array version of create. // XXX: Fuse all versions of create. template < typename T, std::enable_if_t::value, int> = 0, typename TScalar = typename std::remove_all_extents::type> static std::tuple create(size_t numElements) { static_assert( std::is_same::value, "Only one-dimensional unbounded arrays are supported"); static_assert( std::is_trivially_copyable::value, "Shared memory segments are restricted to only store objects that " "are trivially copyable (i.e. no pointers and no heap allocation"); size_t byteSize = sizeof(TScalar) * numElements; Error error; ShmSegment segment; std::tie(error, segment) = ShmSegment::alloc(byteSize); if (error) { return std::make_tuple(std::move(error), ShmSegment(), nullptr); } TP_DCHECK_EQ(segment.getSize(), byteSize); // Initialize in place. TScalar* ptr = new (segment.getPtr()) TScalar[numElements](); TP_THROW_SYSTEM_IF(ptr != segment.getPtr(), EPERM) << "new's address cannot be different from segment.getPtr() " << "address. Some aligment assumption was incorrect"; return std::make_tuple(Error::kSuccess, std::move(segment), ptr); } /// Load an existing shared memory region that already holds an object of type /// T, where T is NOT an array type.
// load<T>() (non-array): maps an existing segment via access() and validates
// only that the file size equals sizeof(T); the error message itself warns
// that a size mismatch can come from a create/load race, i.e. loading before
// the creator finished sizing the file.
template ::value, int> = 0> static std::tuple load(Fd fd) { static_assert( std::is_trivially_copyable::value, "Shared memory segments are restricted to only store objects that " "are trivially copyable (i.e. no pointers and no heap allocation"); Error error; ShmSegment segment; std::tie(error, segment) = ShmSegment::access(std::move(fd)); if (error) { return std::make_tuple(std::move(error), ShmSegment(), nullptr); } const size_t size = segment.getSize(); // XXX: Do some checking other than the size that we are loading // the right type. TP_THROW_SYSTEM_IF(size != sizeof(T), EPERM) << "Shared memory file has unexpected size. " << "Got: " << size << " bytes, expected: " << sizeof(T) << ". " << "If there is a race between creation and loading of segments, " << "consider linking segment after it has been fully initialized."; auto ptr = static_cast(segment.getPtr()); return std::make_tuple(Error::kSuccess, std::move(segment), ptr); } /// Load an existing shared memory region that already holds an object of type /// T, where T is an array type. template < typename T, std::enable_if_t::value, int> = 0, typename TScalar = typename std::remove_all_extents::type> static std::tuple load(Fd fd) { static_assert( std::is_same::value, "Only one-dimensional unbounded arrays are supported"); static_assert( std::is_trivially_copyable::value, "Shared memory segments are restricted to only store objects that " "are trivially copyable (i.e.
// load<T[]>() (continued). NOTE(review): unlike the scalar overload, this
// array overload performs no size validation at all (not even that the mapped
// length is a multiple of sizeof(TScalar)) — verify this is intentional.
// Accessors: getFd() exposes the raw descriptor, getPtr() the mapped base,
// getSize() the mapped length (from MmappedPtr). The "mmmap'ed" typo is in
// the original member comment.
// --- Begin tensorpipe/common/socket.cc (includes stripped by extraction) ---
// SOCK_NONBLOCK is #define'd to 0 on platforms lacking it (per the comment
// below, macOS), turning it into a no-op flag in the socket(2) call.
no pointers and no heap allocation"); Error error; ShmSegment segment; std::tie(error, segment) = ShmSegment::access(std::move(fd)); if (error) { return std::make_tuple(std::move(error), ShmSegment(), nullptr); } auto ptr = static_cast(segment.getPtr()); return std::make_tuple(Error::kSuccess, std::move(segment), ptr); } int getFd() const { return fd_.fd(); } void* getPtr() { return ptr_.ptr(); } const void* getPtr() const { return ptr_.ptr(); } size_t getSize() const { return ptr_.getLength(); } private: // The file descriptor of the shared memory file. Fd fd_; // Base pointer of mmmap'ed shared memory segment. MmappedPtr ptr_; }; } // namespace tensorpipe ================================================ FILE: tensorpipe/common/socket.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include #include #ifndef SOCK_NONBLOCK #define SOCK_NONBLOCK 0 #endif // SOCK_NONBLOCK namespace tensorpipe { std::tuple Socket::createForFamily(sa_family_t aiFamily) { auto rv = socket(aiFamily, SOCK_STREAM | SOCK_NONBLOCK, 0); if (rv == -1) { return std::make_tuple( TP_CREATE_ERROR(SystemError, "socket", errno), Socket()); } Socket sock(rv); #ifndef SOCK_NONBLOCK // The SOCK_NONBLOCK option of socket() is Linux-only. On OSX, we need to // manually set the socket to non-blocking after its creation.
// createForFamily() (continued): the manual non-blocking fallback is guarded
// by "#ifndef SOCK_NONBLOCK". NOTE(review): SOCK_NONBLOCK was unconditionally
// #define'd (possibly to 0) just above, so as written this branch can never
// be compiled — and "sock->block" on a non-pointer Socket also looks wrong
// (likely an extraction artifact or latent bug); verify against upstream.
// block(on): read-modify-write of the file status flags via fcntl(2)
// F_GETFL/F_SETFL. Note the inversion: block(false) SETS O_NONBLOCK,
// block(true) clears it.
// reuseAddr()/bind()/listen(): thin errno-to-Error wrappers over the
// corresponding syscalls.
// accept(): retries on EINTR; any other failure is surfaced as an Error.
// connect(): retries on EINTR and treats EINPROGRESS as success — the
// expected outcome when connecting a non-blocking socket.
auto err = sock->block(false); if (err) { return std::make_tuple(err, Socket()); } #endif // SOCK_NONBLOCK return std::make_tuple(Error::kSuccess, std::move(sock)); } Error Socket::block(bool on) { int rv; rv = fcntl(fd_, F_GETFL); if (rv == -1) { return TP_CREATE_ERROR(SystemError, "fcntl", errno); } if (!on) { // Set O_NONBLOCK rv |= O_NONBLOCK; } else { // Clear O_NONBLOCK rv &= ~O_NONBLOCK; } rv = fcntl(fd_, F_SETFL, rv); if (rv == -1) { return TP_CREATE_ERROR(SystemError, "fcntl", errno); } return Error::kSuccess; } Error Socket::reuseAddr(bool on) { int onInt = on ? 1 : 0; auto rv = setsockopt(fd_, SOL_SOCKET, SO_REUSEADDR, &onInt, sizeof(onInt)); if (rv == -1) { return TP_CREATE_ERROR(SystemError, "setsockopt", errno); } return Error::kSuccess; } Error Socket::bind(const Sockaddr& addr) { auto rv = ::bind(fd_, addr.addr(), addr.addrlen()); if (rv == -1) { return TP_CREATE_ERROR(SystemError, "bind", errno); } return Error::kSuccess; } Error Socket::listen(int backlog) { auto rv = ::listen(fd_, backlog); if (rv == -1) { return TP_CREATE_ERROR(SystemError, "listen", errno); } return Error::kSuccess; } std::tuple Socket::accept() { struct sockaddr_storage addr; socklen_t addrlen = sizeof(addr); int rv = -1; for (;;) { rv = ::accept(fd_, (struct sockaddr*)&addr, &addrlen); if (rv == -1) { if (errno == EINTR) { continue; } return std::make_tuple( TP_CREATE_ERROR(SystemError, "accept", errno), Socket()); } break; } return std::make_tuple(Error::kSuccess, Socket(rv)); } Error Socket::connect(const Sockaddr& addr) { for (;;) { auto rv = ::connect(fd_, addr.addr(), addr.addrlen()); if (rv == -1) { if (errno == EINTR) { continue; } if (errno != EINPROGRESS) { return TP_CREATE_ERROR(SystemError, "connect", errno); } } break; } return Error::kSuccess; } std::tuple Socket::getSockName() const { struct sockaddr_storage addr; socklen_t addrlen = sizeof(addr); int rv = ::getsockname(fd_, reinterpret_cast(&addr), &addrlen); if (rv < 0) { return std::make_tuple(
// getSockName() (continued): returns the local address as a sockaddr_storage
// plus its actual length.
// --- Begin tensorpipe/common/socket.h (includes stripped by extraction) ---
// NOTE(review): the saveOneFdToArray/loadFdsFromArray helpers live in an
// anonymous namespace inside a header, giving every translation unit its own
// copy — ODR-safe but against common guidance (consider `inline` instead).
// These helpers marshal a parameter pack of int/Fd descriptors into/out of a
// raw int array, using the brace-init-list "dummy" trick for pack expansion.
// sendToSocket(): sends two fixed-size payload values plus any number of file
// descriptors over a Unix socket as an SCM_RIGHTS ancillary message.
TP_CREATE_ERROR(SystemError, "getsockname", errno), addr, addrlen); } return std::make_tuple(Error::kSuccess, addr, addrlen); } } // namespace tensorpipe ================================================ FILE: tensorpipe/common/socket.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace { void saveOneFdToArray(int& dst, const int& src) { dst = src; } void saveOneFdToArray(int& dst, const Fd& src) { dst = src.fd(); } template void saveFdsToArray( int* array, std::index_sequence /*unused*/, const Fds&... fds) { // This is a trick to do pack expansion of the function call. auto dummy = {(saveOneFdToArray(array[Idxs], fds), 0)...}; } void loadOneFdFromArray(int& src, int& dst) { dst = src; } void loadOneFdFromArray(int& src, Fd& dst) { dst = Fd(src); } template void loadFdsFromArray( int* array, std::index_sequence /*unused*/, Fds&... fds) { // This is a trick to do pack expansion of the function call. auto dummy = {(loadOneFdFromArray(array[Idxs], fds), 0)...}; } } // namespace template [[nodiscard]] Error sendToSocket( int socketFd, const T& t1, const T& t2, const Fds&... fds) { using TPayload = int; // Build message. struct msghdr msg; msg.msg_name = nullptr; msg.msg_namelen = 0; msg.msg_flags = 0; // Build iov to write Ts. std::array tbuf = {t1, t2}; struct iovec iov; iov.iov_base = tbuf.data(); iov.iov_len = sizeof(tbuf); msg.msg_iov = &iov; msg.msg_iovlen = sizeof(iov) / sizeof(iovec); // Build control message.
// sendToSocket() (continued): the descriptors are packed as ints into
// CMSG_DATA of a single SOL_SOCKET/SCM_RIGHTS control message. sendmsg(2) is
// retried on EINTR; a partial write is surfaced as ShortWriteError rather
// than resumed (the payload is two small fixed-size values, so a short write
// is treated as exceptional).
// NOTE(review): "rv != iov.iov_len" compares ssize_t against size_t — benign
// here because rv >= 0 past the -1 check, but it will trip -Wsign-compare.
// recvFromSocket(): mirror image — recvmsg(2) with EINTR retry, short reads
// surfaced as ShortReadError, then the two payload values are copied out of
// the stack buffer.
std::array buf; msg.msg_control = buf.data(); msg.msg_controllen = buf.size(); struct cmsghdr* cmsg; cmsg = CMSG_FIRSTHDR(&msg); cmsg->cmsg_level = SOL_SOCKET; cmsg->cmsg_type = SCM_RIGHTS; cmsg->cmsg_len = CMSG_LEN(sizeof(TPayload) * sizeof...(Fds)); auto payload = reinterpret_cast(CMSG_DATA(cmsg)); saveFdsToArray(payload, std::index_sequence_for{}, fds...); // Send message. for (;;) { auto rv = ::sendmsg(socketFd, &msg, 0); if (rv == -1) { if (errno == EINTR) { continue; } return TP_CREATE_ERROR(SystemError, "sendmsg", errno); } if (rv != iov.iov_len) { return TP_CREATE_ERROR(ShortWriteError, iov.iov_len, rv); } break; } return Error::kSuccess; } template [[nodiscard]] Error sendFdsToSocket(int socketFd, const Fds&... fds) { char dummy = 0; return sendToSocket(socketFd, dummy, dummy, fds...); } template [[nodiscard]] Error recvFromSocket(int socketFd, T& t1, T& t2, Fds&... fds) { using TPayload = int; // Build message. struct msghdr msg; msg.msg_name = nullptr; msg.msg_namelen = 0; msg.msg_flags = 0; // Build iov to read Ts. std::array tbuf; struct iovec iov; iov.iov_base = tbuf.data(); iov.iov_len = sizeof(tbuf); msg.msg_iov = &iov; msg.msg_iovlen = sizeof(iov) / sizeof(iovec); // Build control message. std::array buf; msg.msg_control = buf.data(); msg.msg_controllen = buf.size(); // Receive message. for (;;) { auto rv = ::recvmsg(socketFd, &msg, 0); if (rv == -1) { if (errno == EINTR) { continue; } return TP_CREATE_ERROR(SystemError, "recvmsg", errno); } if (rv != iov.iov_len) { return TP_CREATE_ERROR(ShortReadError, iov.iov_len, rv); } break; } t1 = tbuf[0]; t2 = tbuf[1]; // Read control message.
// recvFromSocket() (continued): the received SCM_RIGHTS control message is
// validated with debug-only TP_DCHECK_* assertions (level, type, length)
// before the descriptors are unpacked into the caller's int/Fd out-params.
// Sockaddr: abstract interface over concrete sockaddr wrappers (addr/addrlen).
// Socket: RAII socket handle (fd ownership inherited from Fd); every
// operation returns an Error value instead of throwing, and [[nodiscard]]
// forces callers to inspect it. sendPayloadAndFds/recvPayloadAndFds are
// SFINAE-restricted on T (the enable_if condition's angle-bracket contents
// were stripped by the extraction).
struct cmsghdr* cmsg; cmsg = CMSG_FIRSTHDR(&msg); TP_DCHECK_NE(cmsg, static_cast(nullptr)); TP_DCHECK_EQ(cmsg->cmsg_level, SOL_SOCKET); TP_DCHECK_EQ(cmsg->cmsg_type, SCM_RIGHTS); TP_DCHECK_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(TPayload) * sizeof...(Fds))); auto payload = reinterpret_cast(CMSG_DATA(cmsg)); loadFdsFromArray(payload, std::index_sequence_for{}, fds...); return Error::kSuccess; } template [[nodiscard]] Error recvFdsFromSocket(int socketFd, Fds&... fds) { char dummy = 0; return recvFromSocket(socketFd, dummy, dummy, fds...); } class Sockaddr { public: virtual const struct sockaddr* addr() const = 0; virtual socklen_t addrlen() const = 0; virtual ~Sockaddr() = default; }; class Socket final : public Fd { public: [[nodiscard]] static std::tuple createForFamily( sa_family_t aiFamily); Socket() = default; explicit Socket(int fd) : Fd(fd) {} // Configure if the socket is blocking or not. [[nodiscard]] Error block(bool on); // Set (or unset) the SO_REUSEADDR option on the socket. [[nodiscard]] Error reuseAddr(bool on); // Bind socket to address. [[nodiscard]] Error bind(const Sockaddr& addr); // Listen on socket. [[nodiscard]] Error listen(int backlog); // Accept new socket connecting to listening socket. [[nodiscard]] std::tuple accept(); // Connect to address. [[nodiscard]] Error connect(const Sockaddr& addr); [[nodiscard]] std::tuple getSockName() const; // Send file descriptor. template [[nodiscard]] Error sendFds(const Fds&... fds) { return sendFdsToSocket(fd_, fds...); } // Receive file descriptor. template [[nodiscard]] Error recvFds(Fds&... fds) { return recvFdsFromSocket(fd_, fds...); } // Send object and file descriptor. template < typename T, typename... Fds, typename std::enable_if::value, bool>:: type = false> [[nodiscard]] Error sendPayloadAndFds( const T& t1, const T& t2, const Fds&... fds) { return sendToSocket(fd_, t1, t2, fds...); } // Receive object and file descriptor. template < typename T, typename...
// Socket::recvPayloadAndFds (continued) and end of socket.h.
// --- Begin tensorpipe/common/state_machine.h ---
// OpsStateMachine tracks in-flight operations (TOp) for an owning subject
// (TSubject) in a deque ordered by monotonically increasing sequence numbers.
// Iter is an opaque handle wrapping a raw TOp*; its constructor is private so
// only the state machine can mint one.
// emplaceBack(): appends a new op and stamps its sequence number.
// advanceOperation(): after one op advances, keeps trying the following
// sequence numbers, because completing an op can unblock successors that were
// previously forbidden from overtaking it (see advanceOneOperation below).
Fds, typename std::enable_if::value, bool>:: type = false> [[nodiscard]] Error recvPayloadAndFds(T& t1, T& t2, Fds&... fds) { return recvFromSocket(fd_, t1, t2, fds...); } }; } // namespace tensorpipe ================================================ FILE: tensorpipe/common/state_machine.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include namespace tensorpipe { template class OpsStateMachine { public: class Iter { public: TOp& operator*() const { return *opPtr_; } TOp* operator->() const { return opPtr_; } private: explicit Iter(TOp* opPtr) : opPtr_(opPtr) {} TOp* opPtr_{nullptr}; friend OpsStateMachine; }; using Transitioner = void (TSubject::*)(Iter, typename TOp::State); OpsStateMachine(TSubject& subject, Transitioner transitioner) : subject_(subject), transitioner_(transitioner) {} template Iter emplaceBack(uint64_t sequenceNumber, TArgs&&... args) { ops_.emplace_back(std::forward(args)...); TOp& op = ops_.back(); op.sequenceNumber = sequenceNumber; return Iter(&op); } void advanceOperation(Iter initialOpIter) { // Advancing one operation may unblock later ones that could have progressed // but were prevented from overtaking. Thus each time an operation manages // to advance we'll try to also advance the one after. for (int64_t sequenceNumber = initialOpIter->sequenceNumber;; ++sequenceNumber) { TOp* opPtr = findOperation(sequenceNumber); if (opPtr == nullptr || opPtr->state == TOp::FINISHED || !advanceOneOperation(*opPtr)) { break; } } } void advanceAllOperations() { // We cannot just iterate over the operations here as advanceOneOperation // could potentially erase some of them, thus invalidating references and/or // iterators.
// advanceAllOperations() (continued): walks ops by sequence number rather
// than by iterator, since advanceOneOperation() may pop elements from the
// deque and invalidate iterators (per the comment above).
// attemptTransition(): guarded state change — runs the given member-function
// actions on the subject and moves the op from 'from' to 'to' only when the
// op is currently in 'from' and cond holds.
// findOperation(): O(1) lookup exploiting that ops_ is contiguous and ordered
// by consecutive sequence numbers: index = seqNum - front().seqNum, verified
// by the TP_DCHECK_EQ. NOTE(review): "offset >= ops_.size()" mixes int64_t
// with size_t — safe given the preceding offset < 0 check, but will warn.
// advanceOneOperation(): an op may never reach a more advanced state than its
// predecessor (no overtaking); the predecessor's state — or FINISHED when
// there is none — is handed to the subject's transitioner, which performs the
// actual attemptTransition() calls.
if (ops_.empty()) { return; } for (int64_t sequenceNumber = ops_.front().sequenceNumber;; ++sequenceNumber) { TOp* opPtr = findOperation(sequenceNumber); if (opPtr == nullptr) { break; } advanceOneOperation(*opPtr); } } void attemptTransition( Iter opIter, typename TOp::State from, typename TOp::State to, bool cond, std::initializer_list actions) { if (opIter->state == from && cond) { for (const auto& action : actions) { (subject_.*action)(opIter); } opIter->state = to; } } private: TOp* findOperation(int64_t sequenceNumber) { if (ops_.empty()) { return nullptr; } int64_t offset = sequenceNumber - ops_.front().sequenceNumber; if (offset < 0 || offset >= ops_.size()) { return nullptr; } TOp& op = ops_[offset]; TP_DCHECK_EQ(op.sequenceNumber, sequenceNumber); return &op; } bool advanceOneOperation(TOp& op) { // Due to the check in attemptTransition, each time that an operation // advances its state we must check whether this unblocks some later // operations that could progress but weren't allowed to overtake. In order // to detect whether this operation is advancing we store its state at the // beginning and then compare it with the state at the end. typename TOp::State initialState = op.state; // The operations must advance in order: later operations cannot "overtake" // earlier ones. Thus if this operation would reach a more advanced state // than previous operation we won't perform the transition. TOp* prevOpPtr = findOperation(op.sequenceNumber - 1); typename TOp::State prevOpState = prevOpPtr != nullptr ? prevOpPtr->state : TOp::FINISHED; (subject_.*transitioner_)(Iter(&op), prevOpState); // Compute return value now in case we next delete the operation. bool hasAdvanced = op.state != initialState; if (op.state == TOp::FINISHED) { // We can't remove the op if it's "in the middle". And, therefore, once we // remove the op at the front, we must check if other ops now also get // "unblocked". In other words, we always remove as much as we can from // the front.
// advanceOneOperation() (continued): FINISHED ops are reclaimed only from the
// front of the deque, preserving the "consecutive sequence numbers" invariant
// that findOperation()'s index arithmetic relies on.
// --- Begin tensorpipe/common/stream_read_write_ops.h ---
// StreamReadOperation: state for reading one length-prefixed chunk from a
// stream; per the original comment, the caller either pre-allocates (length
// known up front) or lets the op heap-allocate once the word-sized length
// header has been read.
while (!ops_.empty() && ops_.front().state == TOp::FINISHED) { ops_.pop_front(); } } return hasAdvanced; } TSubject& subject_; const Transitioner transitioner_; std::deque ops_; }; } // namespace tensorpipe ================================================ FILE: tensorpipe/common/stream_read_write_ops.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include #include namespace tensorpipe { // The read operation captures all state associated with reading a // fixed length chunk of data from the underlying connection. All // reads are required to include a word-sized header containing the // number of bytes in the operation. This makes it possible for the // read side of the connection to either 1) not know how many bytes // to expected, and dynamically allocate, or 2) know how many bytes // to expect, and preallocate the destination memory. class StreamReadOperation { enum Mode { READ_LENGTH, READ_PAYLOAD, COMPLETE, }; public: using read_callback_fn = std::function; explicit inline StreamReadOperation(read_callback_fn fn); inline StreamReadOperation(void* ptr, size_t length, read_callback_fn fn); // Called when a buffer is needed to read data from stream. inline void allocFromLoop(char** base, size_t* len); // Called when data has been read from stream. inline void readFromLoop(size_t nread); // Returns if this read operation is complete. inline bool completeFromLoop() const; // Invoke user callback. inline void callbackFromLoop(const Error& error); private: Mode mode_{READ_LENGTH}; char* ptr_{nullptr}; // Number of bytes as specified by the user (if applicable). optional givenLength_; // Number of bytes to expect as read from the connection.
// Inline definitions: allocFromLoop() hands the event loop the next
// (base, len) window to fill — first the bytes of readLength_ itself (the
// header is read directly into the member), then the payload buffer.
// readFromLoop() drives READ_LENGTH -> READ_PAYLOAD -> COMPLETE: once the
// header is fully read, a pre-supplied length is cross-checked against it
// (DCHECK), or a temporary buffer of readLength_ bytes is allocated and owned
// by the op; a zero-length payload jumps straight to COMPLETE; bytesRead_ is
// reset to 0 when entering READ_PAYLOAD.
// callbackFromLoop() reports (error, ptr, length) to the user callback.
size_t readLength_{0}; // Number of bytes read from the connection. // This is reset to 0 when we advance from READ_LENGTH to READ_PAYLOAD. size_t bytesRead_{0}; // Holds temporary allocation if no length was specified. std::unique_ptr buffer_{nullptr}; // User callback. read_callback_fn fn_; }; StreamReadOperation::StreamReadOperation(read_callback_fn fn) : fn_(std::move(fn)) {} StreamReadOperation::StreamReadOperation( void* ptr, size_t length, read_callback_fn fn) : ptr_(static_cast(ptr)), givenLength_(length), fn_(std::move(fn)) {} void StreamReadOperation::allocFromLoop(char** base, size_t* len) { if (mode_ == READ_LENGTH) { TP_DCHECK_LT(bytesRead_, sizeof(readLength_)); *base = reinterpret_cast(&readLength_) + bytesRead_; *len = sizeof(readLength_) - bytesRead_; } else if (mode_ == READ_PAYLOAD) { TP_DCHECK_LT(bytesRead_, readLength_); TP_DCHECK(ptr_ != nullptr); *base = ptr_ + bytesRead_; *len = readLength_ - bytesRead_; } else { TP_THROW_ASSERT() << "invalid mode " << mode_; } } void StreamReadOperation::readFromLoop(size_t nread) { bytesRead_ += nread; if (mode_ == READ_LENGTH) { TP_DCHECK_LE(bytesRead_, sizeof(readLength_)); if (bytesRead_ == sizeof(readLength_)) { if (givenLength_.has_value()) { TP_DCHECK(ptr_ != nullptr || givenLength_.value() == 0); TP_DCHECK_EQ(readLength_, givenLength_.value()); } else { TP_DCHECK(ptr_ == nullptr); buffer_ = std::make_unique(readLength_); ptr_ = buffer_.get(); } if (readLength_ == 0) { mode_ = COMPLETE; } else { mode_ = READ_PAYLOAD; } bytesRead_ = 0; } } else if (mode_ == READ_PAYLOAD) { TP_DCHECK_LE(bytesRead_, readLength_); if (bytesRead_ == readLength_) { mode_ = COMPLETE; } } else { TP_THROW_ASSERT() << "invalid mode " << mode_; } } bool StreamReadOperation::completeFromLoop() const { return mode_ == COMPLETE; } void StreamReadOperation::callbackFromLoop(const Error& error) { fn_(error, ptr_, readLength_); } // The write operation captures all state associated with writing a // fixed length chunk of data from
// StreamWriteOperation: writes a word-sized length header followed by the
// payload. The header buffer points at the length_ member itself, so (per the
// original comment) the instance must stay alive until the write callback
// fires. getBufs() returns one buffer when length_ == 0 (header only), else
// two (header + payload). The const_casts only strip const for the iovec-like
// Buf struct; the data is never modified through them.
the underlying connection. The // write includes a word-sized header containing the length of the // write. This header is a member field on this class and therefore // the instance must be kept alive and the reference to the instance // must remain valid until the write callback has been called. class StreamWriteOperation { public: using write_callback_fn = std::function; inline StreamWriteOperation( const void* ptr, size_t length, write_callback_fn fn); struct Buf { char* base; size_t len; }; inline std::tuple getBufs(); // Invoke user callback. inline void callbackFromLoop(const Error& error); private: const char* ptr_; const size_t length_; // Buffers (structs with pointers and lengths) to write to stream. std::array bufs_; // User callback. write_callback_fn fn_; }; StreamWriteOperation::StreamWriteOperation( const void* ptr, size_t length, write_callback_fn fn) : ptr_(static_cast(ptr)), length_(length), fn_(std::move(fn)) { bufs_[0].base = const_cast(reinterpret_cast(&length_)); bufs_[0].len = sizeof(length_); bufs_[1].base = const_cast(ptr_); bufs_[1].len = length_; } std::tuple StreamWriteOperation::getBufs() { size_t numBuffers = length_ == 0 ? 1 : 2; return std::make_tuple(bufs_.data(), numBuffers); } void StreamWriteOperation::callbackFromLoop(const Error& error) { fn_(error); } } // namespace tensorpipe ================================================ FILE: tensorpipe/common/strings.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree.
// --- tensorpipe/common/strings.h ---
// joinStrs(): ", "-joins a vector of strings via an ostringstream.
// formatMatrix(): renders a 2-D vector as "{{a, b}, {c, d}}".
// isValidUuid(): strict positional check of the 8-4-4-4-12 lowercase-hex UUID
// layout (dashes at indices 8/13/18/23, hex digits elsewhere).
// NOTE(review): its loop index is a signed int compared against uuid.size()
// (size_t) — harmless at 36 chars but will trip -Wsign-compare.
// --- Begin tensorpipe/common/system.cc (includes stripped by extraction);
// the trailing comment explains the hand-rolled capget declaration. ---
*/ #pragma once #include #include #include namespace tensorpipe { inline std::string joinStrs(const std::vector& strs) { if (strs.empty()) { return ""; } std::ostringstream oss; oss << strs[0]; for (size_t idx = 1; idx < strs.size(); idx++) { oss << ", " << strs[idx]; } return oss.str(); } template std::string formatMatrix(const std::vector>& matrix) { std::ostringstream oss; oss << "{"; for (size_t rowIdx = 0; rowIdx < matrix.size(); rowIdx++) { if (rowIdx > 0) { oss << ", "; } oss << "{"; for (size_t colIdx = 0; colIdx < matrix[rowIdx].size(); colIdx++) { if (colIdx > 0) { oss << ", "; } oss << matrix[rowIdx][colIdx]; } oss << "}"; } oss << "}"; return oss.str(); } // Since text manipulation is hard, let's use this to double-check our results. inline bool isValidUuid(const std::string& uuid) { // Check it's in this format: // aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee // |0 |5 |10 |15 |20 |25 |30 |35 if (uuid.size() != 36) { return false; } for (int i = 0; i < uuid.size(); i++) { if (i == 8 || i == 13 || i == 18 || i == 23) { if (uuid[i] != '-') { return false; } } else { if (!((uuid[i] >= '0' && uuid[i] <= '9') || (uuid[i] >= 'a' && uuid[i] <= 'f'))) { return false; } } } return true; } } // namespace tensorpipe ================================================ FILE: tensorpipe/common/system.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #ifdef __linux__ #include #include #include #include #include #endif #ifdef __APPLE__ #include #endif #include #include #include #include #include #include #include #include #ifdef __linux__ // This is a libc wrapper for the Linux syscall. // I'm not sure why we need to declare it ourselves, but that's what libcap // does too, and I couldn't find any libc header in which it's declared.
// capget(2) is declared by hand (rationale in the surrounding comment): it
// avoids a libcap dependency and libcap's compile-time-frozen capability
// list, at the cost of using a discouraged raw interface.
// getBootIDInternal(): macOS — queries the IOKit platform UUID from the
// registry root; Linux — reads the first line of
// /proc/sys/kernel/random/boot_id (nullopt when unreadable).
// getPathForLinuxNamespace(): maps the LinuxNamespace enum to
// /proc/self/ns/{ipc,net,pid,user}; asserts on unknown values.
// Direct use of the syscall is strongly discouraged, in favor of libcap (which // has a more friendly API and better backwards-compatibility). However we // really don't want to add a dependency, and moreover libcap introduces an // artificial limitation that only allows us to query the capabilities that were // defined by the kernel headers when libcap was built, meaning we might miss // some (new) capabilities if the kernel was updated in the meantime. extern "C" { extern int capget(cap_user_header_t header, const cap_user_data_t data); } #endif namespace tensorpipe { namespace { #ifdef __APPLE__ optional getBootIDInternal() { std::array buf; // See https://developer.apple.com/documentation/iokit/iokitlib_h for IOKitLib // API documentation. io_registry_entry_t ioRegistryRoot = IORegistryEntryFromPath(kIOMainPortDefault, "IOService:/"); CFStringRef uuidCf = (CFStringRef)IORegistryEntryCreateCFProperty( ioRegistryRoot, CFSTR(kIOPlatformUUIDKey), kCFAllocatorDefault, 0); IOObjectRelease(ioRegistryRoot); CFStringGetCString(uuidCf, buf.data(), buf.size(), kCFStringEncodingMacRoman); CFRelease(uuidCf); return std::string(buf.data()); } #elif defined(__linux__) optional getBootIDInternal() { std::ifstream f{"/proc/sys/kernel/random/boot_id"}; if (!f.is_open()) { return nullopt; } std::string v; getline(f, v); f.close(); return v; } // See namespaces(7). std::string getPathForLinuxNamespace(LinuxNamespace ns) { std::ostringstream oss; oss << "/proc/self/ns/"; switch (ns) { case LinuxNamespace::kIpc: oss << "ipc"; break; case LinuxNamespace::kNet: oss << "net"; break; case LinuxNamespace::kPid: oss << "pid"; break; case LinuxNamespace::kUser: oss << "user"; break; default: TP_THROW_ASSERT() << "Unknown namespace"; } return oss.str(); } #endif } // namespace std::string tstampToStr(TimeStamp ts) { if (ts == kInvalidTimeStamp) { return "NA"; } // print timestaps in microseconds.
// tstampToStr() (continued): prints ts/1000 with a 3-digit ts%1000 fraction
// and a "us" suffix — presumably a nanosecond TimeStamp rendered as
// microseconds; confirm against TimeStamp's definition (not visible here).
// getProcFsStr(): first line of /proc/<tid>/<fileName>, nullopt if the file
// can't be opened. removeBlankSpaces(): erase-remove of all whitespace.
// getBootID(): caches the boot id in a function-local static (computed once,
// thread-safe init).
// getLinuxNamespaceId(): on macOS all processes are treated as sharing one
// namespace, so a constant empty-string id is returned (per the comment); the
// Linux version fingerprints the /proc/self/ns/* link target, continued on
// the next line.
constexpr TimeStamp kDiv = 1000u; std::stringstream ss; ss << std::setw(9) << std::setfill(' ') << ts / kDiv; ss << "." << std::setw(3) << std::setfill('0') << ts % kDiv << "us"; return ss.str(); } optional getProcFsStr(const std::string& fileName, pid_t tid) { std::ostringstream oss; oss << "/proc/" << tid << "/" << fileName; std::ifstream f{oss.str()}; if (!f.is_open()) { return nullopt; } std::string v; getline(f, v); f.close(); return v; } std::string removeBlankSpaces(std::string s) { // Remove blanks. s.erase( std::remove_if( s.begin(), s.end(), [](unsigned char c) { return std::isspace(c); }), s.end()); return s; } optional getBootID() { static optional bootID = getBootIDInternal(); return bootID; } #ifdef __APPLE__ // OSX is a UNIX, so often we'd like some of our Linux backends to work there // too, but its lack of support for namespaces poses issues. However, that's // like saying that in OSX all processes are in the same namespace with respect // to all resources, so we pretend namespaces are supported, with a constant ID. optional getLinuxNamespaceId(LinuxNamespace ns) { return std::string(); } #elif defined(__linux__) // According to namespaces(7): // > Each process has a /proc/[pid]/ns/ subdirectory containing one entry for // > each namespace [...]. If two processes are in the same namespace, then the // > device IDs and inode numbers of their /proc/[pid]/ns/xxx symbolic links // > will be the same; an application can check this using the stat.st_dev and // > stat.st_ino fields returned by stat(2). optional getLinuxNamespaceId(LinuxNamespace ns) { struct stat statInfo; std::string procfsNamespacePath = getPathForLinuxNamespace(ns); // First use lstat to stat the link itself, to ensure it's indeed a link.
int rv = ::lstat(procfsNamespacePath.c_str(), &statInfo);
  if (rv < 0 && errno == ENOENT) {
    // These files were first provided in Linux 3.0 (although some of them came
    // later), however namespaces already existed before then, hence the only
    // safe thing to do is assume all processes are in different namespaces.
    return nullopt;
  }
  // Other errors, like access/permission ones, are unexpected.
  TP_THROW_SYSTEM_IF(rv < 0, errno);
  // Between Linux 3.0 and 3.7 these files were hard links. In Linux 3.8 they
  // became symlinks and only then it became possible to identify namespaces
  // through these files' inode numbers.
  if (!S_ISLNK(statInfo.st_mode)) {
    return nullopt;
  }
  // Then stat the "file" the link points to, as it's its inode we care about.
  rv = ::stat(procfsNamespacePath.c_str(), &statInfo);
  TP_THROW_SYSTEM_IF(rv < 0, errno);
  // These fields are of types dev_t and ino_t, which I couldn't find described
  // anywhere. They appear to be unsigned longs, but all we care about is that
  // they are integers, so let's check that.
  // NOTE(review): the static_assert template arguments (presumably dev_t and
  // ino_t, per the comment above) were lost in extraction -- restore from
  // upstream.
  static_assert(std::is_integral::value, "");
  static_assert(std::is_integral::value, "");
  std::ostringstream oss;
  // Combine device ID and inode into a single machine-local identifier.
  oss << std::hex << statInfo.st_dev << '_' << statInfo.st_ino;
  return oss.str();
}

// According to https://www.kernel.org/doc/Documentation/security/LSM.txt:
// > A list of the active security modules can be found by reading
// > /sys/kernel/security/lsm. This is a comma separated list [...].
optional> getLinuxSecurityModules() {
  std::ifstream f{"/sys/kernel/security/lsm"};
  if (f.fail()) {
    return nullopt;
  }
  // We shouldn't have to worry about an entirely empty file, as according to
  // the doc "[this list] will always include the capability module".
  std::vector res;
  while (!f.eof()) {
    std::string lsm;
    std::getline(f, lsm, ',');
    TP_THROW_ASSERT_IF(f.fail());
    res.push_back(std::move(lsm));
  }
  f.close();
  TP_THROW_ASSERT_IF(f.fail());
  return res;
}

// See ptrace(2) (the sections towards the end) and
// https://www.kernel.org/doc/Documentation/security/Yama.txt
optional getYamaPtraceScope() {
  std::ifstream f{"/proc/sys/kernel/yama/ptrace_scope"};
  if (f.fail()) {
    // YAMA not present (or procfs unavailable).
    return nullopt;
  }
  int scope;
  f >> scope;
  TP_THROW_ASSERT_IF(f.fail());
  f.close();
  TP_THROW_ASSERT_IF(f.fail());
  switch (scope) {
    case 0:
      return YamaPtraceScope::kClassicPtracePermissions;
    case 1:
      return YamaPtraceScope::kRestrictedPtrace;
    case 2:
      return YamaPtraceScope::kAdminOnlyAttach;
    case 3:
      return YamaPtraceScope::kNoAttach;
    default:
      TP_THROW_ASSERT() << "Unrecognized YAMA ptrace scope: " << scope;
      // Dummy return to make the compiler happy.
      return nullopt;
  }
}

// Queries the process's full permitted capability set via the raw capget(2)
// syscall and returns it as a hex-encoded 64-bit bitmask.
optional getPermittedCapabilitiesID() {
  std::remove_pointer::type header;
  std::array::type, 2> data;
  // At the time of writing there are three versions of the syscall supported
  // by the kernel, and we're supposed to perform a "handshake" to agree on the
  // latest version supported both by us and by the kernel. However, this is
  // only needed if we want to support pre-2.6.26 kernels, which we don't. Hence
  // we'll fail if the kernel doesn't support the latest version (v3). On the
  // other hand there is no way to figure out if the kernel's version has
  // advanced past the one we support. This will occur once there will be more
  // than 64 capabilities, but given the current pace this shouldn't happen for
  // quite a while. Such a limitation probably comes from the capability system
  // being designed around querying for a specific capability (in which case a
  // program only needs to support the syscall version where that capability was
  // added); querying _all_ capabilities (as we do) is kinda out-of-scope.
header.version = 0x20080522;
  // pid 0 means "the calling process".
  header.pid = 0;
  int rv = ::capget(&header, data.data());
  TP_THROW_SYSTEM_IF(rv < 0, errno);
  // We'll create a bitmask of the capabilities, and then return its hex.
  // NOTE(review): the static_cast target types (presumably uint64_t) were lost
  // in extraction.
  uint64_t bitmask = static_cast(data[0].permitted) |
      (static_cast(data[1].permitted) << 32);
  std::ostringstream oss;
  oss << std::hex << bitmask;
  return oss.str();
}
#endif

// Best-effort: names the current thread where the platform supports it, and
// silently does nothing elsewhere.
void setThreadName(std::string name) {
#ifdef __linux__
// In glibc this non-standard call was added in version 2.12, hence we guard it.
#ifdef __GLIBC__
#if ((__GLIBC__ > 2) || ((__GLIBC__ == 2) && (__GLIBC_MINOR__ >= 12)))
  pthread_setname_np(pthread_self(), name.c_str());
#endif
// In other standard libraries we didn't check yet, hence we always enable it.
#else
  pthread_setname_np(pthread_self(), name.c_str());
#endif
#endif
}

} // namespace tensorpipe

================================================ FILE: tensorpipe/common/system.h ================================================

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

// NOTE(review): the header names were lost in extraction.
#include
#include
#include
#include
#include
#include
#include
#include
#include

namespace tensorpipe {

//
// TimeStamp is a 64 bit value representing
// a high-resolution clock. It is usually
// in nano-seconds or in TSC cycles.
//
using TimeStamp = uint64_t;

constexpr TimeStamp kInvalidTimeStamp = std::numeric_limits::max();

std::string tstampToStr(TimeStamp ts);

// std::chrono::duration to TSC.
// Throws (via TP_THROW_EINVAL) on negative durations; returns nanoseconds.
template
TimeStamp durationToTimeStamp(TDuration d) {
  auto ns = std::chrono::duration_cast(d).count();
  if (ns < 0) {
    TP_THROW_EINVAL() << "Negative time durations are not valid";
  }
  return static_cast(ns);
}

//
// Useful math functions to work with CPU and binary integers
//

/// Is it a Power of 2?
constexpr bool isPow2(uint64_t n) noexcept {
  return n > 0 && !((n - 1) & n);
}

/// Smallest power of 2 larger or equal to <n> (bit-smearing trick).
constexpr uint32_t nextPow2(uint32_t n) noexcept {
  --n;
  n |= n >> 1;
  n |= n >> 2;
  n |= n >> 4;
  n |= n >> 8;
  n |= n >> 16;
  return n + 1;
}

/// Smallest power of 2 larger or equal to <n>.
constexpr uint64_t nextPow2(uint64_t n) noexcept {
  --n;
  n |= n >> 1;
  n |= n >> 2;
  n |= n >> 4;
  n |= n >> 8;
  n |= n >> 16;
  n |= n >> 32;
  return n + 1;
}

/// Largest power of 2 less or equal to <n>.
constexpr uint64_t maxPow2LessEqualThan(uint64_t n) noexcept {
  if (isPow2(n)) {
    return n;
  }
  return nextPow2(n) >> 1;
}

// Return contents of /proc/sys/kernel/random/boot_id.
optional getBootID();

enum class LinuxNamespace {
  kIpc,
  kNet,
  kPid,
  kUser,
  // Add more entries as needed.
};

// Returns a string that uniquely identifies a namespace of a certain type.
// It is only valid within the same machine and for that fixed type.
optional getLinuxNamespaceId(LinuxNamespace ns);

// Returns the names of the active Linux Security Modules, in the order in which
// they are employed by the kernel. The names could be arbitrary (as third-party
// LSMs could be in use) but contain values like "capability", "apparmor",
// "yama", "lockdown", ...
optional> getLinuxSecurityModules();

enum class YamaPtraceScope {
  kClassicPtracePermissions,
  kRestrictedPtrace,
  kAdminOnlyAttach,
  kNoAttach,
};

// YAMA is a Linux Security Module that specifically targets ptrace by locking
// down a process so it can only be targeted by its ancestors or by processes
// that it specifically selects. However YAMA can be disabled, or made even
// stricter. This function returns precisely what level YAMA is operating at.
optional getYamaPtraceScope();

// Return a representation of the set of permitted capabilities of the process.
// We're talking about Linux kernel capabilities, see capabilities(7).
optional getPermittedCapabilitiesID();

// Set the name of the current thread, if possible. Use only for debugging.
void setThreadName(std::string name);

} // namespace tensorpipe

================================================ FILE: tensorpipe/config.h.in ================================================

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#cmakedefine01 TENSORPIPE_HAS_SHM_TRANSPORT
#cmakedefine01 TENSORPIPE_HAS_IBV_TRANSPORT
#cmakedefine01 TENSORPIPE_HAS_CMA_CHANNEL

================================================ FILE: tensorpipe/config_cuda.h.in ================================================

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#cmakedefine01 TENSORPIPE_HAS_CUDA_IPC_CHANNEL
#cmakedefine01 TENSORPIPE_HAS_CUDA_GDR_CHANNEL

================================================ FILE: tensorpipe/core/context.cc ================================================

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
*/ #include #include #include #include #include #include namespace tensorpipe { Context::Context(ContextOptions opts) : impl_(std::make_shared(std::move(opts))) { impl_->init(); } void Context::registerTransport( int64_t priority, std::string transport, std::shared_ptr context) { impl_->registerTransport(priority, std::move(transport), std::move(context)); } void Context::registerChannel( int64_t priority, std::string channel, std::shared_ptr context) { impl_->registerChannel(priority, std::move(channel), std::move(context)); } std::shared_ptr Context::listen( const std::vector& urls) { return impl_->listen(urls); } std::shared_ptr Context::connect( const std::string& url, PipeOptions opts) { return impl_->connect(url, std::move(opts)); } void Context::close() { impl_->close(); } void Context::join() { impl_->join(); } Context::~Context() { join(); } } // namespace tensorpipe ================================================ FILE: tensorpipe/core/context.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include namespace tensorpipe { class ContextImpl; class Listener; class Pipe; class ContextOptions { public: // The name should be a semantically meaningful description of this context. // It will only be used for logging and debugging purposes, to identify the // endpoints of a pipe. ContextOptions&& name(std::string name) && { name_ = std::move(name); return std::move(*this); } private: std::string name_; friend ContextImpl; }; class PipeOptions { public: // The name should be a semantically meaningful description of the context // that the pipe is connecting to. It will only be used for logging and // debugging purposes, to identify the endpoints of a pipe. 
PipeOptions&& remoteName(std::string remoteName) && { remoteName_ = std::move(remoteName); return std::move(*this); } private: std::string remoteName_; friend ContextImpl; }; class Context final { public: explicit Context(ContextOptions opts = ContextOptions()); void registerTransport( int64_t priority, std::string transport, std::shared_ptr context); void registerChannel( int64_t priority, std::string channel, std::shared_ptr context); std::shared_ptr listen(const std::vector& urls); std::shared_ptr connect( const std::string& url, PipeOptions opts = PipeOptions()); // Put the context in a terminal state, in turn closing all of its pipes and // listeners, and release its resources. This may be done asynchronously, in // background. void close(); // Wait for all resources to be released and all background activity to stop. void join(); ~Context(); private: // The implementation is managed by a shared_ptr because each child object // will also hold a shared_ptr to it. However, its lifetime is tied to the one // of this public object since when the latter is destroyed the implementation // is closed and joined. const std::shared_ptr impl_; }; } // namespace tensorpipe ================================================ FILE: tensorpipe/core/context_impl.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace { std::atomic contextCouter{0}; std::string createContextId() { // Should we use argv[0] instead of the PID? It may be more semantically // meaningful and consistent across runs, but it may not be unique... 
// Also, should we add the hostname/the IP address in case the logs from // different hosts are merged into a single stream? // Eventually we'll have to replace getpid with something more portable. // Libuv offers a cross-platform function to get the process ID. return std::to_string(getpid()) + ":c" + std::to_string(contextCouter++); } } // namespace ContextImpl::ContextImpl(ContextOptions opts) : id_(createContextId()), name_(std::move(opts.name_)) { TP_VLOG(1) << "Context " << id_ << " created"; if (name_ != "") { TP_VLOG(1) << "Context " << id_ << " aliased as " << name_; id_ = name_; } } void ContextImpl::init() { deferToLoop([this]() { initFromLoop(); }); } void ContextImpl::initFromLoop() {} void ContextImpl::registerTransport( int64_t priority, std::string transport, std::shared_ptr context) { TP_THROW_ASSERT_IF(transport.empty()); TP_THROW_ASSERT_IF(transports_.find(transport) != transports_.end()) << "transport " << transport << " already registered"; TP_THROW_ASSERT_IF( transportsByPriority_.find(-priority) != transportsByPriority_.end()) << "transport with priority " << priority << " already registered"; if (!context->isViable()) { TP_VLOG(1) << "Context " << id_ << " is not registering transport " << transport << " because it is not viable"; return; } TP_VLOG(1) << "Context " << id_ << " is registering transport " << transport; context->setId(id_ + ".tr_" + transport); transports_.emplace(transport, context); // Reverse the priority, as the pipe will pick the *first* available transport // it can find in the ordered map, so higher priorities should come first. 
transportsByPriority_.emplace(-priority, std::make_tuple(transport, context));
}

// Mirror of registerTransport, but for channels.
// NOTE(review): template arguments (e.g. shared_ptr element types) were lost
// in extraction throughout this block.
void ContextImpl::registerChannel(
    int64_t priority,
    std::string channel,
    std::shared_ptr context) {
  TP_THROW_ASSERT_IF(channel.empty());
  TP_THROW_ASSERT_IF(channels_.find(channel) != channels_.end())
      << "channel " << channel << " already registered";
  TP_THROW_ASSERT_IF(
      channelsByPriority_.find(-priority) != channelsByPriority_.end())
      << "channel with priority " << priority << " already registered";
  if (!context->isViable()) {
    TP_VLOG(1) << "Context " << id_ << " is not registering channel " << channel
               << " because it is not viable";
    return;
  }
  TP_VLOG(1) << "Context " << id_ << " is registering channel " << channel;
  context->setId(id_ + ".ch_" + channel);
  channels_.emplace(channel, context);
  // Reverse the priority, as the pipe will pick the *first* available channel
  // it can find in the ordered map, so higher priorities should come first.
  channelsByPriority_.emplace(-priority, std::make_tuple(channel, context));
}

// Creates a listener whose id is derived from this context's id plus a
// sequence number (logging/debugging only).
std::shared_ptr ContextImpl::listen(const std::vector& urls) {
  std::string listenerId =
      id_ + "[l" + std::to_string(listenerCounter_++) + "]";
  TP_VLOG(1) << "Context " << id_ << " is opening listener " << listenerId;
  return std::make_shared(
      Listener::ConstructorToken(),
      shared_from_this(),
      std::move(listenerId),
      urls);
}

// Creates a pipe; when the caller named the remote end, a human-readable alias
// replaces the sequence-number id.
std::shared_ptr ContextImpl::connect(const std::string& url, PipeOptions opts) {
  std::string pipeId = id_ + ".p" + std::to_string(pipeCounter_++);
  TP_VLOG(1) << "Context " << id_ << " is opening pipe " << pipeId;
  std::string remoteContextName = std::move(opts.remoteName_);
  if (remoteContextName != "") {
    std::string aliasPipeId = id_ + "_to_" + remoteContextName;
    TP_VLOG(1) << "Pipe " << pipeId << " aliased as " << aliasPipeId;
    pipeId = std::move(aliasPipeId);
  }
  return std::make_shared(
      Pipe::ConstructorToken(),
      shared_from_this(),
      std::move(pipeId),
      std::move(remoteContextName),
      url);
}

// Throws (EINVAL) for unknown transports rather than returning null.
std::shared_ptr ContextImpl::getTransport(const std::string& transport) {
  auto iter = transports_.find(transport);
  if (iter == transports_.end()) {
    TP_THROW_EINVAL() << "unsupported transport " << transport;
  }
  return iter->second;
}

// Throws (EINVAL) for unknown channels rather than returning null.
std::shared_ptr ContextImpl::getChannel(const std::string& channel) {
  auto iter = channels_.find(channel);
  if (iter == channels_.end()) {
    TP_THROW_EINVAL() << "unsupported channel " << channel;
  }
  return iter->second;
}

const ContextImpl::TOrderedTransports& ContextImpl::getOrderedTransports() {
  return transportsByPriority_;
}

const ContextImpl::TOrderedChannels& ContextImpl::getOrderedChannels() {
  return channelsByPriority_;
}

const std::string& ContextImpl::getName() {
  return name_;
}

// Enrolled objects are kept alive via their shared_ptr until they unenroll.
void ContextImpl::enroll(ListenerImpl& listener) {
  TP_DCHECK(inLoop());
  bool wasInserted;
  std::tie(std::ignore, wasInserted) =
      listeners_.emplace(&listener, listener.shared_from_this());
  TP_DCHECK(wasInserted);
}

void ContextImpl::enroll(PipeImpl& pipe) {
  TP_DCHECK(inLoop());
  bool wasInserted;
  std::tie(std::ignore, wasInserted) =
      pipes_.emplace(&pipe, pipe.shared_from_this());
  TP_DCHECK(wasInserted);
}

void ContextImpl::unenroll(ListenerImpl& listener) {
  TP_DCHECK(inLoop());
  auto numRemoved = listeners_.erase(&listener);
  TP_DCHECK_EQ(numRemoved, 1);
}

void ContextImpl::unenroll(PipeImpl& pipe) {
  TP_DCHECK(inLoop());
  auto numRemoved = pipes_.erase(&pipe);
  TP_DCHECK_EQ(numRemoved, 1);
}

// The context counts as closed as soon as an error has been set.
bool ContextImpl::closed() {
  TP_DCHECK(inLoop());
  return error_;
}

void ContextImpl::deferToLoop(TTask fn) {
  loop_.deferToLoop(std::move(fn));
}

bool ContextImpl::inLoop() const {
  return loop_.inLoop();
}

void ContextImpl::close() {
  deferToLoop([this]() { closeFromLoop(); });
}

void ContextImpl::closeFromLoop() {
  TP_DCHECK(inLoop());
  TP_VLOG(1) << "Context " << id_ << " is closing";
  setError(TP_CREATE_ERROR(ContextClosedError));
  TP_VLOG(1) << "Context " << id_ << " done closing";
}

void ContextImpl::setError(Error error) {
  // Don't overwrite an error that's already set.
if (error_ || !error) {
    return;
  }

  error_ = std::move(error);

  handleError();
}

// Propagates the error to every dependent object: listeners/pipes are closed
// synchronously, transports/channels asynchronously.
void ContextImpl::handleError() {
  TP_DCHECK(inLoop());
  TP_VLOG(5) << "Context " << id_ << " is handling error " << error_.what();

  // Make a copy as they could unenroll themselves inline.
  auto listenersCopy = listeners_;
  auto pipesCopy = pipes_;
  // We call closeFromLoop, rather than just close, because we need these
  // objects to transition _immediately_ to error, "atomically". If we just
  // deferred closing to later, this could come after some already-enqueued
  // operations that could try to access the context, which would be closed,
  // and this could fail.
  for (auto& iter : listenersCopy) {
    iter.second->closeFromLoop();
  }
  for (auto& iter : pipesCopy) {
    iter.second->closeFromLoop();
  }

  for (auto& iter : transports_) {
    iter.second->close();
  }
  for (auto& iter : channels_) {
    iter.second->close();
  }
}

// Idempotent: joined_ guards against double-join (explicit join + destructor).
void ContextImpl::join() {
  close();

  if (!joined_.exchange(true)) {
    TP_VLOG(1) << "Context " << id_ << " is joining";

    // As closing is deferred to the loop, we must wait for close to be actually
    // called before we join, to avoid race conditions. For this, we defer
    // another task to the loop, which we know will run after the closing, and
    // then we wait for that task to be run.
    std::promise hasClosed;
    deferToLoop([&]() { hasClosed.set_value(); });
    hasClosed.get_future().wait();

    for (auto& iter : transports_) {
      iter.second->join();
    }
    for (auto& iter : channels_) {
      iter.second->join();
    }

    TP_VLOG(1) << "Context " << id_ << " done joining";

    TP_DCHECK(listeners_.empty());
    TP_DCHECK(pipes_.empty());
  }
}

} // namespace tensorpipe

================================================ FILE: tensorpipe/core/context_impl.h ================================================

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

// NOTE(review): the header names were lost in extraction, as were the
// template arguments in the declarations below.
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

namespace tensorpipe {

class ListenerImpl;
class PipeImpl;

class ContextImpl final : public virtual DeferredExecutor,
                          public std::enable_shared_from_this {
 public:
  explicit ContextImpl(ContextOptions opts);

  void init();

  void registerTransport(
      int64_t priority,
      std::string transport,
      std::shared_ptr context);

  void registerChannel(
      int64_t priority,
      std::string channel,
      std::shared_ptr context);

  std::shared_ptr listen(const std::vector& urls);

  std::shared_ptr connect(const std::string& url, PipeOptions opts);

  std::shared_ptr getTransport(const std::string& transport);

  std::shared_ptr getChannel(const std::string& channel);

  using TOrderedTransports = std::map<
      int64_t,
      std::tuple>>;

  const TOrderedTransports& getOrderedTransports();

  using TOrderedChannels = std::
      map>>;

  const TOrderedChannels& getOrderedChannels();

  // Return the name given to the context's constructor. It will be retrieved
  // by the pipes and listener in order to attach it to logged messages.
  const std::string& getName();

  // Enrolling dependent objects (listeners and pipes) causes them to be kept
  // alive for as long as the context exists. These objects should enroll
  // themselves as soon as they're created (in their initFromLoop method) and
  // unenroll themselves after they've completed handling an error (either right
  // in the handleError method or in a subsequent callback). The context, on the
  // other hand, should avoid terminating (i.e., complete joining) until all
  // objects have unenrolled themselves.
  void enroll(ListenerImpl& listener);
  void enroll(PipeImpl& pipe);
  void unenroll(ListenerImpl& listener);
  void unenroll(PipeImpl& pipe);

  // Return whether the context is in a closed state. To avoid race conditions,
  // this must be called from within the loop.
  bool closed();

  // Implement DeferredExecutor interface.
  void deferToLoop(TTask fn) override;
  bool inLoop() const override;

  void close();

  void join();

 private:
  OnDemandDeferredExecutor loop_;

  Error error_{Error::kSuccess};

  std::atomic joined_{false};

  // An identifier for the context, either consisting of the user-provided name
  // for this context (see below) or, by default, composed of unique information
  // about the host and process, combined with an increasing sequence number. It
  // will be used as a prefix for the identifiers of listeners and pipes. All of
  // them will only be used for logging and debugging purposes.
  std::string id_;

  // Sequence numbers for the listeners and pipes created by this context, used
  // to create their identifiers based off this context's identifier. They will
  // only be used for logging and debugging.
  std::atomic listenerCounter_{0};
  std::atomic pipeCounter_{0};

  // Store shared_ptrs to dependent objects that have enrolled themselves to
  // keep them alive. We use a map, indexed by raw pointers, rather than a set
  // of shared_ptrs so that we can erase objects without them having to create
  // a fresh shared_ptr just for that.
  std::unordered_map> listeners_;
  std::unordered_map> pipes_;

  // A user-provided name for this context which should be semantically
  // meaningful. It will only be used for logging and debugging purposes, to
  // identify the endpoints of a pipe.
  std::string name_;

  std::unordered_map> transports_;

  using TContextMap = std::unordered_map>;
  TContextMap channels_;

  TOrderedTransports transportsByPriority_;
  TOrderedChannels channelsByPriority_;

  CallbackWrapper callbackWrapper_{*this, *this};

  void initFromLoop();
  void closeFromLoop();

  void setError(Error error);

  void handleError();

  template
  friend class CallbackWrapper;
};

} // namespace tensorpipe

================================================ FILE: tensorpipe/core/error.cc ================================================

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
*
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

// NOTE(review): the header names were lost in extraction.
#include
#include

namespace tensorpipe {

// what() implementations for the core error types; these strings surface in
// user-facing logs and error reports.
std::string LogicError::what() const {
  std::ostringstream ss;
  ss << "logic error: " << reason_;
  return ss.str();
}

std::string ContextClosedError::what() const {
  return "context closed";
}

std::string ListenerClosedError::what() const {
  return "listener closed";
}

std::string PipeClosedError::what() const {
  return "pipe closed";
}

} // namespace tensorpipe

================================================ FILE: tensorpipe/core/error.h ================================================

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

// NOTE(review): the header names were lost in extraction.
#include
#include

namespace tensorpipe {

// Error raised for programming mistakes, carrying a free-form reason.
class LogicError final : public BaseError {
 public:
  explicit LogicError(std::string reason) : reason_(std::move(reason)) {}

  std::string what() const override;

 private:
  const std::string reason_;
};

// Error used to fail operations on a context that has been closed.
class ContextClosedError final : public BaseError {
 public:
  explicit ContextClosedError() {}

  std::string what() const override;
};

// Error used to fail operations on a listener that has been closed.
class ListenerClosedError final : public BaseError {
 public:
  explicit ListenerClosedError() {}

  std::string what() const override;
};

// Error used to fail operations on a pipe that has been closed.
class PipeClosedError final : public BaseError {
 public:
  explicit PipeClosedError() {}

  std::string what() const override;
};

} // namespace tensorpipe

================================================ FILE: tensorpipe/core/listener.cc ================================================

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

// NOTE(review): the header names (and template arguments below) were lost in
// extraction.
#include
#include
#include
#include
#include
#include
#include

namespace tensorpipe {

// Listener is a pimpl facade over ListenerImpl; construction immediately
// schedules initialization on the context loop.
Listener::Listener(
    ConstructorToken /* unused */,
    std::shared_ptr context,
    std::string id,
    const std::vector& urls)
    : impl_(std::make_shared(
          std::move(context),
          std::move(id),
          urls)) {
  impl_->init();
}

void Listener::close() {
  impl_->close();
}

Listener::~Listener() {
  close();
}

void Listener::accept(accept_callback_fn fn) {
  impl_->accept(std::move(fn));
}

const std::map& Listener::addresses() const {
  return impl_->addresses();
}

const std::string& Listener::address(const std::string& transport) const {
  return impl_->address(transport);
}

std::string Listener::url(const std::string& transport) const {
  return impl_->url(transport);
}

} // namespace tensorpipe

================================================ FILE: tensorpipe/core/listener.h ================================================

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

// NOTE(review): the header names were lost in extraction.
#include
#include
#include
#include
#include
#include

namespace tensorpipe {

class ContextImpl;
class ListenerImpl;
class Pipe;

// The listener.
//
// Listeners are used to produce pipes. Depending on the type of the
// context, listeners may use a variety of addresses to listen on. For
// example, for TCP/IP sockets they listen on an IPv4 or IPv6 address,
// for Unix domain sockets they listen on a path, etcetera.
//
// A pipe can only be accepted from this listener after it has been
// fully established. This means that both its connection and all its
// side channels have been established.
//
class Listener final {
  // Use the passkey idiom to allow make_shared to call what should be a private
  // constructor. See https://abseil.io/tips/134 for more information.
struct ConstructorToken {};

 public:
  Listener(
      ConstructorToken token,
      std::shared_ptr context,
      std::string id,
      const std::vector& urls);

  //
  // Entry points for user code
  //

  using accept_callback_fn = std::function)>;

  void accept(accept_callback_fn fn);

  // Returns map with the materialized address of listeners by transport.
  //
  // If you don't bind a transport listener to a specific port or address, it
  // may generate its address automatically. Then, in order to connect to the
  // listener, the user must use a separate mechanism to communicate the
  // materialized address to whoever wants to connect.
  //
  const std::map& addresses() const;

  // Returns materialized address for specific transport.
  //
  // See `addresses()` for more information.
  //
  const std::string& address(const std::string& transport) const;

  // Returns URL with materialized address for specific transport.
  //
  // See `addresses()` for more information.
  //
  std::string url(const std::string& transport) const;

  // Put the listener in a terminal state, aborting its pending operations and
  // rejecting future ones, and release its resources. This may be carried out
  // asynchronously, in background. Since the pipes may occasionally use the
  // listener to open new connections, closing a listener may trigger errors
  // in the pipes.
  void close();

  ~Listener();

 private:
  // Using a shared_ptr allows us to detach the lifetime of the implementation
  // from the public object's one and perform the destruction asynchronously.
  const std::shared_ptr impl_;

  // Allow context to access constructor token.
  friend ContextImpl;
};

} // namespace tensorpipe

================================================ FILE: tensorpipe/core/listener_impl.cc ================================================

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

// NOTE(review): the header names (and template arguments below) were lost in
// extraction.
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

namespace tensorpipe {

// Opens one transport listener per URL; addresses_ records each transport's
// materialized address so users can retrieve it later.
ListenerImpl::ListenerImpl(
    std::shared_ptr context,
    std::string id,
    const std::vector& urls)
    : context_(std::move(context)), id_(std::move(id)) {
  for (const auto& url : urls) {
    std::string transport;
    std::string address;
    std::tie(transport, address) = splitSchemeOfURL(url);
    std::shared_ptr context = context_->getTransport(transport);
    std::shared_ptr listener = context->listen(address);
    listener->setId(id_ + ".tr_" + transport);
    addresses_.emplace(transport, listener->addr());
    listeners_.emplace(transport, std::move(listener));
  }
}

void ListenerImpl::init() {
  context_->deferToLoop(
      [impl{this->shared_from_this()}]() { impl->initFromLoop(); });
}

void ListenerImpl::initFromLoop() {
  TP_DCHECK(context_->inLoop());

  if (context_->closed()) {
    // Set the error without calling setError because we do not want to invoke
    // handleError as it would find itself in a weird state (since the rest of
    // initFromLoop wouldn't have been called).
error_ = TP_CREATE_ERROR(ListenerClosedError);
    TP_VLOG(1) << "Listener " << id_ << " is closing (without initing)";
    return;
  }

  context_->enroll(*this);

  // Start accepting on every underlying transport listener.
  for (const auto& listener : listeners_) {
    armListener(listener.first);
  }
}

void ListenerImpl::close() {
  context_->deferToLoop(
      [impl{this->shared_from_this()}]() { impl->closeFromLoop(); });
}

void ListenerImpl::closeFromLoop() {
  TP_DCHECK(context_->inLoop());
  TP_VLOG(1) << "Listener " << id_ << " is closing";
  setError(TP_CREATE_ERROR(ListenerClosedError));
}

//
// Entry points for user code
//

void ListenerImpl::accept(accept_callback_fn fn) {
  context_->deferToLoop(
      [impl{this->shared_from_this()}, fn{std::move(fn)}]() mutable {
        impl->acceptFromLoop(std::move(fn));
      });
}

void ListenerImpl::acceptFromLoop(accept_callback_fn fn) {
  TP_DCHECK(context_->inLoop());

  uint64_t sequenceNumber = nextPipeBeingAccepted_++;
  TP_VLOG(1) << "Listener " << id_ << " received an accept request (#"
             << sequenceNumber << ")";

  // Wrap the user callback so invocations are logged and their ordering by
  // sequence number is enforced (via the TP_DCHECK_EQ below).
  fn = [this, sequenceNumber, fn{std::move(fn)}](
           const Error& error, std::shared_ptr pipe) {
    TP_DCHECK_EQ(sequenceNumber, nextAcceptCallbackToCall_++);
    TP_VLOG(1) << "Listener " << id_ << " is calling an accept callback (#"
               << sequenceNumber << ")";
    fn(error, std::move(pipe));
    TP_VLOG(1) << "Listener " << id_ << " done calling an accept callback (#"
               << sequenceNumber << ")";
  };

  // If already errored, fail the callback immediately instead of arming it.
  if (error_) {
    fn(error_, std::shared_ptr());
    return;
  }

  acceptCallback_.arm(std::move(fn));
}

const std::map& ListenerImpl::addresses() const {
  // As this is an immutable member (after it has been initialized in
  // the constructor), we'll access it without deferring to the loop.
  return addresses_;
}

const std::string& ListenerImpl::address(const std::string& transport) const {
  // As this is an immutable member (after it has been initialized in
  // the constructor), we'll access it without deferring to the loop.
  const auto it = addresses_.find(transport);
  TP_THROW_ASSERT_IF(it == addresses_.end())
      << ": transport '" << transport << "' not in use by this listener.";
  return it->second;
}

std::string ListenerImpl::url(const std::string& transport) const {
  // As this is an immutable member (after it has been initialized in
  // the constructor), we'll access it without deferring to the loop.
  return transport + "://" + address(transport);
}

//
// Entry points for internal code
//

uint64_t ListenerImpl::registerConnectionRequest(
    connection_request_callback_fn fn) {
  TP_DCHECK(context_->inLoop());

  uint64_t registrationId = nextConnectionRequestRegistrationId_++;
  TP_VLOG(1) << "Listener " << id_
             << " received a connection request registration (#"
             << registrationId << ")";

  // Wrap the callback purely for logging around the invocation.
  fn = [this, registrationId, fn{std::move(fn)}](
           const Error& error,
           std::string transport,
           std::shared_ptr connection) {
    TP_VLOG(1) << "Listener " << id_
               << " is calling a connection request registration callback (#"
               << registrationId << ")";
    fn(error, std::move(transport), std::move(connection));
    TP_VLOG(1) << "Listener " << id_
               << " done calling a connection request registration callback (#"
               << registrationId << ")";
  };

  // If already errored, fail immediately; otherwise store the registration.
  if (error_) {
    fn(error_, std::string(), std::shared_ptr());
  } else {
    connectionRequestRegistrations_.emplace(registrationId, std::move(fn));
  }

  return registrationId;
}

void ListenerImpl::unregisterConnectionRequest(uint64_t registrationId) {
  TP_DCHECK(context_->inLoop());
  TP_VLOG(1) << "Listener " << id_
             << " received a connection request de-registration (#"
             << registrationId << ")";
  connectionRequestRegistrations_.erase(registrationId);
}

//
// Error handling
//

void ListenerImpl::setError(Error error) {
  // Don't overwrite an error that's already set.
if (error_ || !error) { return; } error_ = std::move(error); handleError(); } void ListenerImpl::handleError() { TP_DCHECK(context_->inLoop()); TP_VLOG(2) << "Listener " << id_ << " is handling error " << error_.what(); acceptCallback_.triggerAll([&]() { return std::make_tuple(std::cref(error_), std::shared_ptr()); }); for (auto& iter : connectionRequestRegistrations_) { connection_request_callback_fn fn = std::move(iter.second); fn(error_, std::string(), std::shared_ptr()); } connectionRequestRegistrations_.clear(); for (const auto& listener : listeners_) { listener.second->close(); } for (const auto& connection : connectionsWaitingForHello_) { connection->close(); } connectionsWaitingForHello_.clear(); context_->unenroll(*this); } // // Everything else // void ListenerImpl::onAccept( std::string transport, std::shared_ptr connection) { TP_DCHECK(context_->inLoop()); // Keep it alive until we figure out what to do with it. connectionsWaitingForHello_.insert(connection); auto nopHolderIn = std::make_shared>(); TP_VLOG(3) << "Listener " << id_ << " is reading nop object (spontaneous or requested connection)"; connection->read( *nopHolderIn, callbackWrapper_([nopHolderIn, transport{std::move(transport)}, connection](ListenerImpl& impl) mutable { TP_VLOG(3) << "Listener " << impl.id_ << " done reading nop object (spontaneous or requested connection)"; if (impl.error_) { return; } impl.connectionsWaitingForHello_.erase(connection); impl.onConnectionHelloRead( std::move(transport), std::move(connection), nopHolderIn->getObject()); })); } void ListenerImpl::armListener(std::string transport) { TP_DCHECK(context_->inLoop()); auto iter = listeners_.find(transport); if (iter == listeners_.end()) { TP_THROW_EINVAL() << "unsupported transport " << transport; } auto transportListener = iter->second; TP_VLOG(3) << "Listener " << id_ << " is accepting connection on transport " << transport; transportListener->accept( callbackWrapper_([transport]( ListenerImpl& impl, 
std::shared_ptr connection) { TP_VLOG(3) << "Listener " << impl.id_ << " done accepting connection on transport " << transport; if (impl.error_) { return; } impl.onAccept(transport, std::move(connection)); impl.armListener(transport); })); } void ListenerImpl::onConnectionHelloRead( std::string transport, std::shared_ptr connection, const Packet& nopPacketIn) { TP_DCHECK(context_->inLoop()); if (nopPacketIn.is()) { const SpontaneousConnection& nopSpontaneousConnection = *nopPacketIn.get(); TP_VLOG(3) << "Listener " << id_ << " got spontaneous connection"; std::string pipeId = id_ + ".p" + std::to_string(pipeCounter_++); TP_VLOG(1) << "Listener " << id_ << " is opening pipe " << pipeId; const std::string& remoteContextName = nopSpontaneousConnection.contextName; if (remoteContextName != "") { std::string aliasPipeId = id_ + "_from_" + remoteContextName; TP_VLOG(1) << "Pipe " << pipeId << " aliased as " << aliasPipeId; pipeId = std::move(aliasPipeId); } auto pipe = std::make_shared( context_, shared_from_this(), std::move(pipeId), remoteContextName, std::move(transport), std::move(connection)); // We initialize the pipe from the loop immediately, inline, because the // initialization of a pipe accepted by a listener happens partly in the // listener and partly in the pipe's initFromLoop, and we need these two // steps to happen "atomically" to make it impossible for an error to occur // in between. pipe->initFromLoop(); acceptCallback_.trigger( Error::kSuccess, std::make_shared(Pipe::ConstructorToken(), std::move(pipe))); } else if (nopPacketIn.is()) { const RequestedConnection& nopRequestedConnection = *nopPacketIn.get(); uint64_t registrationId = nopRequestedConnection.registrationId; TP_VLOG(3) << "Listener " << id_ << " got requested connection (#" << registrationId << ")"; auto iter = connectionRequestRegistrations_.find(registrationId); // The connection request may have already been deregistered, for example // because the pipe may have been closed. 
if (iter != connectionRequestRegistrations_.end()) { auto fn = std::move(iter->second); connectionRequestRegistrations_.erase(iter); fn(Error::kSuccess, std::move(transport), std::move(connection)); } } else { TP_LOG_ERROR() << "packet contained unknown content: " << nopPacketIn.index(); } } } // namespace tensorpipe ================================================ FILE: tensorpipe/core/listener_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { class ContextImpl; class ListenerImpl final : public std::enable_shared_from_this { public: ListenerImpl( std::shared_ptr context, std::string id, const std::vector& urls); // Called by the listener's constructor. void init(); using accept_callback_fn = Listener::accept_callback_fn; void accept(accept_callback_fn fn); const std::map& addresses() const; const std::string& address(const std::string& transport) const; std::string url(const std::string& transport) const; using connection_request_callback_fn = std::function< void(const Error&, std::string, std::shared_ptr)>; uint64_t registerConnectionRequest(connection_request_callback_fn fn); void unregisterConnectionRequest(uint64_t registrationId); void close(); private: void acceptFromLoop(accept_callback_fn fn); void closeFromLoop(); Error error_{Error::kSuccess}; std::shared_ptr context_; // An identifier for the listener, composed of the identifier for the context, // combined with an increasing sequence number. It will be used as a prefix // for the identifiers of pipes. All of them will only be used for logging and // debugging purposes. 
std::string id_; // Sequence numbers for the pipes created by this listener, used to create // their identifiers based off this listener's identifier. They will only be // used for logging and debugging. std::atomic pipeCounter_{0}; std::unordered_map> listeners_; std::map addresses_; // A sequence number for the calls to accept. uint64_t nextPipeBeingAccepted_{0}; // A sequence number for the invocations of the callbacks of accept. uint64_t nextAcceptCallbackToCall_{0}; RearmableCallback> acceptCallback_; // Needed to keep them alive. std::unordered_set> connectionsWaitingForHello_; uint64_t nextConnectionRequestRegistrationId_{0}; // FIXME Consider using a (ordered) map, because keys are IDs which are // generated in sequence and thus we can do a quick (but partial) check of // whether a callback is in the map by comparing its ID with the smallest // and largest key, which in an ordered map are the first and last item. std::unordered_map connectionRequestRegistrations_; // // Initialization // void initFromLoop(); // // Helpers to prepare callbacks from transports // CallbackWrapper callbackWrapper_{*this, *this->context_}; // // Error handling // void setError(Error error); void handleError(); // // Everything else // void armListener(std::string transport); void onAccept( std::string transport, std::shared_ptr connection); void onConnectionHelloRead( std::string transport, std::shared_ptr connection, const Packet& nopPacketIn); template friend class CallbackWrapper; // Contexts do sometimes need to call directly into closeFromLoop, in order to // make sure that some of their operations can happen "atomically" on the // connection, without possibly other operations occurring in between (e.g., // an error). friend ContextImpl; }; } // namespace tensorpipe ================================================ FILE: tensorpipe/core/message.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include namespace tensorpipe { // Messages consist of a primary buffer and zero or more separate // buffers. The primary buffer is always a host-side memory region that // contains a serialized version of the message we're dealing with. This // serialized message, in turn, may have references to the separate // buffers that accompany the primary buffer. These separate buffers may // point to any type of memory, host-side or device-side. // class Message final { public: std::string metadata; struct Payload { void* data{nullptr}; size_t length{0}; // Users may include arbitrary metadata in the following fields. // This may contain allocation hints for the receiver, for example. std::string metadata; }; // Holds the payloads that are transferred over the primary connection. std::vector payloads; struct Tensor { tensorpipe::Buffer buffer; size_t length{0}; // Users may optionally specify the target device, on which the receiver // should allocate memory for this tensor. If left unset, the receiver will // choose one at their convenience. optional targetDevice; // Users may include arbitrary metadata in the following field. // This may contain allocation hints for the receiver, for example. std::string metadata; }; // Holds the tensors that are offered to the side channels. std::vector tensors; }; // Descriptors consist of metadata required by the receiver to allocate memory // for an incoming message. class Descriptor final { public: std::string metadata; struct Payload { size_t length{0}; std::string metadata; }; std::vector payloads; struct Tensor { size_t length{0}; // This is the sender-side device from which this tensor is being sent. 
Device sourceDevice; // The sender may optionally specify a target device, in which case the // receiver must allocate memory for this tensor on the specified device. optional targetDevice; std::string metadata; }; std::vector tensors; }; // Allocations consist of actual memory allocations provided by the receiver for // an incoming message. They must match the length and target devices specified // in the corresponding Descriptor. class Allocation final { public: struct Payload { void* data{nullptr}; }; std::vector payloads; struct Tensor { tensorpipe::Buffer buffer; }; std::vector tensors; }; } // namespace tensorpipe ================================================ FILE: tensorpipe/core/nop_types.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { struct SpontaneousConnection { std::string contextName; NOP_STRUCTURE(SpontaneousConnection, contextName); }; struct RequestedConnection { uint64_t registrationId; NOP_STRUCTURE(RequestedConnection, registrationId); }; NOP_EXTERNAL_STRUCTURE(Device, type, index); struct Brochure { std::unordered_map transportDomainDescriptors; std::unordered_map> channelDeviceDescriptors; NOP_STRUCTURE(Brochure, transportDomainDescriptors, channelDeviceDescriptors); }; struct BrochureAnswer { std::string transport; std::string address; std::unordered_map transportRegistrationIds; std::string transportDomainDescriptor; std::unordered_map> channelRegistrationIds; std::unordered_map> channelDeviceDescriptors; std::unordered_map, std::string> channelForDevicePair; NOP_STRUCTURE( BrochureAnswer, transport, address, transportRegistrationIds, transportDomainDescriptor, channelRegistrationIds, 
channelDeviceDescriptors, channelForDevicePair); }; NOP_EXTERNAL_STRUCTURE(Descriptor::Payload, length, metadata); NOP_EXTERNAL_STRUCTURE( Descriptor::Tensor, length, sourceDevice, targetDevice, metadata); NOP_EXTERNAL_STRUCTURE(Descriptor, metadata, payloads, tensors); struct DescriptorReply { std::vector targetDevices; NOP_STRUCTURE(DescriptorReply, targetDevices); }; using Packet = nop::Variant; } // namespace tensorpipe ================================================ FILE: tensorpipe/core/pipe.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include namespace tensorpipe { Pipe::Pipe( ConstructorToken /* unused */, std::shared_ptr context, std::string id, std::string remoteName, const std::string& url) : impl_(std::make_shared( std::move(context), std::move(id), std::move(remoteName), url)) { impl_->init(); } Pipe::Pipe(ConstructorToken /* unused */, std::shared_ptr impl) : impl_(std::move(impl)) {} const std::string& Pipe::getRemoteName() { return impl_->getRemoteName(); } Pipe::~Pipe() { close(); } void Pipe::close() { impl_->close(); } void Pipe::readDescriptor(read_descriptor_callback_fn fn) { impl_->readDescriptor(std::move(fn)); } void Pipe::read(Allocation allocation, read_callback_fn fn) { impl_->read(std::move(allocation), std::move(fn)); } void Pipe::write(Message message, write_callback_fn fn) { impl_->write(std::move(message), std::move(fn)); } } // namespace tensorpipe ================================================ FILE: tensorpipe/core/pipe.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include namespace tensorpipe { class ContextImpl; class ListenerImpl; class PipeImpl; // The pipe. // // Pipes represent a set of connections between a pair of processes. // Unlike POSIX pipes, they are message oriented instead of byte // oriented. Messages that are sent through the pipe may use whatever // channels are at their disposal to make it happen. If the pair of // processes happen to be colocated on the same machine, they may // leverage a region of shared memory to communicate the primary // buffer of a message. Secondary buffers may use shared memory as // well, if they're located in CPU memory, or use a CUDA device to // device copy if they're located in NVIDIA GPU memory. If the pair is // located across the world, they may simply use a set of TCP // connections to communicate. // class Pipe final { // Use the passkey idiom to allow make_shared to call what should be a private // constructor. See https://abseil.io/tips/134 for more information. struct ConstructorToken {}; public: // // Initialization // Pipe( ConstructorToken token, std::shared_ptr context, std::string id, std::string remoteName, const std::string& url); Pipe(ConstructorToken token, std::shared_ptr impl); // // Entry points for user code // using read_descriptor_callback_fn = std::function; void readDescriptor(read_descriptor_callback_fn fn); using read_callback_fn = std::function; void read(Allocation allocation, read_callback_fn fn); using write_callback_fn = std::function; void write(Message message, write_callback_fn fn); // Retrieve the user-defined name that was given to the constructor of the // context on the remote side, if any (if not, this will be the empty string). // This is intended to help in logging and debugging only. 
const std::string& getRemoteName(); // Put the pipe in a terminal state, aborting its pending operations and // rejecting future ones, and release its resrouces. This may be carried out // asynchronously, in background. void close(); ~Pipe(); private: // Using a shared_ptr allows us to detach the lifetime of the implementation // from the public object's one and perform the destruction asynchronously. const std::shared_ptr impl_; // Allow context to access constructor token. friend ContextImpl; // Allow listener to access constructor token. friend ListenerImpl; }; } // namespace tensorpipe ================================================ FILE: tensorpipe/core/pipe_impl.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace { void parseDescriptorReplyOfMessage( WriteOperation& op, DescriptorReply nopDescriptorReply) { const int numTensors = op.message.tensors.size(); size_t targetDeviceIdx = 0; for (size_t tensorIdx = 0; tensorIdx < numTensors; ++tensorIdx) { const Message::Tensor& tensor = op.message.tensors[tensorIdx]; WriteOperation::Tensor& tensorBeingSent = op.tensors[tensorIdx]; if (!tensor.targetDevice.has_value()) { tensorBeingSent.targetDevice = std::move(nopDescriptorReply.targetDevices[targetDeviceIdx++]); } } TP_DCHECK_EQ(targetDeviceIdx, nopDescriptorReply.targetDevices.size()); } // Raise an error if the number of payloads and tensors in the allocation do not // match the ones that are expected by the ReadOperation. Also checks that // tensors are allocated on the correct devices. 
void checkAllocationCompatibility( const Descriptor& descriptor, const Allocation& allocation) { size_t numPayloads = allocation.payloads.size(); TP_THROW_ASSERT_IF(numPayloads != descriptor.payloads.size()); size_t numTensors = allocation.tensors.size(); TP_THROW_ASSERT_IF(numTensors != descriptor.tensors.size()); for (size_t tensorIdx = 0; tensorIdx < numTensors; tensorIdx++) { const Allocation::Tensor& tensor = allocation.tensors[tensorIdx]; const Descriptor::Tensor& tensorDescriptor = descriptor.tensors[tensorIdx]; if (tensorDescriptor.targetDevice.has_value()) { TP_THROW_ASSERT_IF( !(tensor.buffer.device() == tensorDescriptor.targetDevice.value())); } } } // Produce a nop object containing a message descriptor using the information // contained in the WriteOperation: number and sizes of payloads and tensors, // tensor descriptors, ... std::shared_ptr> makeDescriptorForMessage( const WriteOperation& op) { auto nopHolderOut = std::make_shared>(); Descriptor& nopDescriptor = nopHolderOut->getObject(); nopDescriptor.metadata = op.message.metadata; for (int payloadIdx = 0; payloadIdx < op.message.payloads.size(); ++payloadIdx) { const Message::Payload& payload = op.message.payloads[payloadIdx]; nopDescriptor.payloads.emplace_back(); Descriptor::Payload& nopPayloadDescriptor = nopDescriptor.payloads.back(); nopPayloadDescriptor.length = payload.length; nopPayloadDescriptor.metadata = payload.metadata; } TP_DCHECK_EQ(op.message.tensors.size(), op.tensors.size()); for (int tensorIdx = 0; tensorIdx < op.tensors.size(); ++tensorIdx) { const Message::Tensor& tensor = op.message.tensors[tensorIdx]; nopDescriptor.tensors.emplace_back(); Descriptor::Tensor& nopTensorDescriptor = nopDescriptor.tensors.back(); nopTensorDescriptor.metadata = tensor.metadata; nopTensorDescriptor.sourceDevice = tensor.buffer.device(); if (tensor.targetDevice.has_value()) { nopTensorDescriptor.targetDevice = tensor.targetDevice.value(); } nopTensorDescriptor.length = tensor.length; } return 
nopHolderOut; } std::shared_ptr> makeDescriptorReplyForMessage( const ReadOperation& op) { auto nopHolderOut = std::make_shared>(); DescriptorReply& nopDescriptorReply = nopHolderOut->getObject(); for (size_t tensorIdx = 0; tensorIdx < op.descriptor.tensors.size(); ++tensorIdx) { if (!op.descriptor.tensors[tensorIdx].targetDevice.has_value()) { const Allocation::Tensor& tensor = op.allocation.tensors[tensorIdx]; nopDescriptorReply.targetDevices.push_back(tensor.buffer.device()); } } return nopHolderOut; } struct SelectedTransport { std::string name; std::string address; std::string domainDescriptor; }; SelectedTransport selectTransport( const ContextImpl::TOrderedTransports& orderedTransports, const std::unordered_map& remoteDomainDescriptors, const std::map& addresses) { for (const auto& transportContextIter : orderedTransports) { const std::string& transportName = std::get<0>(transportContextIter.second); const transport::Context& transportContext = *(std::get<1>(transportContextIter.second)); // This pipe's listener might not have an address for that transport. const auto addressIter = addresses.find(transportName); if (addressIter == addresses.cend()) { continue; } const auto& address = addressIter->second; const auto remoteDomainDescriptorsIter = remoteDomainDescriptors.find(transportName); if (remoteDomainDescriptorsIter == remoteDomainDescriptors.cend()) { continue; } const std::string& remoteDomainDescriptor = remoteDomainDescriptorsIter->second; if (!transportContext.canCommunicateWithRemote(remoteDomainDescriptor)) { continue; } return {transportName, address, transportContext.domainDescriptor()}; } TP_THROW_ASSERT() << "Could not find a viable transport"; // Returning dummy value to silence compiler warning. 
return {}; } struct SelectedChannels { std::unordered_map> descriptorsMap; std::unordered_map, std::string> channelForDevicePair; }; SelectedChannels selectChannels( const ContextImpl::TOrderedChannels& orderedChannels, const std::unordered_map< std::string, std::unordered_map>& remoteDescriptorsMap) { SelectedChannels result; for (const auto& channelIter : orderedChannels) { const std::string& channelName = std::get<0>(channelIter.second); const channel::Context& channelContext = *std::get<1>(channelIter.second); const auto& remoteDescriptorsMapIter = remoteDescriptorsMap.find(channelName); if (remoteDescriptorsMapIter == remoteDescriptorsMap.end()) { continue; } const std::unordered_map& localDeviceDescriptors = channelContext.deviceDescriptors(); const std::unordered_map& remoteDeviceDescriptors = remoteDescriptorsMapIter->second; bool selected = false; for (const auto& localDescIter : localDeviceDescriptors) { const Device& localDevice = localDescIter.first; const std::string& localDeviceDescriptor = localDescIter.second; for (const auto& remoteDescIter : remoteDeviceDescriptors) { const Device& remoteDevice = remoteDescIter.first; const std::string& remoteDeviceDescriptor = remoteDescIter.second; if (!channelContext.canCommunicateWithRemote( localDeviceDescriptor, remoteDeviceDescriptor)) { continue; } if (result.channelForDevicePair.count({localDevice, remoteDevice}) != 0) { // A channel with higher priority has already been selected for this // device pair. 
continue; } selected = true; result.channelForDevicePair[{localDevice, remoteDevice}] = channelName; } } if (selected) { result.descriptorsMap[channelName] = localDeviceDescriptors; } } return result; } } // namespace // // Initialization // PipeImpl::PipeImpl( std::shared_ptr context, std::string id, std::string remoteName, const std::string& url) : state_(CLIENT_ABOUT_TO_SEND_HELLO_AND_BROCHURE), context_(std::move(context)), id_(std::move(id)), remoteName_(std::move(remoteName)) { std::string address; std::tie(transport_, address) = splitSchemeOfURL(url); descriptorConnection_ = context_->getTransport(transport_)->connect(std::move(address)); descriptorConnection_->setId(id_ + ".d.tr_" + transport_); } PipeImpl::PipeImpl( std::shared_ptr context, std::shared_ptr listener, std::string id, std::string remoteName, std::string transport, std::shared_ptr connection) : state_(SERVER_WAITING_FOR_BROCHURE), context_(std::move(context)), listener_(std::move(listener)), id_(std::move(id)), remoteName_(std::move(remoteName)), transport_(std::move(transport)), descriptorConnection_(std::move(connection)) { descriptorConnection_->setId(id_ + ".d.tr_" + transport_); } void PipeImpl::init() { context_->deferToLoop( [impl{this->shared_from_this()}]() { impl->initFromLoop(); }); } void PipeImpl::initFromLoop() { TP_DCHECK(context_->inLoop()); if (context_->closed()) { // Set the error without calling setError because we do not want to invoke // handleError as it would find itself in a weird state (since the rest of // initFromLoop wouldn't have been called). 
error_ = TP_CREATE_ERROR(PipeClosedError); TP_VLOG(1) << "Pipe " << id_ << " is closing (without initing)"; return; } context_->enroll(*this); if (state_ == CLIENT_ABOUT_TO_SEND_HELLO_AND_BROCHURE) { auto nopHolderOut = std::make_shared>(); Packet& nopPacketOut = nopHolderOut->getObject(); nopPacketOut.Become(nopPacketOut.index_of()); SpontaneousConnection& nopSpontaneousConnection = *nopPacketOut.get(); nopSpontaneousConnection.contextName = context_->getName(); TP_VLOG(3) << "Pipe " << id_ << " is writing nop object (spontaneous connection)"; descriptorConnection_->write( *nopHolderOut, callbackWrapper_([nopHolderOut](PipeImpl& impl) { TP_VLOG(3) << "Pipe " << impl.id_ << " done writing nop object (spontaneous connection)"; })); auto nopHolderOut2 = std::make_shared>(); Brochure& nopBrochure = nopHolderOut2->getObject(); for (const auto& transportContextIter : context_->getOrderedTransports()) { const std::string& transportName = std::get<0>(transportContextIter.second); const transport::Context& transportContext = *(std::get<1>(transportContextIter.second)); nopBrochure.transportDomainDescriptors[transportName] = transportContext.domainDescriptor(); } for (const auto& channelContextIter : context_->getOrderedChannels()) { const std::string& channelName = std::get<0>(channelContextIter.second); const channel::Context& channelContext = *(std::get<1>(channelContextIter.second)); nopBrochure.channelDeviceDescriptors[channelName] = channelContext.deviceDescriptors(); } TP_VLOG(3) << "Pipe " << id_ << " is writing nop object (brochure)"; descriptorConnection_->write( *nopHolderOut2, callbackWrapper_([nopHolderOut2](PipeImpl& impl) { TP_VLOG(3) << "Pipe " << impl.id_ << " done writing nop object (brochure)"; })); state_ = CLIENT_WAITING_FOR_BROCHURE_ANSWER; auto nopHolderIn = std::make_shared>(); TP_VLOG(3) << "Pipe " << id_ << " is reading nop object (brochure answer)"; descriptorConnection_->read( *nopHolderIn, callbackWrapper_([nopHolderIn](PipeImpl& impl) { 
TP_VLOG(3) << "Pipe " << impl.id_ << " done reading nop object (brochure answer)"; if (!impl.error_) { impl.onReadWhileClientWaitingForBrochureAnswer( nopHolderIn->getObject()); } })); } if (state_ == SERVER_WAITING_FOR_BROCHURE) { auto nopHolderIn = std::make_shared>(); TP_VLOG(3) << "Pipe " << id_ << " is reading nop object (brochure)"; descriptorConnection_->read( *nopHolderIn, callbackWrapper_([nopHolderIn](PipeImpl& impl) { TP_VLOG(3) << "Pipe " << impl.id_ << " done reading nop object (brochure)"; if (!impl.error_) { impl.onReadWhileServerWaitingForBrochure(nopHolderIn->getObject()); } })); } } const std::string& PipeImpl::getRemoteName() { return remoteName_; } void PipeImpl::close() { context_->deferToLoop( [impl{this->shared_from_this()}]() { impl->closeFromLoop(); }); } void PipeImpl::closeFromLoop() { TP_DCHECK(context_->inLoop()); TP_VLOG(1) << "Pipe " << id_ << " is closing"; setError(TP_CREATE_ERROR(PipeClosedError)); } // // Entry points for user code // void PipeImpl::readDescriptor(read_descriptor_callback_fn fn) { context_->deferToLoop( [impl{this->shared_from_this()}, fn{std::move(fn)}]() mutable { impl->readDescriptorFromLoop(std::move(fn)); }); } void PipeImpl::readDescriptorFromLoop(read_descriptor_callback_fn fn) { TP_DCHECK(context_->inLoop()); ReadOpIter opIter = readOps_.emplaceBack(nextMessageBeingRead_++); ReadOperation& op = *opIter; TP_VLOG(1) << "Pipe " << id_ << " received a readDescriptor request (#" << op.sequenceNumber << ")"; fn = [this, sequenceNumber{op.sequenceNumber}, fn{std::move(fn)}]( const Error& error, Descriptor descriptor) { TP_DCHECK_EQ(sequenceNumber, nextReadDescriptorCallbackToCall_++); TP_VLOG(1) << "Pipe " << id_ << " is calling a readDescriptor callback (#" << sequenceNumber << ")"; fn(error, std::move(descriptor)); TP_VLOG(1) << "Pipe " << id_ << " done calling a readDescriptor callback (#" << sequenceNumber << ")"; }; op.readDescriptorCallback = std::move(fn); readOps_.advanceOperation(opIter); } void 
// NOTE(review): throughout this file the extraction step appears to have
// stripped template argument lists (e.g. "std::shared_ptr>" below was
// presumably "std::shared_ptr<NopHolder<...>>" — TODO confirm against
// upstream). The mangled tokens are preserved verbatim here.

// Public read() entry point: defers the actual work onto the context's event
// loop, keeping this PipeImpl alive via shared_from_this() and moving the
// user's allocation and callback into the deferred closure.
PipeImpl::read(Allocation allocation, read_callback_fn fn) {
  context_->deferToLoop([impl{this->shared_from_this()},
                         allocation{std::move(allocation)},
                         fn{std::move(fn)}]() mutable {
    impl->readFromLoop(std::move(allocation), std::move(fn));
  });
}

// Loop-side implementation of read(): matches the user's allocation to the
// operation that is currently expecting one, wraps the callback for ordering
// checks, and advances that operation's state machine.
void PipeImpl::readFromLoop(Allocation allocation, read_callback_fn fn) {
  TP_DCHECK(context_->inLoop());

  // This is such a bad logical error on the user's side that it doesn't deserve
  // to pass through the channel for "expected errors" (i.e., the callback).
  // This check fails when there is no message for which we are expecting an
  // allocation.
  TP_THROW_ASSERT_IF(!nextMessageGettingAllocation_.has_value());
  ReadOpIter opIter = nextMessageGettingAllocation_.value();
  ReadOperation& op = *opIter;
  nextMessageGettingAllocation_.reset();

  checkAllocationCompatibility(op.descriptor, allocation);

  // Wrap the user callback so that the invocation order can be asserted
  // (callbacks must fire in sequence-number order) and logged.
  fn = [this, sequenceNumber{op.sequenceNumber}, fn{std::move(fn)}](
           const Error& error) {
    TP_DCHECK_EQ(sequenceNumber, nextReadCallbackToCall_++);
    TP_VLOG(1) << "Pipe " << id_ << " is calling a read callback (#"
               << sequenceNumber << ")";
    fn(error);
    TP_VLOG(1) << "Pipe " << id_ << " done calling a read callback (#"
               << sequenceNumber << ")";
  };

  op.allocation = std::move(allocation);
  op.readCallback = std::move(fn);
  op.doneGettingAllocation = true;

  TP_VLOG(1) << "Pipe " << id_ << " received a read request (#"
             << op.sequenceNumber << ", containing "
             << op.allocation.payloads.size() << " payloads and "
             << op.allocation.tensors.size() << " tensors)";

  readOps_.advanceOperation(opIter);
}

// Issues one asynchronous connection read per payload, straight into the
// user-provided buffers; the per-payload completion callbacks decrement
// numPayloadsBeingRead and re-advance the state machine.
void PipeImpl::readPayloadsOfMessage(ReadOpIter opIter) {
  TP_DCHECK(context_->inLoop());
  ReadOperation& op = *opIter;
  TP_VLOG(2) << "Pipe " << id_ << " is reading payloads of message #"
             << op.sequenceNumber;
  TP_DCHECK_EQ(connectionState_, AWAITING_PAYLOADS);
  TP_DCHECK_EQ(messageBeingReadFromConnection_, op.sequenceNumber);
  for (size_t payloadIdx = 0; payloadIdx < op.allocation.payloads.size();
       payloadIdx++) {
    Allocation::Payload& payload = op.allocation.payloads[payloadIdx];
    Descriptor::Payload& payloadDescriptor = op.descriptor.payloads[payloadIdx];
    TP_VLOG(3) << "Pipe " << id_ << " is reading payload #" << op.sequenceNumber
               << "." << payloadIdx;
    descriptorConnection_->read(
        payload.data,
        payloadDescriptor.length,
        callbackWrapper_(
            [opIter, payloadIdx](
                PipeImpl& impl, const void* /* unused */, size_t /* unused */) {
              TP_VLOG(3) << "Pipe " << impl.id_ << " done reading payload #"
                         << opIter->sequenceNumber << "." << payloadIdx;
              opIter->numPayloadsBeingRead--;
              impl.readOps_.advanceOperation(opIter);
            }));
    ++op.numPayloadsBeingRead;
  }
  // All payload reads for this message have been queued on the wire, so the
  // next thing to come off the connection is the next message's descriptor.
  connectionState_ = AWAITING_DESCRIPTOR;
  ++messageBeingReadFromConnection_;
}

// Kicks off one channel recv per tensor, choosing the channel from the
// (local device, remote source device) pair map negotiated at handshake.
void PipeImpl::receiveTensorsOfMessage(ReadOpIter opIter) {
  TP_DCHECK(context_->inLoop());
  ReadOperation& op = *opIter;
  TP_VLOG(2) << "Pipe " << id_ << " is receiving tensors of message #"
             << op.sequenceNumber;
  TP_DCHECK_EQ(op.descriptor.tensors.size(), op.allocation.tensors.size());
  for (size_t tensorIdx = 0; tensorIdx < op.descriptor.tensors.size();
       ++tensorIdx) {
    Allocation::Tensor& tensor = op.allocation.tensors[tensorIdx];
    const Descriptor::Tensor& tensorDescriptor =
        op.descriptor.tensors[tensorIdx];

    const Device& localDevice = tensor.buffer.device();
    const Device& remoteDevice = tensorDescriptor.sourceDevice;
    const auto& channelIter =
        channelForDevicePair_.find({localDevice, remoteDevice});
    TP_THROW_ASSERT_IF(channelIter == channelForDevicePair_.end())
        << "Could not find suitable channel for sending from local device "
        << localDevice.toString() << " to remote device "
        << remoteDevice.toString();

    const std::string& channelName = channelIter->second;
    channel::Channel& channel = *channels_.at(channelName);

    TP_VLOG(3) << "Pipe " << id_ << " is receiving tensor #"
               << op.sequenceNumber << "." << tensorIdx;
    channel.recv(
        tensor.buffer,
        tensorDescriptor.length,
        callbackWrapper_([opIter, tensorIdx](PipeImpl& impl) {
          TP_VLOG(3) << "Pipe " << impl.id_ << " done receiving tensor #"
                     << opIter->sequenceNumber << "." << tensorIdx;
          opIter->numTensorsBeingReceived--;
          impl.readOps_.advanceOperation(opIter);
        }));
    ++op.numTensorsBeingReceived;
  }
}

// Sends the descriptor reply (carrying the receiver-chosen target devices)
// back over the dedicated reply connection. Only invoked when the sender left
// some target devices unspecified.
void PipeImpl::writeDescriptorReplyOfMessage(ReadOpIter opIter) {
  TP_DCHECK(context_->inLoop());
  ReadOperation& op = *opIter;
  TP_DCHECK(op.hasMissingTargetDevices);
  std::shared_ptr> holder = makeDescriptorReplyForMessage(op);
  TP_VLOG(3) << "Pipe " << id_
             << " is writing nop object (message descriptor reply #"
             << op.sequenceNumber << ")";
  descriptorReplyConnection_->write(
      *holder,
      // The holder is captured to keep the nop object alive until the write
      // completes.
      callbackWrapper_(
          [sequenceNumber{op.sequenceNumber}, holder](PipeImpl& impl) {
            TP_VLOG(3) << "Pipe " << impl.id_
                       << " done writing nop object (message descriptor reply #"
                       << sequenceNumber << ")";
          }));
}

// Public write() entry point: defers onto the event loop, like read().
void PipeImpl::write(Message message, write_callback_fn fn) {
  context_->deferToLoop([impl{this->shared_from_this()},
                         message{std::move(message)},
                         fn{std::move(fn)}]() mutable {
    impl->writeFromLoop(std::move(message), std::move(fn));
  });
}

// Loop-side implementation of write(): creates a new write operation, records
// per-tensor source/target devices (noting whether any target device is
// missing and must be negotiated via a descriptor reply), and advances it.
void PipeImpl::writeFromLoop(Message message, write_callback_fn fn) {
  TP_DCHECK(context_->inLoop());
  WriteOpIter opIter = writeOps_.emplaceBack(nextMessageBeingWritten_++);
  WriteOperation& op = *opIter;
  // NOTE(review): "contaning" is a typo for "containing" in this log string;
  // left untouched here because it is runtime text.
  TP_VLOG(1) << "Pipe " << id_ << " received a write request (#"
             << op.sequenceNumber << ", contaning " << message.payloads.size()
             << " payloads and " << message.tensors.size() << " tensors)";

  // Wrap the user callback for invocation-order checks and logging, mirroring
  // readFromLoop above.
  fn = [this, sequenceNumber{op.sequenceNumber}, fn{std::move(fn)}](
           const Error& error) {
    TP_DCHECK_EQ(sequenceNumber, nextWriteCallbackToCall_++);
    TP_VLOG(1) << "Pipe " << id_ << " is calling a write callback (#"
               << sequenceNumber << ")";
    fn(error);
    TP_VLOG(1) << "Pipe " << id_ << " done calling a write callback (#"
               << sequenceNumber << ")";
  };

  size_t numTensors = message.tensors.size();
  op.tensors.resize(numTensors);
  for (size_t tensorIdx = 0; tensorIdx < numTensors; ++tensorIdx) {
    const Message::Tensor& tensor = message.tensors[tensorIdx];
    WriteOperation::Tensor& tensorBeingSent = op.tensors[tensorIdx];
    tensorBeingSent.sourceDevice = tensor.buffer.device();
    if (tensor.targetDevice.has_value()) {
      tensorBeingSent.targetDevice = *tensor.targetDevice;
    } else {
      // At least one tensor needs the receiver to pick its target device.
      op.hasMissingTargetDevices = true;
    }
  }

  op.message = std::move(message);
  op.writeCallback = std::move(fn);

  writeOps_.advanceOperation(opIter);
}

//
// Helpers to schedule our callbacks into user code
//

// Invokes (then releases) the user's readDescriptor callback for this op.
void PipeImpl::callReadDescriptorCallback(ReadOpIter opIter) {
  TP_DCHECK(context_->inLoop());
  ReadOperation& op = *opIter;
  op.readDescriptorCallback(error_, op.descriptor);
  // Reset callback to release the resources it was holding.
  op.readDescriptorCallback = nullptr;
}

// Invokes (then releases) the user's read callback for this op.
void PipeImpl::callReadCallback(ReadOpIter opIter) {
  TP_DCHECK(context_->inLoop());
  ReadOperation& op = *opIter;
  op.readCallback(error_);
  // Reset callback to release the resources it was holding.
  op.readCallback = nullptr;
}

// Invokes (then releases) the user's write callback for this op.
void PipeImpl::callWriteCallback(WriteOpIter opIter) {
  TP_DCHECK(context_->inLoop());
  WriteOperation& op = *opIter;
  op.writeCallback(error_);
  // Reset callback to release the resources it was holding.
  op.writeCallback = nullptr;
}

//
// Error handling
//

void PipeImpl::setError(Error error) {
  // Don't overwrite an error that's already set.
if (error_ || !error) {
    return;
  }

  error_ = std::move(error);

  handleError();
}

// Tears the pipe down after error_ has been set: closes connections and
// channels, unregisters pending connection requests on the listener, flushes
// both op state machines (so user callbacks fire with the error), and
// unenrolls from the context.
void PipeImpl::handleError() {
  TP_DCHECK(context_->inLoop());
  TP_VLOG(2) << "Pipe " << id_ << " is handling error " << error_.what();

  descriptorConnection_->close();
  if (descriptorReplyConnection_) {
    descriptorReplyConnection_->close();
  }
  for (auto& channelIter : channels_) {
    channelIter.second->close();
  }

  for (const auto& tokenIter : registrationIds_) {
    listener_->unregisterConnectionRequest(tokenIter.second);
  }
  registrationIds_.clear();
  for (const auto& iter : channelRegistrationIds_) {
    for (const auto& token : iter.second) {
      listener_->unregisterConnectionRequest(token);
    }
  }
  channelRegistrationIds_.clear();
  channelReceivedConnections_.clear();

  readOps_.advanceAllOperations();
  writeOps_.advanceAllOperations();

  context_->unenroll(*this);
}

//
// Everything else
//

// State machine driver for a single read operation: each attemptTransition is
// guarded by this op's progress flags and by the previous op's state
// (prevOpState), which enforces the ordering invariants spelled out in the
// comments below.
void PipeImpl::advanceReadOperation(
    ReadOpIter opIter,
    ReadOperation::State prevOpState) {
  TP_DCHECK(context_->inLoop());
  ReadOperation& op = *opIter;

  // Needs to go after previous op to ensure ordering of callback invocations.
  readOps_.attemptTransition(
      opIter,
      /*from=*/ReadOperation::UNINITIALIZED,
      /*to=*/ReadOperation::ASKING_FOR_ALLOCATION,
      /*cond=*/error_ && prevOpState >= ReadOperation::ASKING_FOR_ALLOCATION,
      /*actions=*/{&PipeImpl::callReadDescriptorCallback});

  // The ordering on the "wire" (the primary connection) is descriptor of op N,
  // then payloads of op N, then descriptor of op N+1. Hence this transition
  // must happen after the previous op scheduled its payload read, not just its
  // descriptor read.
  readOps_.attemptTransition(
      opIter,
      /*from=*/ReadOperation::UNINITIALIZED,
      /*to=*/ReadOperation::READING_DESCRIPTOR,
      /*cond=*/!error_ && state_ == ESTABLISHED &&
          prevOpState >= ReadOperation::READING_PAYLOADS_AND_RECEIVING_TENSORS,
      /*actions=*/{&PipeImpl::readDescriptorOfMessage});

  // Needs to go after previous op to ensure ordering of callback invocations.
  readOps_.attemptTransition(
      opIter,
      /*from=*/ReadOperation::READING_DESCRIPTOR,
      /*to=*/ReadOperation::ASKING_FOR_ALLOCATION,
      /*cond=*/op.doneReadingDescriptor &&
          prevOpState >= ReadOperation::ASKING_FOR_ALLOCATION,
      /*actions=*/{&PipeImpl::callReadDescriptorCallback});

  // Needs to wait for previous op to have _received_ the read call, as we can
  // only have exactly one operation at a time for which we expect a read call.
  readOps_.attemptTransition(
      opIter,
      /*from=*/ReadOperation::ASKING_FOR_ALLOCATION,
      /*to=*/ReadOperation::ASKING_FOR_ALLOCATION_FIRST_IN_LINE,
      /*cond=*/op.doneReadingDescriptor &&
          prevOpState >= ReadOperation::READING_PAYLOADS_AND_RECEIVING_TENSORS,
      /*actions=*/{&PipeImpl::expectReadCall});

  // Needs to go after previous op to ensure ordering of callback invocations.
  readOps_.attemptTransition(
      opIter,
      /*from=*/ReadOperation::ASKING_FOR_ALLOCATION_FIRST_IN_LINE,
      /*to=*/ReadOperation::FINISHED,
      /*cond=*/error_ && op.doneGettingAllocation &&
          prevOpState >= ReadOperation::FINISHED,
      /*actions=*/{&PipeImpl::callReadCallback});

  // No need to order this with the previous operation, since all it needs is
  // to come after this own op's descriptor read.
  // This transition shortcuts writing the descriptor reply when all target
  // devices were provided by the sender.
  readOps_.attemptTransition(
      opIter,
      /*from=*/ReadOperation::ASKING_FOR_ALLOCATION_FIRST_IN_LINE,
      /*to=*/ReadOperation::READING_PAYLOADS_AND_RECEIVING_TENSORS,
      /*cond=*/!error_ && op.doneGettingAllocation &&
          !op.hasMissingTargetDevices,
      /*actions=*/
      {&PipeImpl::readPayloadsOfMessage, &PipeImpl::receiveTensorsOfMessage});

  // No need to order this with the previous operation, since all it needs is
  // to come after this own op's descriptor read.
  readOps_.attemptTransition(
      opIter,
      /*from=*/ReadOperation::ASKING_FOR_ALLOCATION_FIRST_IN_LINE,
      /*to=*/ReadOperation::READING_PAYLOADS_AND_RECEIVING_TENSORS,
      /*cond=*/!error_ && op.doneGettingAllocation &&
          op.hasMissingTargetDevices,
      /*actions=*/
      {&PipeImpl::readPayloadsOfMessage,
       &PipeImpl::writeDescriptorReplyOfMessage,
       &PipeImpl::receiveTensorsOfMessage});

  // Needs to go after previous op to ensure ordering of callback invocations.
  readOps_.attemptTransition(
      opIter,
      /*from=*/ReadOperation::READING_PAYLOADS_AND_RECEIVING_TENSORS,
      /*to=*/ReadOperation::FINISHED,
      /*cond=*/op.numPayloadsBeingRead == 0 &&
          op.numTensorsBeingReceived == 0 &&
          prevOpState >= ReadOperation::FINISHED,
      /*actions=*/{&PipeImpl::callReadCallback});
}

// State machine driver for a single write operation; same pattern as
// advanceReadOperation above.
void PipeImpl::advanceWriteOperation(
    WriteOpIter opIter,
    WriteOperation::State prevOpState) {
  TP_DCHECK(context_->inLoop());
  WriteOperation& op = *opIter;

  // Needs to go after previous op to ensure ordering of callback invocations.
  writeOps_.attemptTransition(
      opIter,
      /*from=*/WriteOperation::UNINITIALIZED,
      /*to=*/WriteOperation::FINISHED,
      /*cond=*/error_ && prevOpState >= WriteOperation::FINISHED,
      /*actions=*/{&PipeImpl::callWriteCallback});

  // Needs to go after previous op to ensure predictable and consistent ordering
  // of write calls on the connection and send calls on the channels.
  // This transition shortcuts reading the target devices when they were all
  // provided by the user.
  writeOps_.attemptTransition(
      opIter,
      /*from=*/WriteOperation::UNINITIALIZED,
      /*to=*/WriteOperation::WRITING_PAYLOADS_AND_SENDING_TENSORS,
      /*cond=*/!error_ && state_ == ESTABLISHED &&
          !op.hasMissingTargetDevices &&
          prevOpState >= WriteOperation::WRITING_PAYLOADS_AND_SENDING_TENSORS,
      /*actions=*/
      {&PipeImpl::writeDescriptorOfMessage,
       &PipeImpl::writePayloadsOfMessage,
       &PipeImpl::sendTensorsOfMessage});

  // Needs to go after previous op to ensure predictable and consistent ordering
  // of write calls on the descriptor connection and read calls on the
  // descriptor reply connection.
  writeOps_.attemptTransition(
      opIter,
      /*from=*/WriteOperation::UNINITIALIZED,
      /*to=*/WriteOperation::WRITING_PAYLOADS_AND_READING_TARGET_DEVICES,
      /*cond=*/!error_ && state_ == ESTABLISHED && op.hasMissingTargetDevices &&
          prevOpState >=
              WriteOperation::WRITING_PAYLOADS_AND_READING_TARGET_DEVICES,
      /*actions=*/
      {&PipeImpl::writeDescriptorOfMessage,
       &PipeImpl::writePayloadsOfMessage,
       &PipeImpl::readDescriptorReplyOfMessage});

  // Needs to go after previous op to ensure ordering of callback invocations.
  writeOps_.attemptTransition(
      opIter,
      /*from=*/WriteOperation::WRITING_PAYLOADS_AND_READING_TARGET_DEVICES,
      /*to=*/WriteOperation::FINISHED,
      /*cond=*/error_ && op.numPayloadsBeingWritten == 0 &&
          op.doneReadingDescriptorReply &&
          prevOpState >= WriteOperation::FINISHED,
      /*actions=*/{&PipeImpl::callWriteCallback});

  // Needs to go after previous op to ensure predictable and consistent ordering
  // of send calls on channels.
  writeOps_.attemptTransition(
      opIter,
      /*from=*/WriteOperation::WRITING_PAYLOADS_AND_READING_TARGET_DEVICES,
      /*to=*/WriteOperation::WRITING_PAYLOADS_AND_SENDING_TENSORS,
      /*cond=*/!error_ && op.doneReadingDescriptorReply &&
          prevOpState >= WriteOperation::WRITING_PAYLOADS_AND_SENDING_TENSORS,
      /*actions=*/{&PipeImpl::sendTensorsOfMessage});

  // Needs to go after previous op to ensure ordering of callback invocations.
  writeOps_.attemptTransition(
      opIter,
      /*from=*/WriteOperation::WRITING_PAYLOADS_AND_SENDING_TENSORS,
      /*to=*/WriteOperation::FINISHED,
      /*cond=*/op.numPayloadsBeingWritten == 0 && op.numTensorsBeingSent == 0 &&
          prevOpState >= WriteOperation::FINISHED,
      /*actions=*/{&PipeImpl::callWriteCallback});
}

// Reads the next message descriptor (a nop-serialized object) off the primary
// connection; on completion records whether any tensor lacks a target device.
void PipeImpl::readDescriptorOfMessage(ReadOpIter opIter) {
  TP_DCHECK(context_->inLoop());
  ReadOperation& op = *opIter;

  TP_DCHECK_EQ(connectionState_, AWAITING_DESCRIPTOR);
  TP_DCHECK_EQ(messageBeingReadFromConnection_, op.sequenceNumber);
  // NOTE(review): template argument stripped by extraction; presumably
  // NopHolder<Descriptor> — TODO confirm against upstream.
  auto nopHolderIn = std::make_shared>();
  TP_VLOG(3) << "Pipe " << id_ << " is reading nop object (message descriptor #"
             << op.sequenceNumber << ")";
  descriptorConnection_->read(
      *nopHolderIn, callbackWrapper_([opIter, nopHolderIn](PipeImpl& impl) {
        TP_VLOG(3) << "Pipe " << impl.id_
                   << " done reading nop object (message descriptor #"
                   << opIter->sequenceNumber << ")";
        opIter->doneReadingDescriptor = true;
        if (!impl.error_) {
          opIter->descriptor = std::move(nopHolderIn->getObject());
          for (const auto& tensor : opIter->descriptor.tensors) {
            if (!tensor.targetDevice.has_value()) {
              opIter->hasMissingTargetDevices = true;
            }
          }
        }
        impl.readOps_.advanceOperation(opIter);
      }));
  connectionState_ = AWAITING_PAYLOADS;
}

// Marks this op as the (single) one for which the next user read() call is
// expected; consumed by readFromLoop.
void PipeImpl::expectReadCall(ReadOpIter opIter) {
  TP_DCHECK(context_->inLoop());
  // NOTE(review): `op` is unused in this method; kept as-is (byte-identical
  // doc pass) but could be removed.
  ReadOperation& op = *opIter;
  TP_DCHECK(!nextMessageGettingAllocation_.has_value());
  nextMessageGettingAllocation_ = opIter;
}

// Kicks off one channel send per tensor, choosing the channel from the
// (local source device, remote target device) pair map.
void PipeImpl::sendTensorsOfMessage(WriteOpIter opIter) {
  TP_DCHECK(context_->inLoop());
  WriteOperation& op = *opIter;

  TP_VLOG(2) << "Pipe " << id_ << " is sending tensors of message #"
             << op.sequenceNumber;
  TP_DCHECK_EQ(op.message.tensors.size(), op.tensors.size());
  for (size_t tensorIdx = 0; tensorIdx < op.message.tensors.size();
       ++tensorIdx) {
    const auto& tensor = op.message.tensors[tensorIdx];
    const Device& localDevice = op.tensors[tensorIdx].sourceDevice;
    // By this point target devices are known, either from the user or from
    // the descriptor reply.
    TP_DCHECK(op.tensors[tensorIdx].targetDevice.has_value());
    const Device&
remoteDevice = *op.tensors[tensorIdx].targetDevice;
    const auto& channelIter =
        channelForDevicePair_.find({localDevice, remoteDevice});
    TP_THROW_ASSERT_IF(channelIter == channelForDevicePair_.end())
        << "Could not find suitable channel for sending from local device "
        << localDevice.toString() << " to remote device "
        << remoteDevice.toString();

    const std::string& channelName = channelIter->second;
    channel::Channel& channel = *channels_[channelName];
    TP_VLOG(3) << "Pipe " << id_ << " is sending tensor #" << op.sequenceNumber
               << "." << tensorIdx;
    channel.send(
        tensor.buffer,
        tensor.length,
        callbackWrapper_([opIter, tensorIdx](PipeImpl& impl) {
          TP_VLOG(3) << "Pipe " << impl.id_ << " done sending tensor #"
                     << opIter->sequenceNumber << "." << tensorIdx;
          opIter->numTensorsBeingSent--;
          impl.writeOps_.advanceOperation(opIter);
        }));
    ++op.numTensorsBeingSent;
  }
}

// Serializes and writes this message's descriptor on the primary connection.
// The shared holder keeps the nop object alive until the write completes.
void PipeImpl::writeDescriptorOfMessage(WriteOpIter opIter) {
  TP_DCHECK(context_->inLoop());
  WriteOperation& op = *opIter;

  // NOTE(review): template argument stripped by extraction; presumably
  // std::shared_ptr<NopHolder<...>> — TODO confirm against upstream.
  std::shared_ptr> holder = makeDescriptorForMessage(op);
  TP_VLOG(3) << "Pipe " << id_
             << " is writing nop object (message descriptor #"
             << op.sequenceNumber << ")";
  descriptorConnection_->write(
      *holder,
      callbackWrapper_(
          [sequenceNumber{op.sequenceNumber}, holder](PipeImpl& impl) {
            TP_VLOG(3) << "Pipe " << impl.id_
                       << " done writing nop object (message descriptor #"
                       << sequenceNumber << ")";
          }));
}

// Issues one asynchronous connection write per payload; completion callbacks
// decrement numPayloadsBeingWritten and re-advance the state machine.
void PipeImpl::writePayloadsOfMessage(WriteOpIter opIter) {
  TP_DCHECK(context_->inLoop());
  WriteOperation& op = *opIter;

  TP_VLOG(2) << "Pipe " << id_ << " is writing payloads of message #"
             << op.sequenceNumber;
  for (size_t payloadIdx = 0; payloadIdx < op.message.payloads.size();
       payloadIdx++) {
    Message::Payload& payload = op.message.payloads[payloadIdx];
    TP_VLOG(3) << "Pipe " << id_ << " is writing payload #" << op.sequenceNumber
               << "." << payloadIdx;
    descriptorConnection_->write(
        payload.data,
        payload.length,
        callbackWrapper_([opIter, payloadIdx](PipeImpl& impl) {
          TP_VLOG(3) << "Pipe " << impl.id_ << " done writing payload #"
                     << opIter->sequenceNumber << "." << payloadIdx;
          opIter->numPayloadsBeingWritten--;
          impl.writeOps_.advanceOperation(opIter);
        }));
    ++op.numPayloadsBeingWritten;
  }
}

// Reads the receiver's descriptor reply (carrying the target devices it chose)
// from the reply connection; only used when some target devices were missing.
void PipeImpl::readDescriptorReplyOfMessage(WriteOpIter opIter) {
  TP_DCHECK(context_->inLoop());
  WriteOperation& op = *opIter;
  TP_DCHECK(op.hasMissingTargetDevices);

  // NOTE(review): template argument stripped by extraction here too.
  auto nopHolderIn = std::make_shared>();
  TP_VLOG(3) << "Pipe " << id_
             << " is reading nop object (message descriptor reply #"
             << op.sequenceNumber << ")";
  descriptorReplyConnection_->read(
      *nopHolderIn, callbackWrapper_([opIter, nopHolderIn](PipeImpl& impl) {
        TP_VLOG(3) << "Pipe " << impl.id_
                   << " done reading nop object (message descriptor reply #"
                   << opIter->sequenceNumber << ")";
        opIter->doneReadingDescriptorReply = true;
        if (!impl.error_) {
          parseDescriptorReplyOfMessage(
              *opIter, std::move(nopHolderIn->getObject()));
        }
        impl.writeOps_.advanceOperation(opIter);
      }));
}

// Server-side handshake step: picks the transport and channels based on the
// client's brochure, registers any replacement/extra connections with the
// listener, and writes back the brochure answer.
void PipeImpl::onReadWhileServerWaitingForBrochure(
    const Brochure& nopBrochure) {
  TP_DCHECK(context_->inLoop());
  TP_DCHECK_EQ(state_, SERVER_WAITING_FOR_BROCHURE);
  auto nopHolderOut = std::make_shared>();
  BrochureAnswer& nopBrochureAnswer = nopHolderOut->getObject();

  auto transport = selectTransport(
      context_->getOrderedTransports(),
      nopBrochure.transportDomainDescriptors,
      listener_->addresses());

  // If the chosen transport differs from the one the initial connection uses,
  // ask the client to open a replacement descriptor connection.
  if (transport.name != transport_) {
    transport_ = transport.name;
    nopBrochureAnswer.transportRegistrationIds[ConnectionId::DESCRIPTOR] =
        registerTransport(ConnectionId::DESCRIPTOR);
  }
  nopBrochureAnswer.transportRegistrationIds[ConnectionId::DESCRIPTOR_REPLY] =
      registerTransport(ConnectionId::DESCRIPTOR_REPLY);

  nopBrochureAnswer.transport = transport.name;
  nopBrochureAnswer.address = transport.address;
  nopBrochureAnswer.transportDomainDescriptor = transport.domainDescriptor;

  SelectedChannels selectedChannels = selectChannels(
      context_->getOrderedChannels(), nopBrochure.channelDeviceDescriptors);
  channelForDevicePair_ = std::move(selectedChannels.channelForDevicePair);
  nopBrochureAnswer.channelForDevicePair = channelForDevicePair_;

  for (auto& descriptorsIter : selectedChannels.descriptorsMap) {
    const std::string& channelName = descriptorsIter.first;
    nopBrochureAnswer.channelRegistrationIds[channelName] =
        registerChannel(channelName);
    // NOTE(review): template arguments stripped by extraction (presumably
    // std::unordered_map<Device, std::string> — TODO confirm).
    std::unordered_map& deviceDescriptors = descriptorsIter.second;
    nopBrochureAnswer.channelDeviceDescriptors[channelName] =
        std::move(deviceDescriptors);
  }

  TP_VLOG(3) << "Pipe " << id_ << " is writing nop object (brochure answer)";
  descriptorConnection_->write(
      *nopHolderOut, callbackWrapper_([nopHolderOut](PipeImpl& impl) {
        TP_VLOG(3) << "Pipe " << impl.id_
                   << " done writing nop object (brochure answer)";
      }));

  // If no replacement connections or channel connections are pending, the
  // handshake is complete and queued ops can proceed.
  if (!pendingRegistrations()) {
    state_ = ESTABLISHED;
    readOps_.advanceAllOperations();
    writeOps_.advanceAllOperations();
  } else {
    state_ = SERVER_WAITING_FOR_CONNECTIONS;
  }
}

// Registers with the listener a request for one incoming replacement
// connection for the given role; returns the registration token.
uint64_t PipeImpl::registerTransport(ConnectionId connId) {
  TP_DCHECK(registrationIds_.count(connId) == 0);
  TP_VLOG(3) << "Pipe " << id_ << " is requesting connection (as replacement)";
  uint64_t token = listener_->registerConnectionRequest(
      callbackWrapper_([connId](
                           PipeImpl& impl,
                           std::string transport,
                           std::shared_ptr connection) {
        TP_VLOG(3) << "Pipe " << impl.id_
                   << " done requesting connection (as replacement)";
        if (!impl.error_) {
          impl.onAcceptWhileServerWaitingForConnection(
              connId, std::move(transport), std::move(connection));
        }
      }));
  registrationIds_[connId] = token;
  return token;
}

// Registers with the listener one connection request per connection the named
// channel needs; returns the list of registration tokens (sent to the client
// in the brochure answer).
std::vector& PipeImpl::registerChannel(
    const std::string& channelName) {
  const channel::Context& channelContext = *context_->getChannel(channelName);
  const size_t numConnectionsNeeded = channelContext.numConnectionsNeeded();
  auto& channelRegistrationIds = channelRegistrationIds_[channelName];
  channelRegistrationIds.resize(numConnectionsNeeded);
  auto& channelReceivedConnections = channelReceivedConnections_[channelName];
  channelReceivedConnections.resize(numConnectionsNeeded);
  for (size_t connId = 0; connId < numConnectionsNeeded; ++connId) {
    TP_VLOG(3) << "Pipe " << id_ << " is requesting connection " << connId
               << "/" << numConnectionsNeeded << " (for channel " << channelName
               << ")";
    uint64_t token = listener_->registerConnectionRequest(callbackWrapper_(
        [channelName, connId, numConnectionsNeeded](
            PipeImpl& impl,
            std::string transport,
            std::shared_ptr connection) {
          TP_VLOG(3) << "Pipe " << impl.id_ << " done requesting connection "
                     << connId << "/" << numConnectionsNeeded
                     << " (for channel " << channelName << ")";
          if (!impl.error_) {
            impl.onAcceptWhileServerWaitingForChannel(
                channelName,
                connId,
                std::move(transport),
                std::move(connection));
          }
        }));
    channelRegistrationIds[connId] = token;
  }

  return channelRegistrationIds;
}

// Client-side handshake step: validates the server's choices, opens the
// replacement/extra connections it requested, recomputes and cross-checks the
// channel map, and creates the channels.
void PipeImpl::onReadWhileClientWaitingForBrochureAnswer(
    const BrochureAnswer& nopBrochureAnswer) {
  TP_DCHECK(context_->inLoop());
  TP_DCHECK_EQ(state_, CLIENT_WAITING_FOR_BROCHURE_ANSWER);

  const std::string& transport = nopBrochureAnswer.transport;
  std::string address = nopBrochureAnswer.address;
  std::shared_ptr transportContext = context_->getTransport(transport);
  TP_DCHECK(transportContext->canCommunicateWithRemote(
      nopBrochureAnswer.transportDomainDescriptor))
      << "The two endpoints disagree on whether transport " << transport
      << " can be used to communicate";

  if (transport != transport_) {
    TP_VLOG(3) << "Pipe " << id_
               << " is opening connection (descriptor, as replacement)";
    std::shared_ptr connection = transportContext->connect(address);
    connection->setId(id_ + ".d.tr_" + transport);
    const auto& transportRegistrationIter =
        nopBrochureAnswer.transportRegistrationIds.find(
            ConnectionId::DESCRIPTOR);
    TP_DCHECK(
        transportRegistrationIter !=
        nopBrochureAnswer.transportRegistrationIds.end());
    initConnection(*connection, transportRegistrationIter->second);
    transport_ = transport;
descriptorConnection_ = std::move(connection);
  }

  {
    TP_VLOG(3) << "Pipe " << id_ << " is opening connection (descriptor_reply)";
    std::shared_ptr connection = transportContext->connect(address);
    connection->setId(id_ + ".r.tr_" + transport);
    const auto& transportRegistrationIter =
        nopBrochureAnswer.transportRegistrationIds.find(
            ConnectionId::DESCRIPTOR_REPLY);
    TP_DCHECK(
        transportRegistrationIter !=
        nopBrochureAnswer.transportRegistrationIds.end());
    initConnection(*connection, transportRegistrationIter->second);
    descriptorReplyConnection_ = std::move(connection);
  }

  // Recompute the channel map based on this side's channels and priorities.
  SelectedChannels selectedChannels = selectChannels(
      context_->getOrderedChannels(),
      nopBrochureAnswer.channelDeviceDescriptors);
  channelForDevicePair_ = std::move(selectedChannels.channelForDevicePair);

  // Verify that the locally and remotely computed channel maps are consistent.
  // Note the (remote, local) key flip when looking up the server's map.
  TP_THROW_ASSERT_IF(
      nopBrochureAnswer.channelForDevicePair.size() !=
      channelForDevicePair_.size())
      << "Inconsistent channel selection";
  for (const auto& iter : channelForDevicePair_) {
    Device localDevice;
    Device remoteDevice;
    std::tie(localDevice, remoteDevice) = iter.first;
    const std::string& channelName = iter.second;
    const auto& answerIter = nopBrochureAnswer.channelForDevicePair.find(
        {remoteDevice, localDevice});
    TP_THROW_ASSERT_IF(
        answerIter == nopBrochureAnswer.channelForDevicePair.end())
        << "Inconsistent channel selection";
    TP_THROW_ASSERT_IF(answerIter->second != channelName)
        << "Inconsistent channel selection";
  }

  // Open the connections each selected channel needs and create the channels
  // as the connecting endpoint.
  for (const auto& channelDeviceDescriptorsIter :
       selectedChannels.descriptorsMap) {
    const std::string& channelName = channelDeviceDescriptorsIter.first;
    std::shared_ptr channelContext = context_->getChannel(channelName);

    const std::vector& registrationIds =
        nopBrochureAnswer.channelRegistrationIds.at(channelName);
    const size_t numConnectionsNeeded = channelContext->numConnectionsNeeded();
    TP_DCHECK_EQ(numConnectionsNeeded, registrationIds.size());
    // NOTE(review): template arguments stripped by extraction in the two
    // declarations below — TODO restore from upstream.
    std::vector> connections(numConnectionsNeeded);
    for (size_t connId = 0; connId < numConnectionsNeeded; ++connId) {
      TP_VLOG(3) << "Pipe " << id_ << " is opening connection " << connId
                 << "/" << numConnectionsNeeded << " (for channel "
                 << channelName << ")";
      std::shared_ptr connection = transportContext->connect(address);
      connection->setId(
          id_ + ".ch_" + channelName + "_" + std::to_string(connId));
      initConnection(*connection, registrationIds[connId]);
      connections[connId] = std::move(connection);
    }

    std::shared_ptr channel = channelContext->createChannel(
        std::move(connections), channel::Endpoint::kConnect);
    channel->setId(id_ + ".ch_" + channelName);
    channels_.emplace(channelName, std::move(channel));
  }

  state_ = ESTABLISHED;
  readOps_.advanceAllOperations();
  writeOps_.advanceAllOperations();
}

// Writes a RequestedConnection packet carrying the registration token on a
// freshly opened connection, so the server can route it to the right pipe.
void PipeImpl::initConnection(
    transport::Connection& connection,
    uint64_t token) {
  auto nopHolderOut = std::make_shared>();
  Packet& nopPacketOut = nopHolderOut->getObject();
  nopPacketOut.Become(nopPacketOut.index_of());
  RequestedConnection& nopRequestedConnection = *nopPacketOut.get();
  nopRequestedConnection.registrationId = token;
  TP_VLOG(3) << "Pipe " << id_
             << " is writing nop object (requested connection)";
  connection.write(
      *nopHolderOut, callbackWrapper_([nopHolderOut](PipeImpl& impl) {
        TP_VLOG(3) << "Pipe " << impl.id_
                   << " done writing nop object (requested connection)";
      }));
}

// Server side: an expected replacement connection has arrived; slot it into
// the matching role and, if nothing else is pending, finish the handshake.
void PipeImpl::onAcceptWhileServerWaitingForConnection(
    ConnectionId connId,
    std::string receivedTransport,
    std::shared_ptr receivedConnection) {
  TP_DCHECK(context_->inLoop());
  TP_DCHECK_EQ(state_, SERVER_WAITING_FOR_CONNECTIONS);
  const auto& registrationIdIter = registrationIds_.find(connId);
  TP_DCHECK(registrationIdIter != registrationIds_.end());
  size_t token = registrationIdIter->second;
  listener_->unregisterConnectionRequest(token);
  registrationIds_.erase(registrationIdIter);
  TP_DCHECK_EQ(transport_, receivedTransport);
  switch (connId) {
    case ConnectionId::DESCRIPTOR:
      receivedConnection->setId(id_ + ".d.tr_" + receivedTransport);
      descriptorConnection_ = std::move(receivedConnection);
      break;
    case ConnectionId::DESCRIPTOR_REPLY:
      receivedConnection->setId(id_ + ".r.tr_" + receivedTransport);
      descriptorReplyConnection_ = std::move(receivedConnection);
      break;
    default:
      TP_THROW_ASSERT() << "Unrecognized connection identifier";
  }
  if (!pendingRegistrations()) {
    state_ = ESTABLISHED;
    readOps_.advanceAllOperations();
    writeOps_.advanceAllOperations();
  }
}

// Server side: one of a channel's expected connections has arrived; once all
// of them are in, create the channel as the listening endpoint.
void PipeImpl::onAcceptWhileServerWaitingForChannel(
    std::string channelName,
    size_t connId,
    std::string receivedTransport,
    std::shared_ptr receivedConnection) {
  TP_DCHECK(context_->inLoop());
  TP_DCHECK_EQ(state_, SERVER_WAITING_FOR_CONNECTIONS);
  TP_DCHECK_EQ(transport_, receivedTransport);
  auto channelRegistrationIdsIter = channelRegistrationIds_.find(channelName);
  TP_DCHECK(channelRegistrationIdsIter != channelRegistrationIds_.end());
  listener_->unregisterConnectionRequest(
      channelRegistrationIdsIter->second[connId]);
  receivedConnection->setId(
      id_ + ".ch_" + channelName + "_" + std::to_string(connId));
  channelReceivedConnections_[channelName][connId] =
      std::move(receivedConnection);
  // TODO: If we can guarantee the order in which the accept() calls happen,
  // this check can be replaced with `if (connId == numConnectionsNeeded -
  // 1)`.
  for (const auto& conn : channelReceivedConnections_[channelName]) {
    if (conn == nullptr) {
      return;
    }
  }

  std::shared_ptr channelContext = context_->getChannel(channelName);

  std::shared_ptr channel = channelContext->createChannel(
      std::move(channelReceivedConnections_[channelName]),
      channel::Endpoint::kListen);
  channel->setId(id_ + ".ch_" + channelName);
  channelRegistrationIds_.erase(channelRegistrationIdsIter);
  channelReceivedConnections_.erase(channelName);

  TP_DCHECK(channels_.find(channelName) == channels_.end());
  channels_.emplace(channelName, std::move(channel));

  if (!pendingRegistrations()) {
    state_ = ESTABLISHED;
    readOps_.advanceAllOperations();
    writeOps_.advanceAllOperations();
  }
}

// True while the server is still waiting for replacement-transport or channel
// connections registered with the listener.
bool PipeImpl::pendingRegistrations() {
  if (!registrationIds_.empty()) {
    return true;
  }

  if (!channelRegistrationIds_.empty()) {
    return true;
  }

  return false;
}

} // namespace tensorpipe

================================================
FILE: tensorpipe/core/pipe_impl.h
================================================

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

// NOTE(review): the extraction step stripped the targets of all #include
// directives below (and template argument lists further down) — TODO restore
// from upstream before compiling.
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

namespace tensorpipe {

class ContextImpl;
class ListenerImpl;

// Bookkeeping for one in-flight read: its state-machine position, progress
// counters, user callbacks, the received descriptor and the user allocation.
struct ReadOperation {
  enum State {
    UNINITIALIZED,
    READING_DESCRIPTOR,
    ASKING_FOR_ALLOCATION,
    ASKING_FOR_ALLOCATION_FIRST_IN_LINE,
    READING_PAYLOADS_AND_RECEIVING_TENSORS,
    FINISHED
  };

  // Fields used by the state machine
  uint64_t sequenceNumber{0};
  State state{UNINITIALIZED};

  // Progress flags
  bool doneReadingDescriptor{false};
  bool doneGettingAllocation{false};
  uint64_t numPayloadsBeingRead{0};
  uint64_t numTensorsBeingReceived{0};

  // Callbacks.
Pipe::read_descriptor_callback_fn readDescriptorCallback;
  Pipe::read_callback_fn readCallback;

  // Arguments at creation
  bool hasMissingTargetDevices{false};
  Descriptor descriptor;

  // Buffers allocated by the user.
  Allocation allocation;
};

// Bookkeeping for one in-flight write: state-machine position, progress
// counters, user callback, the message and per-tensor device routing info.
struct WriteOperation {
  enum State {
    UNINITIALIZED,
    WRITING_PAYLOADS_AND_READING_TARGET_DEVICES,
    WRITING_PAYLOADS_AND_SENDING_TENSORS,
    FINISHED
  };

  // Fields used by the state machine
  uint64_t sequenceNumber{0};
  State state{UNINITIALIZED};

  // Progress flags
  bool doneReadingDescriptorReply{false};
  uint64_t numPayloadsBeingWritten{0};
  uint64_t numTensorsBeingSent{0};

  // Callbacks.
  Pipe::write_callback_fn writeCallback;

  // Arguments at creation
  bool hasMissingTargetDevices{false};
  Message message;

  struct Tensor {
    Device sourceDevice;
    // Empty until provided by the user or filled in from the receiver's
    // descriptor reply.
    optional targetDevice;
  };
  std::vector tensors;
};

// NOTE(review): template argument lists are stripped throughout this class
// declaration (extraction artifact) — restore from upstream before compiling.
class PipeImpl final : public std::enable_shared_from_this {
 public:
  PipeImpl(
      std::shared_ptr context,
      std::string id,
      std::string remoteName,
      const std::string& url);

  PipeImpl(
      std::shared_ptr context,
      std::shared_ptr listener,
      std::string id,
      std::string remoteName,
      std::string transport,
      std::shared_ptr connection);

  // Called by the pipe's constructor.
  void init();

  using read_descriptor_callback_fn = Pipe::read_descriptor_callback_fn;
  using read_callback_fn = Pipe::read_callback_fn;
  using write_callback_fn = Pipe::write_callback_fn;

  void readDescriptor(read_descriptor_callback_fn fn);
  void read(Allocation allocation, read_callback_fn fn);
  void write(Message message, write_callback_fn fn);

  const std::string& getRemoteName();

  void close();

 private:
  // Loop-side counterparts of the public entry points.
  void initFromLoop();
  void readDescriptorFromLoop(read_descriptor_callback_fn fn);
  void readFromLoop(Allocation allocation, read_callback_fn fn);
  void writeFromLoop(Message message, write_callback_fn fn);
  void closeFromLoop();

  // Handshake/lifecycle state of the whole pipe.
  enum State {
    INITIALIZING,
    CLIENT_ABOUT_TO_SEND_HELLO_AND_BROCHURE,
    SERVER_WAITING_FOR_BROCHURE,
    CLIENT_WAITING_FOR_BROCHURE_ANSWER,
    SERVER_WAITING_FOR_CONNECTIONS,
    ESTABLISHED
  };

  State state_{INITIALIZING};
  std::shared_ptr context_;
  std::shared_ptr listener_;

  // An identifier for the pipe, composed of the identifier for the context or
  // listener, combined with an increasing sequence number. It will only be used
  // for logging and debugging purposes.
  std::string id_;

  // The name the user has given to the connect method of the local context (for
  // outgoing pipes) or to the constructor of the context on the remote end (for
  // incoming pipes).
  std::string remoteName_;

  std::string transport_;
  enum ConnectionId { DESCRIPTOR, DESCRIPTOR_REPLY };
  std::shared_ptr descriptorConnection_;
  std::shared_ptr descriptorReplyConnection_;
  std::unordered_map> channels_;
  // Maps a (local device, remote device) pair to the name of the channel that
  // will carry tensors between those devices.
  std::unordered_map, std::string> channelForDevicePair_;

  // The server will set this up when it tells the client to switch to a
  // different connection or to open some channels.
  std::unordered_map registrationIds_;
  std::unordered_map> channelRegistrationIds_;
  std::unordered_map<
      std::string,
      std::vector>>
      channelReceivedConnections_;

  OpsStateMachine readOps_{
      *this, &PipeImpl::advanceReadOperation};
  using ReadOpIter = decltype(readOps_)::Iter;
  OpsStateMachine writeOps_{
      *this, &PipeImpl::advanceWriteOperation};
  using WriteOpIter = decltype(writeOps_)::Iter;

  // A sequence number for the calls to read and write.
  uint64_t nextMessageBeingRead_{0};
  uint64_t nextMessageBeingWritten_{0};

  // A sequence number for the invocations of the callbacks of read and write.
  uint64_t nextReadDescriptorCallbackToCall_{0};
  uint64_t nextReadCallbackToCall_{0};
  uint64_t nextWriteCallbackToCall_{0};

  // When reading, we first read the descriptor, then signal this to the user,
  // and only once the user has allocated the memory we read the payloads. These
  // members store where we are in this loop, i.e., whether the next buffer we
  // will read from the connection will be a descriptor or a payload, and the
  // sequence number of which message that will be for.
  enum ConnectionState { AWAITING_DESCRIPTOR, AWAITING_PAYLOADS };
  ConnectionState connectionState_{AWAITING_DESCRIPTOR};
  uint64_t messageBeingReadFromConnection_{0};

  // When reading, each message will be presented to the user in order for some
  // memory to be allocated for its payloads and tensors (this happens by
  // calling the readDescriptor callback and waiting for a read call). Under
  // normal operation there will be either 0 or 1 messages whose allocation is
  // pending, but there could be more after an error occurs, as we'll flush all
  // callbacks. We need to remember which is the first such operation for which
  // we're waiting for allocation in order to match calls to read to the right
  // message and for sanity checks. We do so by using a special state in the
  // state machine to identify the next operation that will receive a read call,
  // and store its iterator in this field.
  optional nextMessageGettingAllocation_;

  Error error_{Error::kSuccess};

  //
  // Helpers to prepare callbacks from transports and listener
  //

  CallbackWrapper callbackWrapper_{*this, *this->context_};

  //
  // Error handling
  //

  void setError(Error error);

  void handleError();

  //
  // State machines
  //

  // Transitions for the pipe's initial handshake.
  // On the client side:
  void onReadWhileClientWaitingForBrochureAnswer(
      const BrochureAnswer& nopBrochureAnswer);
  // On the server side:
  void onReadWhileServerWaitingForBrochure(const Brochure& nopBrochure);
  void onAcceptWhileServerWaitingForConnection(
      ConnectionId connId,
      std::string receivedTransport,
      std::shared_ptr receivedConnection);
  void onAcceptWhileServerWaitingForChannel(
      std::string channelName,
      size_t connId,
      std::string receivedTransport,
      std::shared_ptr receivedConnection);

  // State machines for read and write ops.
  void advanceReadOperation(
      ReadOpIter opIter,
      ReadOperation::State prevOpState);
  void advanceWriteOperation(
      WriteOpIter opIter,
      WriteOperation::State prevOpState);

  // Actions (i.e., methods that begin a state transition).
  // For read operations:
  void readDescriptorOfMessage(ReadOpIter opIter);
  void callReadDescriptorCallback(ReadOpIter opIter);
  void expectReadCall(ReadOpIter opIter);
  void readPayloadsOfMessage(ReadOpIter opIter);
  void receiveTensorsOfMessage(ReadOpIter opIter);
  void writeDescriptorReplyOfMessage(ReadOpIter opIter);
  void callReadCallback(ReadOpIter opIter);
  // For write operations:
  void writeDescriptorOfMessage(WriteOpIter opIter);
  void writePayloadsOfMessage(WriteOpIter opIter);
  void readDescriptorReplyOfMessage(WriteOpIter opIter);
  void sendTensorsOfMessage(WriteOpIter opIter);
  void callWriteCallback(WriteOpIter opIter);

  //
  // Everything else
  //

  void initConnection(transport::Connection& connection, uint64_t token);
  uint64_t registerTransport(ConnectionId connId);
  std::vector& registerChannel(const std::string& channelName);
  bool pendingRegistrations();

  template
  friend class CallbackWrapper;

  // Contexts and listeners do sometimes need to call directly into initFromLoop
  // and closeFromLoop, in order to make sure that some of their operations can
  // happen "atomically" on the connection, without possibly other operations
  // occurring in between (e.g., an error).
  friend ContextImpl;
  friend ListenerImpl;
};

} // namespace tensorpipe

================================================
FILE: tensorpipe/misc/CMakeLists.txt
================================================

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
# Standalone developer tool; requires an LLVM/Clang installation that
# provides the Clang CMake package config (find_package(Clang)).
add_executable(dump_state_machine dump_state_machine.cc)
find_package(Clang REQUIRED)
target_include_directories(dump_state_machine PRIVATE ${CLANG_INCLUDE_DIRS})
# LibTooling + AST-matcher libraries used by dump_state_machine.cc.
target_link_libraries(dump_state_machine PRIVATE clangTooling clangBasic clangASTMatchers)
================================================
FILE: tensorpipe/misc/dump_state_machine.cc
================================================
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

// NOTE(review): the header names of these includes were lost in extraction
// (the text between the include brackets was stripped); they cannot be
// reconstructed from this view alone.
#include
#include
#include
#include
#include
#include
#include
#include
#include

using namespace clang::ast_matchers;
using namespace clang::tooling;
using namespace llvm;

namespace {

// Pretty-prints a Clang AST expression back into source-like text.
std::string exprToString(const clang::Expr& e) {
  std::string statement;
  raw_string_ostream stream(statement);
  e.printPretty(stream, nullptr, clang::PrintingPolicy(clang::LangOptions()));
  stream.flush();
  return statement;
}

// Strips noise from a pretty-printed expression: leading
// "struct Foo::"/"class Foo::" qualifiers and explicit "this->".
std::string cleanUp(const std::string& s) {
  std::string res = s;
  res = std::regex_replace(res, std::regex("(struct|class) [a-zA-Z_]+::"), "");
  res = std::regex_replace(res, std::regex("this->"), "");
  return res;
}

// Backslash-escapes the characters ({ } < > |) that are special inside
// Graphviz record-shaped node labels.
std::string escape(const std::string& s) {
  std::string res = s;
  res = std::regex_replace(res, std::regex("\\{"), "\\{");
  res = std::regex_replace(res, std::regex("\\}"), "\\}");
  res = std::regex_replace(res, std::regex(">"), "\\>");
  res = std::regex_replace(res, std::regex("<"), "\\<");
  res = std::regex_replace(res, std::regex("\\|"), "\\|");
  return res;
}

// Match callback: for every matched call expression it prints Graphviz
// nodes and edges (to stdout) describing one state-machine transition.
class MethodPrinter : public MatchFinder::MatchCallback {
  // Labels of the state nodes emitted so far, so each node is printed once.
  // NOTE(review): the element type of this set was stripped by extraction.
  std::unordered_set nodes_;

  // Emits a Graphviz node for a state label and records it as seen.
  void addNode(const std::string& label) {
    std::cout << label << " [label=<" << label
              << ">,group=states,fontstyle=\"bold\"];" << std::endl;
    nodes_.insert(label);
  }

 public:
  void run(const MatchFinder::MatchResult& result) override {
    // Monotonic counter giving each transition edge a unique id.
    static int edgeCount = 0;
    // NOTE(review): the template argument of getNodeAs was stripped by
    // extraction (presumably the matched call-expression node type).
    const clang::CallExpr& e = *result.Nodes.getNodeAs("x");
std::string edgeId = "edge" + std::to_string(edgeCount++); std::string fromId = cleanUp(exprToString(*e.getArg(1))); std::string toId = cleanUp(exprToString(*e.getArg(2))); if (nodes_.count(fromId) == 0) { addNode(fromId); } if (nodes_.count(toId) == 0) { addNode(toId); } std::string edgeColor = "orange3"; int edgeWeight = 100; std::string cond = cleanUp(exprToString(*e.getArg(3))); if (std::regex_search(cond, std::regex("^error_"))) { edgeColor = "red3"; edgeWeight = 0; } if (std::regex_search(cond, std::regex("^!error_"))) { edgeColor = "forestgreen"; } cond = std::regex_replace(cond, std::regex(" \\&\\&"), "\\n"); cond = escape(cond); std::string actions = cleanUp(exprToString(*e.getArg(4))); actions = std::regex_replace(actions, std::regex("(\\{|\\})"), ""); actions = std::regex_replace(actions, std::regex(", "), "\\n"); actions = std::regex_replace(actions, std::regex("\\&"), ""); std::cout << edgeId << " [label=\"{" << cond << "|" << actions << "}\",shape=record,style=\"rounded,dashed\",color=\"" << edgeColor << "\"];" << std::endl; std::cout << fromId << " -> " << edgeId << "[dir=\"none\",color=\"" << edgeColor << "\",style=\"dashed\",weight=" << edgeWeight << "];" << std::endl; std::cout << edgeId << " -> " << toId << "[color=\"" << edgeColor << "\",style=\"dashed\",weight=" << edgeWeight << "];" << std::endl; } }; } // namespace int main(int argc, const char* argv[]) { cl::OptionCategory category("dump_state_machine"); cl::opt methodName( "method", cl::Required, cl::cat(category), cl::desc( "Name of the method implementing the state machine's transitions."), cl::value_desc("method_name")); CommonOptionsParser optionsParser(argc, argv, category, cl::Required); ClangTool tool( optionsParser.getCompilations(), optionsParser.getSourcePathList()); auto methodMatcher = callExpr( callee(cxxMethodDecl(hasName("attemptTransition"))), hasAncestor(cxxMethodDecl(hasName(methodName)))) .bind("x"); MethodPrinter printer; MatchFinder finder; 
finder.addMatcher(methodMatcher, &printer); std::cout << "digraph {" << std::endl << "graph [rankdir=TB]" << std::endl << "node [shape=box]" << std::endl; int res = tool.run(newFrontendActionFactory(&finder).get()); std::cout << "}" << std::endl; return res; } ================================================ FILE: tensorpipe/python/CMakeLists.txt ================================================ # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. if(NOT (COMMAND pybind11_add_module)) add_subdirectory( ${PROJECT_SOURCE_DIR}/third_party/pybind11 ${PROJECT_BINARY_DIR}/third_party/pybind11 EXCLUDE_FROM_ALL) endif() set(PYBIND11_CPP_STANDARD -std=c++17) pybind11_add_module(pytensorpipe tensorpipe.cc) target_link_libraries(pytensorpipe PRIVATE tensorpipe) ================================================ FILE: tensorpipe/python/tensorpipe.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include #include #include #include #include namespace py = pybind11; namespace { using tensorpipe::optional; // RAII wrapper to reliably release every buffer we get. 
class BufferWrapper {
 public:
  // Acquires a view of the Python buffer via the CPython buffer protocol;
  // re-raises the pending Python error if acquisition fails.
  BufferWrapper(const py::buffer& buffer, int flags) {
    if (PyObject_GetBuffer(buffer.ptr(), &buffer_, flags) != 0) {
      throw py::error_already_set();
    }
  }

  // Non-copyable and non-movable: the destructor must release the
  // Py_buffer exactly once.
  BufferWrapper(const BufferWrapper& other) = delete;
  BufferWrapper(BufferWrapper&& other) = delete;
  BufferWrapper& operator=(const BufferWrapper& other) = delete;
  BufferWrapper& operator=(BufferWrapper&& other) = delete;

  ~BufferWrapper() {
    PyBuffer_Release(&buffer_);
  }

  // Raw pointer to the underlying memory (non-owning).
  void* ptr() const {
    return buffer_.buf;
  }

  // Length of the buffer, in bytes.
  size_t length() const {
    return buffer_.len;
  }

  // Exposes the memory back to Python as a one-dimensional buffer view.
  // NOTE(review): the template argument of format_descriptor and of the
  // static_cast were stripped by extraction.
  py::buffer_info getBuffer() {
    return py::buffer_info(
        buffer_.buf,
        1,
        py::format_descriptor::format(),
        1,
        {static_cast(buffer_.len)},
        {1});
  }

 private:
  Py_buffer buffer_;
};

// Send-side payload: a data buffer plus its metadata buffer, both pinned
// for the duration of the write via BufferWrapper.
class OutgoingPayload {
 public:
  BufferWrapper buffer;
  BufferWrapper metadata;
  OutgoingPayload(const py::buffer& buffer, const py::buffer& metadata)
      : buffer(buffer, PyBUF_SIMPLE), metadata(metadata, PyBUF_SIMPLE) {}
};

// Send-side tensor: a data buffer plus its metadata buffer.
class OutgoingTensor {
 public:
  BufferWrapper buffer;
  BufferWrapper metadata;
  OutgoingTensor(const py::buffer& buffer, const py::buffer& metadata)
      : buffer(buffer, PyBUF_SIMPLE), metadata(metadata, PyBUF_SIMPLE) {}
};

// Send-side message: top-level metadata plus its payloads and tensors.
// NOTE(review): the element types of these vectors (presumably
// shared_ptr to OutgoingPayload/OutgoingTensor) were stripped by extraction.
class OutgoingMessage {
 public:
  BufferWrapper metadata;
  std::vector> payloads;
  std::vector> tensors;
  OutgoingMessage(
      const py::buffer& metadata,
      const std::vector>& payloads,
      const std::vector>& tensors)
      : metadata(metadata, PyBUF_SIMPLE),
        payloads(payloads),
        tensors(tensors) {}
};

// Converts a Python-side OutgoingMessage into a tensorpipe::Message that
// aliases (does not copy) the underlying Python buffers, so the
// OutgoingMessage must stay alive until the write completes.
// NOTE(review): the shared_ptr element type and the reinterpret_cast
// target type were stripped by extraction.
tensorpipe::Message prepareToWrite(std::shared_ptr pyMessage) {
  tensorpipe::Message tpMessage{
      {reinterpret_cast(pyMessage->metadata.ptr()),
       pyMessage->metadata.length()}};
  tpMessage.payloads.reserve(pyMessage->payloads.size());
  for (const auto& pyPayload : pyMessage->payloads) {
    tensorpipe::Message::Payload tpPayload{
        .data = pyPayload->buffer.ptr(),
        .length = pyPayload->buffer.length(),
        .metadata = {reinterpret_cast(pyPayload->metadata.ptr()),
                     pyPayload->metadata.length()},
    };
    tpMessage.payloads.push_back(std::move(tpPayload));
  }
tpMessage.tensors.reserve(pyMessage->tensors.size()); for (const auto& pyTensor : pyMessage->tensors) { tensorpipe::Message::Tensor tpTensor{ .buffer = tensorpipe::CpuBuffer{.ptr = pyTensor->buffer.ptr()}, .length = pyTensor->buffer.length(), .metadata = {reinterpret_cast(pyTensor->metadata.ptr()), pyTensor->metadata.length()}, }; tpMessage.tensors.push_back(std::move(tpTensor)); } return tpMessage; } class IncomingPayload { public: size_t length; optional buffer; py::bytes metadata; IncomingPayload(size_t length, py::bytes metadata) : length(length), metadata(metadata) {} void set_buffer(const py::buffer& pyBuffer) { TP_THROW_ASSERT_IF(buffer.has_value()) << "Buffer already set"; buffer.emplace(pyBuffer, PyBUF_SIMPLE | PyBUF_WRITABLE); if (buffer->length() != length) { buffer.reset(); TP_THROW_ASSERT() << "Bad length"; } } }; class IncomingTensor { public: size_t length; optional buffer; py::bytes metadata; IncomingTensor(size_t length, py::bytes metadata) : length(length), metadata(metadata) {} void set_buffer(const py::buffer& pyBuffer) { TP_THROW_ASSERT_IF(buffer.has_value()) << "Buffer already set"; buffer.emplace(pyBuffer, PyBUF_SIMPLE | PyBUF_WRITABLE); if (buffer->length() != length) { buffer.reset(); TP_THROW_ASSERT() << "Bad length"; } } }; class IncomingMessage { public: py::bytes metadata; std::vector> payloads; std::vector> tensors; IncomingMessage( py::bytes metadata, std::vector> payloads, std::vector> tensors) : metadata(metadata), payloads(payloads), tensors(tensors) {} }; std::shared_ptr prepareToAllocate( const tensorpipe::Descriptor& tpDescriptor) { std::vector> pyPayloads; pyPayloads.reserve(tpDescriptor.payloads.size()); for (const auto& tpPayload : tpDescriptor.payloads) { pyPayloads.push_back(std::make_shared( tpPayload.length, tpPayload.metadata)); } std::vector> pyTensors; pyTensors.reserve(tpDescriptor.tensors.size()); for (const auto& tpTensor : tpDescriptor.tensors) { pyTensors.push_back( std::make_shared(tpTensor.length, 
tpTensor.metadata)); } auto pyMessage = std::make_shared( tpDescriptor.metadata, std::move(pyPayloads), std::move(pyTensors)); return pyMessage; } tensorpipe::Allocation prepareToRead( std::shared_ptr pyMessage) { tensorpipe::Allocation tpAllocation; tpAllocation.payloads.reserve(pyMessage->payloads.size()); for (const auto& pyPayload : pyMessage->payloads) { TP_THROW_ASSERT_IF(!pyPayload->buffer.has_value()) << "No buffer"; tensorpipe::Allocation::Payload tpPayload{ .data = pyPayload->buffer.value().ptr(), }; tpAllocation.payloads.push_back(std::move(tpPayload)); } tpAllocation.tensors.reserve(pyMessage->tensors.size()); for (const auto& pyTensor : pyMessage->tensors) { TP_THROW_ASSERT_IF(!pyTensor->buffer.has_value()) << "No buffer"; tensorpipe::Allocation::Tensor tpTensor{ .buffer = tensorpipe::CpuBuffer{.ptr = pyTensor->buffer.value().ptr()}, }; tpAllocation.tensors.push_back(std::move(tpTensor)); } return tpAllocation; } template using shared_ptr_class_ = py::class_>; } // namespace PYBIND11_MODULE(pytensorpipe, module) { py::print( "These bindings are EXPERIMENTAL, intended to give a PREVIEW of the API, " "and, as such, may CHANGE AT ANY TIME."); shared_ptr_class_ context(module, "Context"); shared_ptr_class_ listener(module, "Listener"); shared_ptr_class_ pipe(module, "Pipe"); shared_ptr_class_ outgoingPayload(module, "OutgoingPayload"); outgoingPayload.def( py::init(), py::arg("buffer"), py::arg("metadata")); shared_ptr_class_ outgoingTensor(module, "OutgoingTensor"); outgoingTensor.def( py::init(), py::arg("buffer"), py::arg("metadata")); shared_ptr_class_ outgoingMessage(module, "OutgoingMessage"); outgoingMessage.def( py::init< py::buffer, const std::vector>, const std::vector>>(), py::arg("metadata"), py::arg("payloads"), py::arg("tensors")); shared_ptr_class_ incomingPayload( module, "IncomingPayload", py::buffer_protocol()); incomingPayload.def_readonly("length", &IncomingPayload::length); incomingPayload.def_readonly("metadata", 
&IncomingPayload::metadata); incomingPayload.def_property( "buffer", [](IncomingPayload& pyPayload) -> py::buffer_info { TP_THROW_ASSERT_IF(!pyPayload.buffer.has_value()) << "No buffer"; return pyPayload.buffer->getBuffer(); }, &IncomingPayload::set_buffer); shared_ptr_class_ incomingTensor( module, "IncomingTensor", py::buffer_protocol()); incomingTensor.def_readonly("length", &IncomingTensor::length); incomingTensor.def_readonly("metadata", &IncomingTensor::metadata); incomingTensor.def_property( "buffer", [](IncomingTensor& pyTensor) -> py::buffer_info { TP_THROW_ASSERT_IF(!pyTensor.buffer.has_value()) << "No buffer"; return pyTensor.buffer->getBuffer(); }, &IncomingTensor::set_buffer); shared_ptr_class_ incomingMessage( module, "IncomingMessage", py::buffer_protocol()); incomingMessage.def_readonly("metadata", &IncomingMessage::metadata); incomingMessage.def_readonly("payloads", &IncomingMessage::payloads); incomingMessage.def_readonly("tensors", &IncomingMessage::tensors); // Creators. context.def(py::init<>()); context.def( "listen", [](std::shared_ptr context, const std::vector& urls) { return context->listen(urls); }, py::arg("urls")); context.def( "connect", [](std::shared_ptr context, const std::string& url) { return context->connect(url); }, py::arg("url")); context.def( "join", &tensorpipe::Context::join, py::call_guard()); // Callback registration. 
listener.def( "listen", [](std::shared_ptr listener, py::object callback) { listener->accept([callback{std::move(callback)}]( const tensorpipe::Error& error, std::shared_ptr pipe) mutable { if (error) { TP_LOG_ERROR() << error.what(); return; } TP_THROW_ASSERT_IF(!pipe) << "No pipe"; py::gil_scoped_acquire acquire; try { callback(std::move(pipe)); } catch (const py::error_already_set& err) { TP_LOG_ERROR() << "Callback raised exception: " << err.what(); } // Leaving the scope will decrease the refcount of callback which // may cause it to get destructed, which might segfault since we // won't be holding the GIL anymore. So we reset callback now, // while we're still holding the GIL. callback = py::object(); }); }); pipe.def( "read_descriptor", [](std::shared_ptr pipe, py::object callback) { pipe->readDescriptor([callback{std::move(callback)}]( const tensorpipe::Error& error, tensorpipe::Descriptor descriptor) mutable { if (error) { TP_LOG_ERROR() << error.what(); return; } py::gil_scoped_acquire acquire; try { callback(prepareToAllocate(std::move(descriptor))); } catch (const py::error_already_set& err) { TP_LOG_ERROR() << "Callback raised exception: " << err.what(); } // Leaving the scope will decrease the refcount of callback which // may cause it to get destructed, which might segfault since we // won't be holding the GIL anymore. So we reset callback now, // while we're still holding the GIL. 
callback = py::object(); }); }); pipe.def( "read", [](std::shared_ptr pipe, std::shared_ptr pyMessage, py::object callback) { tensorpipe::Allocation tpAllocation = prepareToRead(std::move(pyMessage)); pipe->read( std::move(tpAllocation), [callback{std::move(callback)}]( const tensorpipe::Error& error) mutable { if (error) { TP_LOG_ERROR() << error.what(); return; } py::gil_scoped_acquire acquire; try { callback(); } catch (const py::error_already_set& err) { TP_LOG_ERROR() << "Callback raised exception: " << err.what(); } // Leaving the scope will decrease the refcount of callback which // may cause it to get destructed, which might segfault since we // won't be holding the GIL anymore. So we reset callback now, // while we're still holding the GIL. callback = py::object(); }); }); pipe.def( "write", [](std::shared_ptr pipe, std::shared_ptr pyMessage, py::object callback) { tensorpipe::Message tpMessage = prepareToWrite(std::move(pyMessage)); pipe->write( std::move(tpMessage), [callback{std::move(callback)}]( const tensorpipe::Error& error) mutable { if (error) { TP_LOG_ERROR() << error.what(); return; } py::gil_scoped_acquire acquire; try { callback(); } catch (const py::error_already_set& err) { TP_LOG_ERROR() << "Callback raised exception: " << err.what(); } // Leaving the scope will decrease the refcount of callback which // may cause it to get destructed, which might segfault since we // won't be holding the GIL anymore. So we reset callback now, // while we're still holding the GIL. 
callback = py::object(); }); }); // Transports and channels shared_ptr_class_ abstractTransport( module, "AbstractTransport"); module.def("create_uv_transport", &tensorpipe::transport::uv::create); #if TENSORPIPE_HAS_SHM_TRANSPORT module.def("create_shm_transport", &tensorpipe::transport::shm::create); #endif // TENSORPIPE_HAS_SHM_TRANSPORT context.def( "register_transport", &tensorpipe::Context::registerTransport, py::arg("priority"), py::arg("name"), py::arg("transport")); shared_ptr_class_ abstractChannel( module, "AbstractChannel"); module.def("create_basic_channel", &tensorpipe::channel::basic::create); #if TENSORPIPE_HAS_CMA_CHANNEL module.def("create_cma_channel", &tensorpipe::channel::cma::create); #endif // TENSORPIPE_HAS_CMA_CHANNEL context.def( "register_channel", &tensorpipe::Context::registerChannel, py::arg("priority"), py::arg("name"), py::arg("channel")); // Helpers listener.def("get_url", &tensorpipe::Listener::url, py::arg("transport")); } ================================================ FILE: tensorpipe/tensorpipe.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include // High-level API #include #include #include #include #include #include #include // Transports #include #include #include #include #include #if TENSORPIPE_HAS_SHM_TRANSPORT #include #endif // TENSORPIPE_HAS_SHM_TRANSPORT #if TENSORPIPE_HAS_IBV_TRANSPORT #include #include #include #endif // TENSORPIPE_HAS_IBV_TRANSPORT // Channels #include #include #include #include #include #if TENSORPIPE_HAS_CMA_CHANNEL #include #endif // TENSORPIPE_HAS_CMA_CHANNEL ================================================ FILE: tensorpipe/tensorpipe_cuda.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. 
* All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include // High-level API #include // Channels #include #include #if TENSORPIPE_HAS_CUDA_GDR_CHANNEL #include #endif // TENSORPIPE_HAS_CUDA_GDR_CHANNEL #if TENSORPIPE_HAS_CUDA_IPC_CHANNEL #include #endif // TENSORPIPE_HAS_CUDA_IPC_CHANNEL ================================================ FILE: tensorpipe/test/CMakeLists.txt ================================================ # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. # List of source files that we need to build tensorpipe_test executable. set(TP_TEST_SRCS) # TP_TEST_LINK_LIBRARIES is list of dependent libraries to be linked set(TP_TEST_LINK_LIBRARIES) # TP_TEST_INCLUDE_DIRS is list of include path to be used set(TP_TEST_INCLUDE_DIRS) # TP_TEST_COMPILE_DEFS is list of compile definitions to be used set(TP_TEST_COMPILE_DEFS) list(APPEND TP_TEST_SRCS test.cc test_environment.cc transport/context_test.cc transport/connection_test.cc transport/uv/uv_test.cc transport/uv/context_test.cc transport/uv/loop_test.cc transport/uv/connection_test.cc transport/uv/sockaddr_test.cc transport/listener_test.cc core/context_test.cc core/pipe_test.cc channel/basic/basic_test.cc channel/xth/xth_test.cc channel/mpt/mpt_test.cc channel/channel_test.cc channel/channel_test_cpu.cc common/system_test.cc common/defs_test.cc ) if(TP_ENABLE_SHM) list(APPEND TP_TEST_SRCS common/epoll_loop_test.cc common/ringbuffer_test.cc common/shm_ringbuffer_test.cc common/shm_segment_test.cc transport/shm/reactor_test.cc transport/shm/connection_test.cc transport/shm/listener_test.cc transport/shm/sockaddr_test.cc transport/shm/shm_test.cc ) endif() if(TP_ENABLE_IBV) list(APPEND TP_TEST_SRCS common/epoll_loop_test.cc 
common/ringbuffer_test.cc transport/ibv/connection_test.cc transport/ibv/ibv_test.cc transport/ibv/sockaddr_test.cc ) endif() if(TP_ENABLE_CMA) list(APPEND TP_TEST_SRCS channel/cma/cma_test.cc ) add_subdirectory(channel/cma) endif() if(TP_USE_CUDA) find_package(CUDA REQUIRED) list(APPEND TP_TEST_LINK_LIBRARIES ${CUDA_LIBRARIES}) list(APPEND TP_TEST_INCLUDE_DIRS ${CUDA_INCLUDE_DIRS}) list(APPEND TP_TEST_COMPILE_DEFS TP_USE_CUDA) list(APPEND TP_TEST_SRCS channel/channel_test_cuda.cc channel/channel_test_cuda_multi_gpu.cc channel/channel_test_cuda_xdtt.cc common/cuda_test.cc core/pipe_cuda_test.cc ) list(APPEND TP_TEST_SRCS channel/cuda_xth/cuda_xth_test.cc channel/cuda_basic/cuda_basic_test.cc ) if(TP_ENABLE_CUDA_IPC) list(APPEND TP_TEST_SRCS channel/cuda_ipc/cuda_ipc_test.cc ) endif() list(APPEND TP_TEST_SRCS channel/cuda_gdr/cuda_gdr_test.cc ) cuda_add_library(tensorpipe_cuda_kernel channel/kernel.cu) list(APPEND TP_TEST_LINK_LIBRARIES tensorpipe_cuda_kernel) list(APPEND TP_TEST_LINK_LIBRARIES tensorpipe_cuda) endif() add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/googletest ${PROJECT_BINARY_DIR}/third_party/googletest EXCLUDE_FROM_ALL) list(APPEND TP_TEST_LINK_LIBRARIES tensorpipe uv::uv gmock gtest_main) add_executable(tensorpipe_test ${TP_TEST_SRCS}) # Add all the dependent link libraries to the tensorpipe_test target target_link_libraries(tensorpipe_test PRIVATE ${TP_TEST_LINK_LIBRARIES}) target_include_directories(tensorpipe_test PUBLIC ${TP_TEST_INCLUDE_DIRS}) target_compile_definitions(tensorpipe_test PRIVATE ${TP_TEST_COMPILE_DEFS}) ================================================ FILE: tensorpipe/test/channel/basic/basic_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include namespace { class BasicChannelTestHelper : public CpuChannelTestHelper { protected: std::shared_ptr makeContextInternal( std::string id) override { auto context = tensorpipe::channel::basic::create(); context->setId(std::move(id)); return context; } }; BasicChannelTestHelper helper; } // namespace INSTANTIATE_TEST_CASE_P(Basic, ChannelTestSuite, ::testing::Values(&helper)); INSTANTIATE_TEST_CASE_P(Basic, CpuChannelTestSuite, ::testing::Values(&helper)); ================================================ FILE: tensorpipe/test/channel/channel_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include using namespace tensorpipe; using namespace tensorpipe::channel; // Implement this in a subprocess as in some cases it may initialize CUDA and // thus would otherwise "pollute" the parent process. 
// Verifies that two independently created contexts report the same
// non-empty set of device descriptors (same devices, same descriptors).
class DeviceDescriptorsTest : public ChannelTestCase {
 public:
  void run(ChannelTestHelper* helper) override {
    auto peerGroup = helper->makePeerGroup();
    peerGroup->spawn(
        [&] {
          // NOTE(review): the shared_ptr element types were stripped by
          // extraction (presumably channel context types).
          std::shared_ptr context1 = helper->makeContext("ctx1");
          std::shared_ptr context2 = helper->makeContext("ctx2");
          const auto& descriptors1 = context1->deviceDescriptors();
          const auto& descriptors2 = context2->deviceDescriptors();
          EXPECT_FALSE(descriptors1.empty());
          EXPECT_FALSE(descriptors2.empty());
          EXPECT_EQ(descriptors1.size(), descriptors2.size());
          // Every device in context1 must exist in context2 with an
          // identical, non-empty descriptor.
          for (const auto& deviceIter : descriptors1) {
            EXPECT_FALSE(deviceIter.second.empty());
            EXPECT_EQ(descriptors2.count(deviceIter.first), 1);
            EXPECT_EQ(deviceIter.second, descriptors2.at(deviceIter.first));
          }
        },
        [] {});
  }
};

CHANNEL_TEST(ChannelTestSuite, DeviceDescriptors);

// One-directional transfer: the server sends a 256-byte sequential
// pattern, the client receives it and validates the contents.
class ClientToServerTest : public ClientServerChannelTestCase {
 public:
  static constexpr int kDataSize = 256;

  void server(std::shared_ptr channel) override {
    // Initialize with sequential values.
    std::vector data(kDataSize);
    std::iota(data.begin(), data.end(), 0);
    std::unique_ptr wrappedData = helper_->makeDataWrapper(data);

    // Perform send and wait for completion.
    std::future sendFuture = sendWithFuture(channel, *wrappedData);
    Error sendError = sendFuture.get();
    EXPECT_FALSE(sendError) << sendError.what();

    this->peers_->done(PeerGroup::kServer);
    this->peers_->join(PeerGroup::kServer);
  }

  void client(std::shared_ptr channel) override {
    std::unique_ptr wrappedData = helper_->makeDataWrapper(kDataSize);

    // Perform recv and wait for completion.
    std::future recvFuture = recvWithFuture(channel, *wrappedData);
    Error recvError = recvFuture.get();
    EXPECT_FALSE(recvError) << recvError.what();

    // Validate contents of vector.
    auto unwrappedData = wrappedData->unwrap();
    for (auto i = 0; i < kDataSize; i++) {
      EXPECT_EQ(unwrappedData[i], i);
    }

    this->peers_->done(PeerGroup::kClient);
    this->peers_->join(PeerGroup::kClient);
  }
};

CHANNEL_TEST(ChannelTestSuite, ClientToServer);

// Mirror of ClientToServerTest with the direction reversed: the server
// receives and validates, the client sends the sequential pattern.
class ServerToClientTest : public ClientServerChannelTestCase {
  static constexpr int kDataSize = 256;

 public:
  void server(std::shared_ptr channel) override {
    std::unique_ptr wrappedData = helper_->makeDataWrapper(kDataSize);

    // Perform recv and wait for completion.
    std::future recvFuture = recvWithFuture(channel, *wrappedData);
    Error recvError = recvFuture.get();
    EXPECT_FALSE(recvError) << recvError.what();

    // Validate contents of vector.
    auto unwrappedData = wrappedData->unwrap();
    for (auto i = 0; i < kDataSize; i++) {
      EXPECT_EQ(unwrappedData[i], i);
    }

    this->peers_->done(PeerGroup::kServer);
    this->peers_->join(PeerGroup::kServer);
  }

  void client(std::shared_ptr channel) override {
    // Initialize with sequential values.
    std::vector data(kDataSize);
    std::iota(data.begin(), data.end(), 0);
    std::unique_ptr wrappedData = helper_->makeDataWrapper(data);

    // Perform send and wait for completion.
    std::future sendFuture = sendWithFuture(channel, *wrappedData);
    Error sendError = sendFuture.get();
    EXPECT_FALSE(sendError) << sendError.what();

    this->peers_->done(PeerGroup::kClient);
    this->peers_->join(PeerGroup::kClient);
  }
};

CHANNEL_TEST(ChannelTestSuite, ServerToClient);

// Sends the same 256 KiB buffer 100 times and validates every received
// copy on the client side.
class SendMultipleTensorsTest : public ClientServerChannelTestCase {
  // FIXME This is very puzzling, as in CircleCI making this field static (and
  // possibly even constexpr) causes a undefined symbol link error.
  const int dataSize_ = 256 * 1024; // 256KB
  static constexpr int kNumTensors = 100;

 public:
  void server(std::shared_ptr channel) override {
    // Initialize with sequential values.
std::vector data(dataSize_); std::iota(data.begin(), data.end(), 0); std::unique_ptr wrappedData = helper_->makeDataWrapper(data); // Error futures std::vector> sendFutures; // Perform send and wait for completion. for (int i = 0; i < kNumTensors; i++) { std::future sendFuture = sendWithFuture(channel, *wrappedData); sendFutures.push_back(std::move(sendFuture)); } for (auto& sendFuture : sendFutures) { Error sendError = sendFuture.get(); EXPECT_FALSE(sendError) << sendError.what(); } this->peers_->done(PeerGroup::kServer); this->peers_->join(PeerGroup::kServer); } void client(std::shared_ptr channel) override { std::vector> wrappedDataVec; for (int i = 0; i < kNumTensors; i++) { wrappedDataVec.push_back(helper_->makeDataWrapper(dataSize_)); } // Error futures std::vector> recvFutures; // Perform recv and wait for completion. for (auto& wrappedData : wrappedDataVec) { std::future recvFuture = recvWithFuture(channel, *wrappedData); recvFutures.push_back(std::move(recvFuture)); } for (auto& recvFuture : recvFutures) { Error recvError = recvFuture.get(); EXPECT_FALSE(recvError) << recvError.what(); } // Validate contents of vector. for (auto& wrappedData : wrappedDataVec) { auto unwrappedData = wrappedData->unwrap(); for (int i = 0; i < dataSize_; i++) { EXPECT_EQ(unwrappedData[i], i % 256); } } this->peers_->done(PeerGroup::kClient); this->peers_->join(PeerGroup::kClient); } }; CHANNEL_TEST(ChannelTestSuite, SendMultipleTensors); class SendTensorsBothWaysTest : public ClientServerChannelTestCase { static constexpr int kDataSize = 256; void server(std::shared_ptr channel) override { // Initialize sendBuffer with sequential values. std::vector sendData(kDataSize); std::iota(sendData.begin(), sendData.end(), 0); std::unique_ptr wrappedSendData = helper_->makeDataWrapper(sendData); // Recv buffer. std::unique_ptr wrappedRecvData = helper_->makeDataWrapper(kDataSize); // Perform send. std::future sendFuture = sendWithFuture(channel, *wrappedSendData); // Perform recv. 
std::future recvFuture = recvWithFuture(channel, *wrappedRecvData); // Wait for completion of both. Error sendError = sendFuture.get(); EXPECT_FALSE(sendError) << sendError.what(); Error recvError = recvFuture.get(); EXPECT_FALSE(recvError) << recvError.what(); // Verify recvd buffers. auto unwrappedData = wrappedRecvData->unwrap(); for (int i = 0; i < kDataSize; i++) { EXPECT_EQ(unwrappedData[i], i % 256); } this->peers_->done(PeerGroup::kServer); this->peers_->join(PeerGroup::kServer); } void client(std::shared_ptr channel) override { // Initialize sendBuffer with sequential values. std::vector sendData(kDataSize); std::iota(sendData.begin(), sendData.end(), 0); std::unique_ptr wrappedSendData = helper_->makeDataWrapper(sendData); // Recv buffer. std::unique_ptr wrappedRecvData = helper_->makeDataWrapper(kDataSize); // Perform send. std::future sendFuture = sendWithFuture(channel, *wrappedSendData); // Perform recv. std::future recvFuture = recvWithFuture(channel, *wrappedRecvData); // Wait for completion of both. Error sendError = sendFuture.get(); EXPECT_FALSE(sendError) << sendError.what(); Error recvError = recvFuture.get(); EXPECT_FALSE(recvError) << recvError.what(); // Verify recvd buffers. auto unwrappedData = wrappedRecvData->unwrap(); for (int i = 0; i < kDataSize; i++) { EXPECT_EQ(unwrappedData[i], i % 256); } this->peers_->done(PeerGroup::kClient); this->peers_->join(PeerGroup::kClient); } }; CHANNEL_TEST(ChannelTestSuite, SendTensorsBothWays); // Call send and recv with a length of 0 but a non-null pointer. class EmptyTensorTest : public ClientServerChannelTestCase { void server(std::shared_ptr channel) override { // Allocate a non-empty vector so that its .data() pointer is non-null. std::vector data(1); std::unique_ptr wrappedData = helper_->makeDataWrapper(data); Buffer buffer = wrappedData->buffer(); // Perform send and wait for completion. 
std::future sendFuture = sendWithFuture(channel, buffer, 0); Error sendError = sendFuture.get(); EXPECT_FALSE(sendError) << sendError.what(); this->peers_->done(PeerGroup::kServer); this->peers_->join(PeerGroup::kServer); } void client(std::shared_ptr channel) override { // Allocate a non-empty vector so that its .data() pointer is non-null. std::unique_ptr wrappedData = helper_->makeDataWrapper(1); Buffer buffer = wrappedData->buffer(); // Perform recv and wait for completion. std::future recvFuture = recvWithFuture(channel, buffer, 0); Error recvError = recvFuture.get(); EXPECT_FALSE(recvError) << recvError.what(); this->peers_->done(PeerGroup::kClient); this->peers_->join(PeerGroup::kClient); } }; CHANNEL_TEST(ChannelTestSuite, EmptyTensor); // Call send and recv with a length of 0, between sends and recvs with // positive length. class EmptyAndNonEmptyTensorsTest : public ClientServerChannelTestCase { void server(std::shared_ptr channel) override { std::vector data(1); std::unique_ptr wrappedData = helper_->makeDataWrapper(data); Buffer buffer = wrappedData->buffer(); std::vector> sendFutures; sendFutures.push_back(sendWithFuture(channel, buffer, 1)); sendFutures.push_back(sendWithFuture(channel, buffer, 0)); sendFutures.push_back(sendWithFuture(channel, buffer, 1)); for (auto& f : sendFutures) { Error sendError = f.get(); EXPECT_FALSE(sendError) << sendError.what(); } this->peers_->done(PeerGroup::kServer); this->peers_->join(PeerGroup::kServer); } void client(std::shared_ptr channel) override { std::unique_ptr wrappedData = helper_->makeDataWrapper(1); Buffer buffer = wrappedData->buffer(); std::vector> sendFutures; sendFutures.push_back(recvWithFuture(channel, buffer, 1)); sendFutures.push_back(recvWithFuture(channel, buffer, 0)); sendFutures.push_back(recvWithFuture(channel, buffer, 1)); for (auto& f : sendFutures) { Error sendError = f.get(); EXPECT_FALSE(sendError) << sendError.what(); } this->peers_->done(PeerGroup::kClient); 
this->peers_->join(PeerGroup::kClient); } }; CHANNEL_TEST(ChannelTestSuite, EmptyAndNonEmptyTensors); ================================================ FILE: tensorpipe/test/channel/channel_test.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include class DataWrapper { public: virtual tensorpipe::Buffer buffer() const = 0; virtual size_t bufferLength() const = 0; virtual std::vector unwrap() = 0; virtual ~DataWrapper() = default; }; class ChannelTestHelper { public: virtual ~ChannelTestHelper() = default; std::shared_ptr makeContext( std::string id, bool skipViabilityCheck = false) { std::shared_ptr ctx = makeContextInternal(std::move(id)); if (!skipViabilityCheck) { EXPECT_TRUE(ctx->isViable()); } return ctx; } virtual std::shared_ptr makePeerGroup() { return std::make_shared(); } virtual std::unique_ptr makeDataWrapper(size_t length) = 0; virtual std::unique_ptr makeDataWrapper( std::vector v) = 0; protected: virtual std::shared_ptr makeContextInternal( std::string id) = 0; }; [[nodiscard]] inline std::future sendWithFuture( std::shared_ptr channel, tensorpipe::Buffer buffer, size_t length) { auto promise = std::make_shared>(); auto future = promise->get_future(); channel->send( buffer, length, [promise{std::move(promise)}](const tensorpipe::Error& error) { promise->set_value(error); }); return future; } [[nodiscard]] inline std::future sendWithFuture( std::shared_ptr channel, const DataWrapper& dataWrapper) { return sendWithFuture( std::move(channel), dataWrapper.buffer(), dataWrapper.bufferLength()); } [[nodiscard]] inline std::future recvWithFuture( std::shared_ptr channel, tensorpipe::Buffer buffer, 
size_t length) { auto promise = std::make_shared>(); auto future = promise->get_future(); channel->recv( buffer, length, [promise{std::move(promise)}](const tensorpipe::Error& error) { promise->set_value(error); }); return future; } [[nodiscard]] inline std::future recvWithFuture( std::shared_ptr channel, const DataWrapper& dataWrapper) { return recvWithFuture( std::move(channel), dataWrapper.buffer(), dataWrapper.bufferLength()); } class ChannelTestCase { public: virtual void run(ChannelTestHelper* helper) = 0; virtual ~ChannelTestCase() = default; }; class ClientServerChannelTestCase : public ChannelTestCase { using MultiAcceptResult = std::pair< tensorpipe::Error, std::vector>>; class MultiAcceptResultPromise { public: explicit MultiAcceptResultPromise(size_t numConnections) : connections_(numConnections) {} ~MultiAcceptResultPromise() { // Sanity check if (!error_) { for (const auto& conn : connections_) { EXPECT_NE(conn, nullptr); } } promise_.set_value( MultiAcceptResult(std::move(error_), std::move(connections_))); } std::future getFuture() { return promise_.get_future(); } void setConnection( size_t connId, std::shared_ptr connection) { EXPECT_LT(connId, connections_.size()); connections_[connId] = std::move(connection); } void setError(tensorpipe::Error error) { std::unique_lock lock(errorMutex_); if (error_) { return; } error_ = std::move(error); } private: tensorpipe::Error error_{tensorpipe::Error::kSuccess}; std::mutex errorMutex_; std::vector> connections_; std::promise promise_; }; std::future accept( tensorpipe::transport::Listener& listener, size_t numConnections) { auto promise = std::make_shared(numConnections); for (size_t i = 0; i < numConnections; ++i) { listener.accept( [promise]( const tensorpipe::Error& error, std::shared_ptr connection) { if (error) { promise->setError(std::move(error)); return; } connection->read([promise, connection]( const tensorpipe::Error& error, const void* connIdBuf, size_t length) mutable { if (error) { 
promise->setError(std::move(error)); return; } ASSERT_EQ(sizeof(uint64_t), length); uint64_t connId = *static_cast(connIdBuf); promise->setConnection(connId, std::move(connection)); }); }); } return promise->getFuture(); } std::vector> connect( std::shared_ptr transportCtx, std::string addr, size_t numConnections) { std::vector> connections( numConnections); for (size_t connId = 0; connId < numConnections; ++connId) { connections[connId] = transportCtx->connect(addr); auto connIdBuf = std::make_shared(connId); connections[connId]->write( connIdBuf.get(), sizeof(uint64_t), [connIdBuf](const tensorpipe::Error& error) { EXPECT_FALSE(error) << error.what(); }); } return connections; } public: void run(ChannelTestHelper* helper) override { auto addr = "127.0.0.1"; helper_ = helper; peers_ = helper_->makePeerGroup(); peers_->spawn( [&] { auto transportCtx = tensorpipe::transport::uv::create(); transportCtx->setId("server_harness"); auto ctx = helper_->makeContext("server"); auto listener = transportCtx->listen(addr); auto connectionsFuture = accept(*listener, ctx->numConnectionsNeeded()); peers_->send(PeerGroup::kClient, listener->addr()); tensorpipe::Error connectionsError; std::vector> connections; std::tie(connectionsError, connections) = connectionsFuture.get(); EXPECT_FALSE(connectionsError) << connectionsError.what(); auto channel = ctx->createChannel( std::move(connections), tensorpipe::channel::Endpoint::kListen); server(std::move(channel)); ctx->join(); transportCtx->join(); afterServer(); }, [&] { auto transportCtx = tensorpipe::transport::uv::create(); transportCtx->setId("client_harness"); auto ctx = helper_->makeContext("client"); auto laddr = peers_->recv(PeerGroup::kClient); auto connections = connect(transportCtx, laddr, ctx->numConnectionsNeeded()); auto channel = ctx->createChannel( std::move(connections), tensorpipe::channel::Endpoint::kConnect); client(std::move(channel)); ctx->join(); transportCtx->join(); afterClient(); }); } virtual void server( 
std::shared_ptr /* channel */) {} virtual void client( std::shared_ptr /* channel */) {} virtual void afterServer() {} virtual void afterClient() {} protected: ChannelTestHelper* helper_; std::shared_ptr peers_; }; class ChannelTestSuite : public ::testing::TestWithParam {}; // Register a channel test. #define CHANNEL_TEST(suite, name) \ TEST_P(suite, name) { \ name##Test t; \ t.run(GetParam()); \ } ================================================ FILE: tensorpipe/test/channel/channel_test_cpu.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include using namespace tensorpipe; using namespace tensorpipe::channel; // Call send and recv with a null pointer and a length of 0. class NullPointerTest : public ClientServerChannelTestCase { void server(std::shared_ptr channel) override { // Perform send and wait for completion. std::future sendFuture = sendWithFuture(channel, CpuBuffer{.ptr = nullptr}, 0); Error sendError = sendFuture.get(); EXPECT_FALSE(sendError) << sendError.what(); this->peers_->done(PeerGroup::kServer); this->peers_->join(PeerGroup::kServer); } void client(std::shared_ptr channel) override { // Perform recv and wait for completion. std::future recvFuture = recvWithFuture(channel, CpuBuffer{.ptr = nullptr}, 0); Error recvError = recvFuture.get(); EXPECT_FALSE(recvError) << recvError.what(); this->peers_->done(PeerGroup::kClient); this->peers_->join(PeerGroup::kClient); } }; CHANNEL_TEST(CpuChannelTestSuite, NullPointer); // This test wants to make sure that the "heavy lifting" of copying data isn't // performed inline inside the recv method as that would make the user-facing // read method of the pipe blocking. 
// However, since we can't really check that behavior, we'll check a highly // correlated one: that the recv callback isn't called inline from within the // recv method. We do so by having that behavior cause a deadlock. class CallbacksAreDeferredTest : public ClientServerChannelTestCase { static constexpr auto kDataSize = 256; public: void server(std::shared_ptr channel) override { // Initialize with sequential values. std::vector data(kDataSize); std::iota(data.begin(), data.end(), 0); // Perform send and wait for completion. std::promise sendPromise; auto mutex = std::make_shared(); std::unique_lock callerLock(*mutex); channel->send( CpuBuffer{.ptr = data.data()}, kDataSize, [&sendPromise, mutex](const Error& error) { std::unique_lock calleeLock(*mutex); sendPromise.set_value(error); }); callerLock.unlock(); Error sendError = sendPromise.get_future().get(); EXPECT_FALSE(sendError) << sendError.what(); this->peers_->done(PeerGroup::kServer); this->peers_->join(PeerGroup::kServer); } void client(std::shared_ptr channel) override { // Initialize with zeroes. std::vector data(kDataSize); std::fill(data.begin(), data.end(), 0); // Perform recv and wait for completion. std::promise recvPromise; std::mutex mutex; std::unique_lock callerLock(mutex); channel->recv( CpuBuffer{.ptr = data.data()}, kDataSize, [&recvPromise, &mutex](const Error& error) { std::unique_lock calleeLock(mutex); recvPromise.set_value(error); }); callerLock.unlock(); Error recvError = recvPromise.get_future().get(); EXPECT_FALSE(recvError) << recvError.what(); // Validate contents of vector. for (auto i = 0; i < kDataSize; i++) { EXPECT_EQ(data[i], i); } this->peers_->done(PeerGroup::kClient); this->peers_->join(PeerGroup::kClient); } }; CHANNEL_TEST(CpuChannelTestSuite, CallbacksAreDeferred); ================================================ FILE: tensorpipe/test/channel/channel_test_cpu.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. 
* All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include class CpuDataWrapper : public DataWrapper { public: explicit CpuDataWrapper(size_t length) : vector_(length) {} explicit CpuDataWrapper(std::vector v) : vector_(v) {} tensorpipe::Buffer buffer() const override { return tensorpipe::CpuBuffer{.ptr = const_cast(vector_.data())}; } size_t bufferLength() const override { return vector_.size(); } std::vector unwrap() override { return vector_; } private: std::vector vector_; }; class CpuChannelTestHelper : public ChannelTestHelper { public: std::unique_ptr makeDataWrapper(size_t length) override { return std::make_unique(length); } std::unique_ptr makeDataWrapper( std::vector v) override { return std::make_unique(std::move(v)); } }; class CpuChannelTestSuite : public ::testing::TestWithParam {}; ================================================ FILE: tensorpipe/test/channel/channel_test_cuda.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include using namespace tensorpipe; using namespace tensorpipe::channel; class ReceiverWaitsForStartEventTest : public ClientServerChannelTestCase { static constexpr size_t kSize = 1024; void server(std::shared_ptr channel) override { TP_CUDA_CHECK(cudaSetDevice(0)); cudaStream_t sendStream; TP_CUDA_CHECK( cudaStreamCreateWithFlags(&sendStream, cudaStreamNonBlocking)); void* ptr; TP_CUDA_CHECK(cudaMalloc(&ptr, kSize)); // Delay sendStream with computations on buffer. slowKernel(ptr, kSize, sendStream); // Set buffer to target value. TP_CUDA_CHECK(cudaMemsetAsync(ptr, 0x42, kSize, sendStream)); // Perform send and wait for completion. 
auto sendPromise = std::make_shared>(); auto sendFuture = sendPromise->get_future(); channel->send( CudaBuffer{ .ptr = ptr, .stream = sendStream, }, kSize, [sendPromise{std::move(sendPromise)}](const tensorpipe::Error& error) { sendPromise->set_value(error); }); Error sendError = sendFuture.get(); EXPECT_FALSE(sendError) << sendError.what(); TP_CUDA_CHECK(cudaFree(ptr)); this->peers_->done(PeerGroup::kServer); this->peers_->join(PeerGroup::kServer); } void client(std::shared_ptr channel) override { TP_CUDA_CHECK(cudaSetDevice(0)); cudaStream_t recvStream; TP_CUDA_CHECK( cudaStreamCreateWithFlags(&recvStream, cudaStreamNonBlocking)); void* ptr; TP_CUDA_CHECK(cudaMalloc(&ptr, kSize)); // Perform recv and wait for completion. auto recvPromise = std::make_shared>(); auto recvFuture = recvPromise->get_future(); channel->recv( CudaBuffer{ .ptr = ptr, .stream = recvStream, }, kSize, [recvPromise{std::move(recvPromise)}](const tensorpipe::Error& error) { recvPromise->set_value(error); }); Error recvError = recvFuture.get(); EXPECT_FALSE(recvError) << recvError.what(); std::array data; TP_CUDA_CHECK(cudaStreamSynchronize(recvStream)); TP_CUDA_CHECK(cudaMemcpy(data.data(), ptr, kSize, cudaMemcpyDefault)); EXPECT_THAT(data, ::testing::Each(0x42)); TP_CUDA_CHECK(cudaFree(ptr)); this->peers_->done(PeerGroup::kClient); this->peers_->join(PeerGroup::kClient); } }; CHANNEL_TEST(CudaChannelTestSuite, ReceiverWaitsForStartEvent); class SendOffsetAllocationTest : public ClientServerChannelTestCase { public: static constexpr int kDataSize = 256; static constexpr int kOffset = 128; void server(std::shared_ptr channel) override { // Initialize with sequential values. void* ptr; TP_CUDA_CHECK(cudaMalloc(&ptr, kOffset + kDataSize)); // Set buffer to target value. TP_CUDA_CHECK(cudaMemset(ptr, 0xff, kOffset)); TP_CUDA_CHECK( cudaMemset(static_cast(ptr) + kOffset, 0x42, kDataSize)); // Perform send and wait for completion. 
std::future sendFuture = sendWithFuture( channel, CudaBuffer{.ptr = static_cast(ptr) + kOffset}, kDataSize); Error sendError = sendFuture.get(); EXPECT_FALSE(sendError) << sendError.what(); this->peers_->done(PeerGroup::kServer); this->peers_->join(PeerGroup::kServer); } void client(std::shared_ptr channel) override { std::unique_ptr wrappedData = helper_->makeDataWrapper(kDataSize); // Perform recv and wait for completion. std::future recvFuture = recvWithFuture(channel, *wrappedData); Error recvError = recvFuture.get(); EXPECT_FALSE(recvError) << recvError.what(); // Validate contents of vector. EXPECT_THAT(wrappedData->unwrap(), ::testing::Each(0x42)); this->peers_->done(PeerGroup::kClient); this->peers_->join(PeerGroup::kClient); } }; CHANNEL_TEST(CudaChannelTestSuite, SendOffsetAllocation); ================================================ FILE: tensorpipe/test/channel/channel_test_cuda.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include class CudaDataWrapper : public DataWrapper { public: // Non-copyable. CudaDataWrapper(const CudaDataWrapper&) = delete; CudaDataWrapper& operator=(const CudaDataWrapper&) = delete; // Non-movable. 
CudaDataWrapper(CudaDataWrapper&& other) = delete; CudaDataWrapper& operator=(CudaDataWrapper&& other) = delete; explicit CudaDataWrapper(size_t length) : length_(length) { if (length_ > 0) { TP_CUDA_CHECK(cudaSetDevice(0)); TP_CUDA_CHECK(cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)); TP_CUDA_CHECK(cudaMalloc(&cudaPtr_, length_)); } } explicit CudaDataWrapper(std::vector v) : CudaDataWrapper(v.size()) { if (length_ > 0) { TP_CUDA_CHECK(cudaMemcpyAsync( cudaPtr_, v.data(), length_, cudaMemcpyDefault, stream_)); } } tensorpipe::Buffer buffer() const override { return tensorpipe::CudaBuffer{ .ptr = cudaPtr_, .stream = stream_, }; } size_t bufferLength() const override { return length_; } std::vector unwrap() override { std::vector v(length_); if (length_ > 0) { TP_CUDA_CHECK(cudaStreamSynchronize(stream_)); TP_CUDA_CHECK(cudaMemcpy(v.data(), cudaPtr_, length_, cudaMemcpyDefault)); } return v; } ~CudaDataWrapper() override { if (length_ > 0) { TP_CUDA_CHECK(cudaFree(cudaPtr_)); TP_CUDA_CHECK(cudaStreamDestroy(stream_)); } } private: void* cudaPtr_{nullptr}; size_t length_{0}; cudaStream_t stream_{cudaStreamDefault}; }; class CudaChannelTestHelper : public ChannelTestHelper { public: std::unique_ptr makeDataWrapper(size_t length) override { return std::make_unique(length); } std::unique_ptr makeDataWrapper( std::vector v) override { return std::make_unique(std::move(v)); } }; class CudaChannelTestSuite : public ::testing::TestWithParam {}; class CudaMultiGPUChannelTestSuite : public ::testing::TestWithParam {}; class CudaXDTTChannelTestSuite : public ::testing::TestWithParam {}; ================================================ FILE: tensorpipe/test/channel/channel_test_cuda_multi_gpu.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #include #include using namespace tensorpipe; using namespace tensorpipe::channel; class SendAcrossDevicesTest : public ClientServerChannelTestCase { static constexpr size_t kSize = 1024; public: void run(ChannelTestHelper* helper) override { if (TestEnvironment::numCudaDevices() < 2) { GTEST_SKIP() << "Skipping test requiring >=2 CUDA devices."; } ClientServerChannelTestCase::run(helper); } private: void server(std::shared_ptr channel) override { cudaStream_t sendStream; void* ptr; { // Send happens from device #0. CudaDeviceGuard guard(0); TP_CUDA_CHECK( cudaStreamCreateWithFlags(&sendStream, cudaStreamNonBlocking)); TP_CUDA_CHECK(cudaMalloc(&ptr, kSize)); // Set buffer to target value. TP_CUDA_CHECK(cudaMemsetAsync(ptr, 0x42, kSize, sendStream)); } // Perform send and wait for completion. auto sendPromise = std::make_shared>(); auto sendFuture = sendPromise->get_future(); channel->send( CudaBuffer{ .ptr = ptr, .stream = sendStream, }, kSize, [sendPromise{std::move(sendPromise)}](const tensorpipe::Error& error) { sendPromise->set_value(error); }); Error sendError = sendFuture.get(); EXPECT_FALSE(sendError) << sendError.what(); { CudaDeviceGuard guard(0); TP_CUDA_CHECK(cudaFree(ptr)); TP_CUDA_CHECK(cudaStreamDestroy(sendStream)); } this->peers_->done(PeerGroup::kServer); this->peers_->join(PeerGroup::kServer); } void afterServer() override { if (this->peers_->endpointsInSameProcess()) { EXPECT_TRUE(initializedCudaContexts({0, 1})); } else { EXPECT_TRUE(initializedCudaContexts({0})); } } void client(std::shared_ptr channel) override { cudaStream_t recvStream; void* ptr; { // Recv happens on device #1. CudaDeviceGuard guard(1); TP_CUDA_CHECK( cudaStreamCreateWithFlags(&recvStream, cudaStreamNonBlocking)); TP_CUDA_CHECK(cudaMalloc(&ptr, kSize)); } // Perform recv and wait for completion. 
auto recvPromise = std::make_shared>(); auto recvFuture = recvPromise->get_future(); channel->recv( CudaBuffer{ .ptr = ptr, .stream = recvStream, }, kSize, [recvPromise{std::move(recvPromise)}](const tensorpipe::Error& error) { recvPromise->set_value(error); }); Error recvError = recvFuture.get(); EXPECT_FALSE(recvError) << recvError.what(); { CudaDeviceGuard guard(1); std::array data; TP_CUDA_CHECK(cudaStreamSynchronize(recvStream)); TP_CUDA_CHECK(cudaMemcpy(data.data(), ptr, kSize, cudaMemcpyDefault)); EXPECT_THAT(data, ::testing::Each(0x42)); TP_CUDA_CHECK(cudaFree(ptr)); TP_CUDA_CHECK(cudaStreamDestroy(recvStream)); } this->peers_->done(PeerGroup::kClient); this->peers_->join(PeerGroup::kClient); } void afterClient() override { if (this->peers_->endpointsInSameProcess()) { EXPECT_TRUE(initializedCudaContexts({0, 1})); } else { EXPECT_TRUE(initializedCudaContexts({1})); } } }; CHANNEL_TEST(CudaMultiGPUChannelTestSuite, SendAcrossDevices); class SendReverseAcrossDevicesTest : public ClientServerChannelTestCase { static constexpr size_t kSize = 1024; public: void run(ChannelTestHelper* helper) override { if (TestEnvironment::numCudaDevices() < 2) { GTEST_SKIP() << "Skipping test requiring >=2 CUDA devices."; } ClientServerChannelTestCase::run(helper); } private: void server(std::shared_ptr channel) override { cudaStream_t sendStream; void* ptr; { // Send happens from device #1. CudaDeviceGuard guard(1); TP_CUDA_CHECK( cudaStreamCreateWithFlags(&sendStream, cudaStreamNonBlocking)); TP_CUDA_CHECK(cudaMalloc(&ptr, kSize)); // Set buffer to target value. TP_CUDA_CHECK(cudaMemsetAsync(ptr, 0x42, kSize, sendStream)); } // Perform send and wait for completion. 
auto sendPromise = std::make_shared>(); auto sendFuture = sendPromise->get_future(); channel->send( CudaBuffer{ .ptr = ptr, .stream = sendStream, }, kSize, [sendPromise{std::move(sendPromise)}](const tensorpipe::Error& error) { sendPromise->set_value(error); }); Error sendError = sendFuture.get(); EXPECT_FALSE(sendError) << sendError.what(); { CudaDeviceGuard guard(1); TP_CUDA_CHECK(cudaFree(ptr)); TP_CUDA_CHECK(cudaStreamDestroy(sendStream)); } this->peers_->done(PeerGroup::kServer); this->peers_->join(PeerGroup::kServer); } void afterServer() override { if (this->peers_->endpointsInSameProcess()) { EXPECT_TRUE(initializedCudaContexts({0, 1})); } else { EXPECT_TRUE(initializedCudaContexts({1})); } } void client(std::shared_ptr channel) override { cudaStream_t recvStream; void* ptr; { // Recv happens on device #0. CudaDeviceGuard guard(0); TP_CUDA_CHECK( cudaStreamCreateWithFlags(&recvStream, cudaStreamNonBlocking)); TP_CUDA_CHECK(cudaMalloc(&ptr, kSize)); } // Perform recv and wait for completion. 
auto recvPromise = std::make_shared>(); auto recvFuture = recvPromise->get_future(); channel->recv( CudaBuffer{ .ptr = ptr, .stream = recvStream, }, kSize, [recvPromise{std::move(recvPromise)}](const tensorpipe::Error& error) { recvPromise->set_value(error); }); Error recvError = recvFuture.get(); EXPECT_FALSE(recvError) << recvError.what(); { CudaDeviceGuard guard(0); std::array data; TP_CUDA_CHECK(cudaStreamSynchronize(recvStream)); TP_CUDA_CHECK(cudaMemcpy(data.data(), ptr, kSize, cudaMemcpyDefault)); EXPECT_THAT(data, ::testing::Each(0x42)); TP_CUDA_CHECK(cudaFree(ptr)); TP_CUDA_CHECK(cudaStreamDestroy(recvStream)); } this->peers_->done(PeerGroup::kClient); this->peers_->join(PeerGroup::kClient); } void afterClient() override { if (this->peers_->endpointsInSameProcess()) { EXPECT_TRUE(initializedCudaContexts({0, 1})); } else { EXPECT_TRUE(initializedCudaContexts({0})); } } }; CHANNEL_TEST(CudaMultiGPUChannelTestSuite, SendReverseAcrossDevices); class SendAcrossNonDefaultDevicesTest : public ClientServerChannelTestCase { static constexpr size_t kSize = 1024; public: void run(ChannelTestHelper* helper) override { if (TestEnvironment::numCudaDevices() < 2) { GTEST_SKIP() << "Skipping test requiring >=2 CUDA devices."; } ClientServerChannelTestCase::run(helper); } private: void server(std::shared_ptr channel) override { cudaStream_t sendStream; void* ptr; { // Send happens from device #1. CudaDeviceGuard guard(1); TP_CUDA_CHECK( cudaStreamCreateWithFlags(&sendStream, cudaStreamNonBlocking)); TP_CUDA_CHECK(cudaMalloc(&ptr, kSize)); // Set buffer to target value. TP_CUDA_CHECK(cudaMemsetAsync(ptr, 0x42, kSize, sendStream)); } // Perform send and wait for completion. 
auto sendPromise = std::make_shared>(); auto sendFuture = sendPromise->get_future(); channel->send( CudaBuffer{ .ptr = ptr, .stream = sendStream, }, kSize, [sendPromise{std::move(sendPromise)}](const tensorpipe::Error& error) { sendPromise->set_value(error); }); Error sendError = sendFuture.get(); EXPECT_FALSE(sendError) << sendError.what(); { CudaDeviceGuard guard(1); TP_CUDA_CHECK(cudaFree(ptr)); TP_CUDA_CHECK(cudaStreamDestroy(sendStream)); } this->peers_->done(PeerGroup::kServer); this->peers_->join(PeerGroup::kServer); } void afterServer() override { EXPECT_TRUE(initializedCudaContexts({1})); } void client(std::shared_ptr channel) override { cudaStream_t recvStream; void* ptr; { // Recv happens on device #1. CudaDeviceGuard guard(1); TP_CUDA_CHECK( cudaStreamCreateWithFlags(&recvStream, cudaStreamNonBlocking)); TP_CUDA_CHECK(cudaMalloc(&ptr, kSize)); } // Perform recv and wait for completion. auto recvPromise = std::make_shared>(); auto recvFuture = recvPromise->get_future(); channel->recv( CudaBuffer{ .ptr = ptr, .stream = recvStream, }, kSize, [recvPromise{std::move(recvPromise)}](const tensorpipe::Error& error) { recvPromise->set_value(error); }); Error recvError = recvFuture.get(); EXPECT_FALSE(recvError) << recvError.what(); { CudaDeviceGuard guard(1); std::array data; TP_CUDA_CHECK(cudaStreamSynchronize(recvStream)); TP_CUDA_CHECK(cudaMemcpy(data.data(), ptr, kSize, cudaMemcpyDefault)); EXPECT_THAT(data, ::testing::Each(0x42)); TP_CUDA_CHECK(cudaFree(ptr)); TP_CUDA_CHECK(cudaStreamDestroy(recvStream)); } this->peers_->done(PeerGroup::kClient); this->peers_->join(PeerGroup::kClient); } void afterClient() override { EXPECT_TRUE(initializedCudaContexts({1})); } }; CHANNEL_TEST(CudaMultiGPUChannelTestSuite, SendAcrossNonDefaultDevices); ================================================ FILE: tensorpipe/test/channel/channel_test_cuda_xdtt.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. 
* All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include using namespace tensorpipe; using namespace tensorpipe::channel; class SendFromCpuToGpuTest : public ClientServerChannelTestCase { static constexpr size_t kSize = 1024; void server(std::shared_ptr channel) override { // Perform send and wait for completion. auto sendPromise = std::make_shared>(); auto sendFuture = sendPromise->get_future(); std::vector data(kSize, 0x42); channel->send( CpuBuffer{ .ptr = data.data(), }, kSize, [sendPromise{std::move(sendPromise)}](const tensorpipe::Error& error) { sendPromise->set_value(error); }); Error sendError = sendFuture.get(); EXPECT_FALSE(sendError) << sendError.what(); this->peers_->done(PeerGroup::kServer); this->peers_->join(PeerGroup::kServer); } void client(std::shared_ptr channel) override { TP_CUDA_CHECK(cudaSetDevice(0)); cudaStream_t recvStream; TP_CUDA_CHECK( cudaStreamCreateWithFlags(&recvStream, cudaStreamNonBlocking)); void* ptr; TP_CUDA_CHECK(cudaMalloc(&ptr, kSize)); // Perform recv and wait for completion. 
auto recvPromise = std::make_shared>(); auto recvFuture = recvPromise->get_future(); channel->recv( CudaBuffer{ .ptr = ptr, .stream = recvStream, }, kSize, [recvPromise{std::move(recvPromise)}](const tensorpipe::Error& error) { recvPromise->set_value(error); }); Error recvError = recvFuture.get(); EXPECT_FALSE(recvError) << recvError.what(); std::array data; TP_CUDA_CHECK(cudaStreamSynchronize(recvStream)); TP_CUDA_CHECK(cudaMemcpy(data.data(), ptr, kSize, cudaMemcpyDefault)); EXPECT_THAT(data, ::testing::Each(0x42)); TP_CUDA_CHECK(cudaFree(ptr)); this->peers_->done(PeerGroup::kClient); this->peers_->join(PeerGroup::kClient); } }; CHANNEL_TEST(CudaXDTTChannelTestSuite, SendFromCpuToGpu); class SendFromGpuToCpuTest : public ClientServerChannelTestCase { static constexpr size_t kSize = 1024; void server(std::shared_ptr channel) override { TP_CUDA_CHECK(cudaSetDevice(0)); cudaStream_t sendStream; TP_CUDA_CHECK( cudaStreamCreateWithFlags(&sendStream, cudaStreamNonBlocking)); void* ptr; TP_CUDA_CHECK(cudaMalloc(&ptr, kSize)); // Set buffer to target value. TP_CUDA_CHECK(cudaMemsetAsync(ptr, 0x42, kSize, sendStream)); // Perform send and wait for completion. auto sendPromise = std::make_shared>(); auto sendFuture = sendPromise->get_future(); channel->send( CudaBuffer{ .ptr = ptr, .stream = sendStream, }, kSize, [sendPromise{std::move(sendPromise)}](const tensorpipe::Error& error) { sendPromise->set_value(error); }); Error sendError = sendFuture.get(); EXPECT_FALSE(sendError) << sendError.what(); TP_CUDA_CHECK(cudaFree(ptr)); this->peers_->done(PeerGroup::kServer); this->peers_->join(PeerGroup::kServer); } void client(std::shared_ptr channel) override { // Perform recv and wait for completion. 
auto recvPromise = std::make_shared>(); auto recvFuture = recvPromise->get_future(); std::vector data(kSize); channel->recv( CpuBuffer{ .ptr = data.data(), }, kSize, [recvPromise{std::move(recvPromise)}](const tensorpipe::Error& error) { recvPromise->set_value(error); }); Error recvError = recvFuture.get(); EXPECT_FALSE(recvError) << recvError.what(); EXPECT_THAT(data, ::testing::Each(0x42)); this->peers_->done(PeerGroup::kClient); this->peers_->join(PeerGroup::kClient); } }; CHANNEL_TEST(CudaXDTTChannelTestSuite, SendFromGpuToCpu); class SendFromCpuToCpuTest : public ClientServerChannelTestCase { static constexpr size_t kSize = 1024; void server(std::shared_ptr channel) override { // Perform send and wait for completion. auto sendPromise = std::make_shared>(); auto sendFuture = sendPromise->get_future(); std::vector data(kSize, 0x42); channel->send( CpuBuffer{ .ptr = data.data(), }, kSize, [sendPromise{std::move(sendPromise)}](const tensorpipe::Error& error) { sendPromise->set_value(error); }); Error sendError = sendFuture.get(); EXPECT_FALSE(sendError) << sendError.what(); this->peers_->done(PeerGroup::kServer); this->peers_->join(PeerGroup::kServer); } void client(std::shared_ptr channel) override { // Perform recv and wait for completion. auto recvPromise = std::make_shared>(); auto recvFuture = recvPromise->get_future(); std::vector data(kSize); channel->recv( CpuBuffer{ .ptr = data.data(), }, kSize, [recvPromise{std::move(recvPromise)}](const tensorpipe::Error& error) { recvPromise->set_value(error); }); Error recvError = recvFuture.get(); EXPECT_FALSE(recvError) << recvError.what(); EXPECT_THAT(data, ::testing::Each(0x42)); this->peers_->done(PeerGroup::kClient); this->peers_->join(PeerGroup::kClient); } }; CHANNEL_TEST(CudaXDTTChannelTestSuite, SendFromCpuToCpu); ================================================ FILE: tensorpipe/test/channel/cma/CMakeLists.txt ================================================ # Copyright (c) Meta Platforms, Inc. 
and affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. add_executable(tensorpipe_channel_cma_probe probe.cc ) target_link_libraries(tensorpipe_channel_cma_probe PRIVATE tensorpipe ) ================================================ FILE: tensorpipe/test/channel/cma/cma_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include namespace { class CmaChannelTestHelper : public CpuChannelTestHelper { protected: std::shared_ptr makeContextInternal( std::string id) override { auto context = tensorpipe::channel::cma::create(); context->setId(std::move(id)); return context; } }; CmaChannelTestHelper helper; } // namespace INSTANTIATE_TEST_CASE_P(Cma, ChannelTestSuite, ::testing::Values(&helper)); INSTANTIATE_TEST_CASE_P(Cma, CpuChannelTestSuite, ::testing::Values(&helper)); ================================================ FILE: tensorpipe/test/channel/cma/docker_tests.sh ================================================ #!/usr/bin/env bash # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. # We use a lot of trailing backslashes inside single-quoted string literals when # we pass sub-scripts to sh -c, in order to wrap lines for long commands. # Removing them would be incorrect, hence we just silence the linter warning. # shellcheck disable=SC1004 set -eo pipefail echo "Both endpoints in same vanilla container" # This is not supposed to work, as Docker by default has a seccomp-bpf rule that # blocks the process_vm_readv syscall. 
# See https://jvns.ca/blog/2020/04/29/why-strace-doesnt-work-in-docker/
# and https://docs.docker.com/engine/security/seccomp/
TEMPDIR=$(mktemp --directory)
chmod ugo+rwx "$TEMPDIR"
echo "Using $TEMPDIR for staging data"
docker run \
  --volume "$TEMPDIR:/tmp/report" \
  --volume "$(pwd)/build:/tmp/build" \
  cimg/base:2020.01 \
  sh -c ' \
    TP_VERBOSE_LOGGING=5 \
    /tmp/build/tensorpipe/test/channel/cma/tensorpipe_channel_cma_probe \
    0 /tmp/report/socket \
    > /tmp/report/probe1_report.json & \
    probe1_pid=$!; \
    while [ ! -S /tmp/report/socket ]; do sleep 0.1; done; \
    TP_VERBOSE_LOGGING=5 \
    /tmp/build/tensorpipe/test/channel/cma/tensorpipe_channel_cma_probe \
    1 /tmp/report/socket \
    > /tmp/report/probe2_report.json & \
    probe2_pid=$!; \
    wait $probe1_pid; \
    wait $probe2_pid'
python3 \
  "$(pwd)/tensorpipe/test/channel/cma/probe_report_checker.py" \
  "$TEMPDIR/probe1_report.json" "$TEMPDIR/probe2_report.json" 0

echo "Both endpoints in same container, seccomp-bpf disabled"
# This fixes the above problem, and makes it work.
TEMPDIR=$(mktemp --directory)
chmod ugo+rwx "$TEMPDIR"
echo "Using $TEMPDIR for staging data"
docker run \
  --volume "$TEMPDIR:/tmp/report" \
  --volume "$(pwd)/build:/tmp/build" \
  --security-opt seccomp=unconfined \
  cimg/base:2020.01 \
  sh -c ' \
    TP_VERBOSE_LOGGING=5 \
    /tmp/build/tensorpipe/test/channel/cma/tensorpipe_channel_cma_probe \
    0 /tmp/report/socket \
    > /tmp/report/probe1_report.json & \
    probe1_pid=$!; \
    while [ ! -S /tmp/report/socket ]; do sleep 0.1; done; \
    TP_VERBOSE_LOGGING=5 \
    /tmp/build/tensorpipe/test/channel/cma/tensorpipe_channel_cma_probe \
    1 /tmp/report/socket \
    > /tmp/report/probe2_report.json & \
    probe2_pid=$!; \
    wait $probe1_pid; \
    wait $probe2_pid'
python3 \
  "$(pwd)/tensorpipe/test/channel/cma/probe_report_checker.py" \
  "$TEMPDIR/probe1_report.json" "$TEMPDIR/probe2_report.json" 1

echo "Both endpoints in same container, capability SYS_PTRACE added"
# This should not really matter, but Docker adds a "side effect" to this which
# also re-enables process_vm_readv in seccomp-bpf.
TEMPDIR=$(mktemp --directory)
chmod ugo+rwx "$TEMPDIR"
echo "Using $TEMPDIR for staging data"
docker run \
  --volume "$TEMPDIR:/tmp/report" \
  --volume "$(pwd)/build:/tmp/build" \
  --cap-add SYS_PTRACE \
  cimg/base:2020.01 \
  sh -c ' \
    TP_VERBOSE_LOGGING=5 \
    /tmp/build/tensorpipe/test/channel/cma/tensorpipe_channel_cma_probe \
    0 /tmp/report/socket \
    > /tmp/report/probe1_report.json & \
    probe1_pid=$!; \
    while [ ! -S /tmp/report/socket ]; do sleep 0.1; done; \
    TP_VERBOSE_LOGGING=5 \
    /tmp/build/tensorpipe/test/channel/cma/tensorpipe_channel_cma_probe \
    1 /tmp/report/socket \
    > /tmp/report/probe2_report.json & \
    probe2_pid=$!; \
    wait $probe1_pid; \
    wait $probe2_pid'
python3 \
  "$(pwd)/tensorpipe/test/channel/cma/probe_report_checker.py" \
  "$TEMPDIR/probe1_report.json" "$TEMPDIR/probe2_report.json" 1

echo "Both endpoints in same container, privileged"
# This should not really matter, but Docker adds a "side effect" to this which
# also re-enables process_vm_readv in seccomp-bpf.
TEMPDIR=$(mktemp --directory)
chmod ugo+rwx "$TEMPDIR"
echo "Using $TEMPDIR for staging data"
docker run \
  --volume "$TEMPDIR:/tmp/report" \
  --volume "$(pwd)/build:/tmp/build" \
  --privileged \
  cimg/base:2020.01 \
  sh -c ' \
    TP_VERBOSE_LOGGING=5 \
    /tmp/build/tensorpipe/test/channel/cma/tensorpipe_channel_cma_probe \
    0 /tmp/report/socket \
    > /tmp/report/probe1_report.json & \
    probe1_pid=$!; \
    while [ ! -S /tmp/report/socket ]; do sleep 0.1; done; \
    TP_VERBOSE_LOGGING=5 \
    /tmp/build/tensorpipe/test/channel/cma/tensorpipe_channel_cma_probe \
    1 /tmp/report/socket \
    > /tmp/report/probe2_report.json & \
    probe2_pid=$!; \
    wait $probe1_pid; \
    wait $probe2_pid'
python3 \
  "$(pwd)/tensorpipe/test/channel/cma/probe_report_checker.py" \
  "$TEMPDIR/probe1_report.json" "$TEMPDIR/probe2_report.json" 1

echo "Both endpoints in same container, stronger YAMA limits"
# CMA is able to work under YAMA when the latter is set to levels 0 or 1, as
# in the first case YAMA adds no extra limit and in the second case CMA will
# configure YAMA so that it allows the process to be ptraced by any other one.
# However CMA can't handle YAMA at level 2 or higher.
# We keep disabling seccomp-bpf as otherwise this would be shadowed.
sudo sh -c 'echo 2 > /proc/sys/kernel/yama/ptrace_scope'
TEMPDIR=$(mktemp --directory)
chmod ugo+rwx "$TEMPDIR"
echo "Using $TEMPDIR for staging data"
docker run \
  --volume "$TEMPDIR:/tmp/report" \
  --volume "$(pwd)/build:/tmp/build" \
  --security-opt seccomp=unconfined \
  cimg/base:2020.01 \
  sh -c ' \
    TP_VERBOSE_LOGGING=5 \
    /tmp/build/tensorpipe/test/channel/cma/tensorpipe_channel_cma_probe \
    0 /tmp/report/socket \
    > /tmp/report/probe1_report.json & \
    probe1_pid=$!; \
    while [ ! -S /tmp/report/socket ]; do sleep 0.1; done; \
    TP_VERBOSE_LOGGING=5 \
    /tmp/build/tensorpipe/test/channel/cma/tensorpipe_channel_cma_probe \
    1 /tmp/report/socket \
    > /tmp/report/probe2_report.json & \
    probe2_pid=$!; \
    wait $probe1_pid; \
    wait $probe2_pid'
python3 \
  "$(pwd)/tensorpipe/test/channel/cma/probe_report_checker.py" \
  "$TEMPDIR/probe1_report.json" "$TEMPDIR/probe2_report.json" 0
sudo sh -c 'echo 1 > /proc/sys/kernel/yama/ptrace_scope'

# TODO
# echo "Both endpoints in same container, different users/groups"

# TODO
# echo "Both endpoints in same container, same users/groups but different effective user/group"

echo "Each endpoint in own container, with separate namespace"
# This isn't supposed to work, as each container gets its own user and PID
# namespace, but CMA needs them to match. We disable seccomp-bpf to give this
# test a fighting chance.
TEMPDIR=$(mktemp --directory)
chmod ugo+rwx "$TEMPDIR"
echo "Using $TEMPDIR for staging data"
docker run \
  --volume "$TEMPDIR:/tmp/report" \
  --volume "$(pwd)/build:/tmp/build" \
  --security-opt seccomp=unconfined \
  cimg/base:2020.01 \
  sh -c ' \
    TP_VERBOSE_LOGGING=5 \
    /tmp/build/tensorpipe/test/channel/cma/tensorpipe_channel_cma_probe \
    0 /tmp/report/socket \
    > /tmp/report/probe1_report.json' &
probe1_pid=$!
while [ ! -S "$TEMPDIR/socket" ]; do sleep 0.1; done
docker run \
  --volume "$TEMPDIR:/tmp/report" \
  --volume "$(pwd)/build:/tmp/build" \
  --security-opt seccomp=unconfined \
  cimg/base:2020.01 \
  sh -c ' \
    TP_VERBOSE_LOGGING=5 \
    /tmp/build/tensorpipe/test/channel/cma/tensorpipe_channel_cma_probe \
    1 /tmp/report/socket \
    > /tmp/report/probe2_report.json' &
probe2_pid=$!
wait $probe1_pid
wait $probe2_pid
python3 \
  "$(pwd)/tensorpipe/test/channel/cma/probe_report_checker.py" \
  "$TEMPDIR/probe1_report.json" "$TEMPDIR/probe2_report.json" 0

# Docker allows a container to reuse another one's PID namespace, but doesn't
# allow the same for user namespaces.
echo "Each endpoint in own container, reusing host namespaces" # This should fix the issues of the above. TEMPDIR=$(mktemp --directory) chmod ugo+rwx "$TEMPDIR" echo "Using $TEMPDIR for staging data" docker run \ --volume "$TEMPDIR:/tmp/report" \ --volume "$(pwd)/build:/tmp/build" \ --security-opt seccomp=unconfined \ --pid host \ --userns host \ cimg/base:2020.01 \ sh -c ' \ TP_VERBOSE_LOGGING=5 \ /tmp/build/tensorpipe/test/channel/cma/tensorpipe_channel_cma_probe \ 0 /tmp/report/socket \ > /tmp/report/probe1_report.json' & probe1_pid=$! while [ ! -S "$TEMPDIR/socket" ]; do sleep 0.1; done docker run \ --volume "$TEMPDIR:/tmp/report" \ --volume "$(pwd)/build:/tmp/build" \ --security-opt seccomp=unconfined \ --pid host \ --userns host \ cimg/base:2020.01 \ sh -c ' \ TP_VERBOSE_LOGGING=5 \ /tmp/build/tensorpipe/test/channel/cma/tensorpipe_channel_cma_probe \ 1 /tmp/report/socket \ > /tmp/report/probe2_report.json' & probe2_pid=$! wait $probe1_pid wait $probe2_pid python3 \ "$(pwd)/tensorpipe/test/channel/cma/probe_report_checker.py" \ "$TEMPDIR/probe1_report.json" "$TEMPDIR/probe2_report.json" 1 echo "Each endpoint in own container, privileged, sharing PID namespace" # This should also help. TEMPDIR=$(mktemp --directory) chmod ugo+rwx "$TEMPDIR" echo "Using $TEMPDIR for staging data" docker run \ --cidfile "$TEMPDIR/probe1_container_id" \ --volume "$TEMPDIR:/tmp/report" \ --volume "$(pwd)/build:/tmp/build" \ --privileged \ cimg/base:2020.01 \ sh -c ' \ TP_VERBOSE_LOGGING=5 \ /tmp/build/tensorpipe/test/channel/cma/tensorpipe_channel_cma_probe \ 0 /tmp/report/socket \ > /tmp/report/probe1_report.json' & probe1_pid=$! while [ ! 
-S "$TEMPDIR/socket" ]; do sleep 0.1; done docker run \ --volume "$TEMPDIR:/tmp/report" \ --volume "$(pwd)/build:/tmp/build" \ --pid "container:$(cat "$TEMPDIR/probe1_container_id")" \ --privileged \ cimg/base:2020.01 \ sh -c ' \ TP_VERBOSE_LOGGING=5 \ /tmp/build/tensorpipe/test/channel/cma/tensorpipe_channel_cma_probe \ 1 /tmp/report/socket \ > /tmp/report/probe2_report.json' & probe2_pid=$! wait $probe1_pid wait $probe2_pid python3 \ "$(pwd)/tensorpipe/test/channel/cma/probe_report_checker.py" \ "$TEMPDIR/probe1_report.json" "$TEMPDIR/probe2_report.json" 1 echo "One endpoint on host, other in container, with own namespace" # This isn't supposed to work, as each container gets its own user and PID # namespace, but CMA needs them to match. We disable seccomp-bpf to give this # test a fighting chance. And also AppArmor, as it starts mattering here, # because Docker sets its own profile (docker-default) which is different than # the host's one (unconfined). TEMPDIR=$(mktemp --directory) chmod ugo+rwx "$TEMPDIR" echo "Using $TEMPDIR for staging data" docker run \ --volume "$TEMPDIR:/tmp/report" \ --volume "$(pwd)/build:/tmp/build" \ --security-opt seccomp=unconfined \ --security-opt apparmor=unconfined \ --user "$(id -u):$(id -g)" \ cimg/base:2020.01 \ sh -c ' \ TP_VERBOSE_LOGGING=5 \ /tmp/build/tensorpipe/test/channel/cma/tensorpipe_channel_cma_probe \ 0 /tmp/report/socket \ > /tmp/report/probe1_report.json' & probe1_pid=$! while [ ! -S "$TEMPDIR/socket" ]; do sleep 0.1; done sudo chmod ugo+rwx "$TEMPDIR"/socket TP_VERBOSE_LOGGING=5 \ "$(pwd)/build/tensorpipe/test/channel/cma/tensorpipe_channel_cma_probe" \ 1 "$TEMPDIR/socket" \ > "$TEMPDIR/probe2_report.json" & probe2_pid=$! wait $probe1_pid wait $probe2_pid python3 \ "$(pwd)/tensorpipe/test/channel/cma/probe_report_checker.py" \ "$TEMPDIR/probe1_report.json" "$TEMPDIR/probe2_report.json" 0 echo "One endpoint on host, other in container, reusing host namespace" # This should fix the issues of the above. 
TEMPDIR=$(mktemp --directory) chmod ugo+rwx "$TEMPDIR" echo "Using $TEMPDIR for staging data" docker run \ --volume "$TEMPDIR:/tmp/report" \ --volume "$(pwd)/build:/tmp/build" \ --security-opt seccomp=unconfined \ --security-opt apparmor=unconfined \ --pid host \ --userns host \ --user "$(id -u):$(id -g)" \ cimg/base:2020.01 \ sh -c ' \ TP_VERBOSE_LOGGING=5 \ /tmp/build/tensorpipe/test/channel/cma/tensorpipe_channel_cma_probe \ 0 /tmp/report/socket \ > /tmp/report/probe1_report.json' & probe1_pid=$! while [ ! -S "$TEMPDIR/socket" ]; do sleep 0.1; done sudo chmod ugo+rwx "$TEMPDIR"/socket TP_VERBOSE_LOGGING=5 \ "$(pwd)/build/tensorpipe/test/channel/cma/tensorpipe_channel_cma_probe" \ 1 "$TEMPDIR/socket" \ > "$TEMPDIR/probe2_report.json" & probe2_pid=$! wait $probe1_pid wait $probe2_pid python3 \ "$(pwd)/tensorpipe/test/channel/cma/probe_report_checker.py" \ "$TEMPDIR/probe1_report.json" "$TEMPDIR/probe2_report.json" 1 echo "One endpoint on host, other in container, privileged" # This should also help. TEMPDIR=$(mktemp --directory) chmod ugo+rwx "$TEMPDIR" echo "Using $TEMPDIR for staging data" docker run \ --volume "$TEMPDIR:/tmp/report" \ --volume "$(pwd)/build:/tmp/build" \ --security-opt seccomp=unconfined \ --security-opt apparmor=unconfined \ --pid host \ --user "$(id -u):$(id -g)" \ --privileged \ cimg/base:2020.01 \ sh -c ' \ TP_VERBOSE_LOGGING=5 \ /tmp/build/tensorpipe/test/channel/cma/tensorpipe_channel_cma_probe \ 0 /tmp/report/socket \ > /tmp/report/probe1_report.json' & probe1_pid=$! while [ ! -S "$TEMPDIR/socket" ]; do sleep 0.1; done sudo chmod ugo+rwx "$TEMPDIR"/socket TP_VERBOSE_LOGGING=5 \ "$(pwd)/build/tensorpipe/test/channel/cma/tensorpipe_channel_cma_probe" \ 1 "$TEMPDIR/socket" \ > "$TEMPDIR/probe2_report.json" & probe2_pid=$! 
wait $probe1_pid wait $probe2_pid python3 \ "$(pwd)/tensorpipe/test/channel/cma/probe_report_checker.py" \ "$TEMPDIR/probe1_report.json" "$TEMPDIR/probe2_report.json" 1 echo "Both endpoints on host" # Should be a no-brainer? TEMPDIR=$(mktemp --directory) chmod ugo+rwx "$TEMPDIR" echo "Using $TEMPDIR for staging data" TP_VERBOSE_LOGGING=5 \ "$(pwd)/build/tensorpipe/test/channel/cma/tensorpipe_channel_cma_probe" \ 0 "$TEMPDIR/socket" \ > "$TEMPDIR/probe1_report.json" & probe1_pid=$! while [ ! -S "$TEMPDIR/socket" ]; do sleep 0.1; done TP_VERBOSE_LOGGING=5 \ "$(pwd)/build/tensorpipe/test/channel/cma/tensorpipe_channel_cma_probe" \ 1 "$TEMPDIR/socket" \ > "$TEMPDIR/probe2_report.json" & probe2_pid=$! wait $probe1_pid wait $probe2_pid python3 \ "$(pwd)/tensorpipe/test/channel/cma/probe_report_checker.py" \ "$TEMPDIR/probe1_report.json" "$TEMPDIR/probe2_report.json" 1 ================================================ FILE: tensorpipe/test/channel/cma/probe.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #include #include #include #include #include #include namespace {} int main(int argc, char* argv[]) { TP_THROW_ASSERT_IF(argc < 1); if (argc != 3) { TP_LOG_INFO() << "Usage: " << argv[0] << " [rank] [path to a UNIX domain socket]"; return 0; } TP_LOG_INFO() << "My PID is " << ::getpid(); int rank = std::strtol(argv[1], nullptr, 10); int rv; int fd = ::socket(AF_UNIX, SOCK_STREAM, 0); TP_THROW_SYSTEM_IF(fd < 0, errno); struct sockaddr_un socketAddr; std::memset(&socketAddr, 0, sizeof(struct sockaddr_un)); socketAddr.sun_family = AF_UNIX; std::strcpy(socketAddr.sun_path, argv[2]); if (rank == 0) { rv = ::bind( fd, reinterpret_cast(&socketAddr), sizeof(struct sockaddr_un)); TP_THROW_SYSTEM_IF(rv < 0, errno); rv = ::listen(fd, 0); TP_THROW_SYSTEM_IF(rv < 0, errno); struct sockaddr_storage peerAddr; socklen_t peerAddrlen = sizeof(struct sockaddr_storage); do { rv = ::accept( fd, reinterpret_cast(&peerAddr), &peerAddrlen); TP_THROW_SYSTEM_IF(rv < 0 && errno != EINTR, errno); } while (rv < 0); int otherFd = rv; rv = ::close(fd); TP_THROW_SYSTEM_IF(rv < 0, errno); rv = ::unlink(argv[2]); TP_THROW_SYSTEM_IF(rv < 0, errno); fd = otherFd; } else { do { rv = ::connect( fd, reinterpret_cast(&socketAddr), sizeof(struct sockaddr_un)); TP_THROW_SYSTEM_IF(rv < 0 && errno != EINTR, errno); } while (rv < 0); } struct ucred peerCreds; std::memset(&peerCreds, 0, sizeof(struct ucred)); socklen_t peerCredsLen = sizeof(struct ucred); rv = ::getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &peerCreds, &peerCredsLen); pid_t peerPid = peerCreds.pid; TP_LOG_INFO() << "The peer's PID is " << peerPid; rv = ::prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY, 0, 0, 0); TP_THROW_SYSTEM_IF(rv < 0, errno); uint64_t outbox = 0x0123456789abcdef; void* outboxPtr = &outbox; TP_LOG_INFO() << "My outbox's address is 0x" << std::hex << reinterpret_cast(outboxPtr); rv = ::write(fd, &outboxPtr, sizeof(void*)); TP_THROW_SYSTEM_IF(rv < 0, errno); TP_THROW_ASSERT_IF(rv != sizeof(void*)); void* 
peerOutboxPtr; rv = ::read(fd, &peerOutboxPtr, sizeof(void*)); TP_THROW_SYSTEM_IF(rv < 0, errno); TP_THROW_ASSERT_IF(rv != sizeof(void*)); TP_LOG_INFO() << "The peer's inbox address is 0x" << std::hex << reinterpret_cast(peerOutboxPtr); uint64_t inbox; struct iovec localIov; std::memset(&localIov, 0, sizeof(struct iovec)); localIov.iov_base = &inbox; localIov.iov_len = sizeof(uint64_t); struct iovec remoteIov; std::memset(&remoteIov, 0, sizeof(struct iovec)); remoteIov.iov_base = peerOutboxPtr; remoteIov.iov_len = sizeof(uint64_t); ssize_t result = ::process_vm_readv(peerPid, &localIov, 1, &remoteIov, 1, 0); TP_LOG_INFO() << "Calling process_vm_readv returned " << result << ", errno is set to " << errno << " and my inbox now has value 0x" << std::hex << inbox; bool successful = false; if (result >= 0) { TP_THROW_ASSERT_IF(result != sizeof(uint64_t)); TP_THROW_ASSERT_IF(inbox != 0x0123456789abcdef); successful = true; } uint8_t ack; rv = ::write(fd, &ack, sizeof(uint8_t)); TP_THROW_SYSTEM_IF(rv < 0, errno); TP_THROW_ASSERT_IF(rv != sizeof(uint8_t)); rv = ::read(fd, &ack, sizeof(uint8_t)); TP_THROW_SYSTEM_IF(rv < 0, errno); TP_THROW_ASSERT_IF(rv != sizeof(uint8_t)); rv = ::close(fd); TP_THROW_SYSTEM_IF(rv < 0, errno); auto ctx = tensorpipe::channel::cma::create(); TP_LOG_INFO() << "The CMA context's viability is: " << std::boolalpha << ctx->isViable(); std::string descriptor; if (ctx->isViable()) { auto cpuDevice = tensorpipe::Device{tensorpipe::kCpuDeviceType, 0}; auto deviceDescriptors = ctx->deviceDescriptors(); auto iter = deviceDescriptors.find(cpuDevice); TP_DCHECK(iter != deviceDescriptors.end()); descriptor = iter->second; } TP_LOG_INFO() << "Its descriptor is: " << descriptor; std::cout << "{\"syscall_success\": " << successful << ", \"viability\": " << ctx->isViable() << ", \"device_descriptor\": \"" << descriptor << "\"}" << std::endl; } ================================================ FILE: tensorpipe/test/channel/cma/probe_report_checker.py 
================================================ #!/usr/bin/env python3 # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. import json import sys if __name__ == "__main__": if len(sys.argv) < 1: raise RuntimeError() if len(sys.argv) != 4: print( f"Usage: {sys.argv[0]} [first report] [second report] [supposed to work]", file=sys.stderr, ) sys.exit(0) with open(sys.argv[1], "rb") as f: first_report = json.load(f) with open(sys.argv[2], "rb") as f: second_report = json.load(f) supposed_to_work = int(sys.argv[3]) worked_in_practice = ( first_report["syscall_success"] == 1 and second_report["syscall_success"] == 1 ) if worked_in_practice != supposed_to_work: raise RuntimeError( f"The syscall didn't behave as the test expected it to. It " f"{'succeeded' if worked_in_practice else 'failed'} whereas it was " f"supposed to {'succeed' if supposed_to_work else 'fail'}." ) detected_as_working = ( first_report["viability"] == 1 and second_report["viability"] == 1 and first_report["device_descriptor"] == second_report["device_descriptor"] ) if detected_as_working != worked_in_practice: print( f"The CMA autodetection didn't correctly predict the behavior of the " f"syscall. It determined it would " f"{'succeed' if detected_as_working else 'fail'} whereas it actually " f"{'succeeded' if worked_in_practice else 'failed'}.", file=sys.stderr, ) sys.exit(1) sys.exit(0) ================================================ FILE: tensorpipe/test/channel/cuda_basic/cuda_basic_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #include namespace { class CudaBasicChannelTestHelper : public CudaChannelTestHelper { protected: std::shared_ptr makeContextInternal( std::string id) override { auto cpuContext = tensorpipe::channel::basic::create(); auto context = tensorpipe::channel::cuda_basic::create(std::move(cpuContext)); context->setId(std::move(id)); return context; } public: std::shared_ptr makePeerGroup() override { return std::make_shared(); } }; CudaBasicChannelTestHelper helper; class CudaBasicChannelTestSuite : public ChannelTestSuite {}; } // namespace class CannotCommunicateCpuToCpuTest : public ChannelTestCase { public: void run(ChannelTestHelper* /* unused */) override { ForkedThreadPeerGroup pg; pg.spawn( [&]() { auto cpuContext = tensorpipe::channel::basic::create(); auto ctx = tensorpipe::channel::cuda_basic::create(std::move(cpuContext)); auto deviceDescriptors = ctx->deviceDescriptors(); auto it = deviceDescriptors.find( tensorpipe::Device{tensorpipe::kCpuDeviceType, 0}); EXPECT_FALSE(it == deviceDescriptors.end()); auto descriptor = it->second; EXPECT_FALSE(ctx->canCommunicateWithRemote(descriptor, descriptor)); }, [&]() { // Do nothing. }); } }; CHANNEL_TEST(CudaBasicChannelTestSuite, CannotCommunicateCpuToCpu); INSTANTIATE_TEST_CASE_P( CudaBasic, ChannelTestSuite, ::testing::Values(&helper)); INSTANTIATE_TEST_CASE_P( CudaBasic, CudaChannelTestSuite, ::testing::Values(&helper)); INSTANTIATE_TEST_CASE_P( CudaBasic, CudaMultiGPUChannelTestSuite, ::testing::Values(&helper)); INSTANTIATE_TEST_CASE_P( CudaBasic, CudaXDTTChannelTestSuite, ::testing::Values(&helper)); INSTANTIATE_TEST_CASE_P( CudaBasic, CudaBasicChannelTestSuite, ::testing::Values(&helper)); ================================================ FILE: tensorpipe/test/channel/cuda_gdr/cuda_gdr_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include namespace { class CudaGdrChannelTestHelper : public CudaChannelTestHelper { protected: std::shared_ptr makeContextInternal( std::string id) override { auto context = tensorpipe::channel::cuda_gdr::create(); context->setId(std::move(id)); return context; } public: std::shared_ptr makePeerGroup() override { return std::make_shared(); } }; CudaGdrChannelTestHelper helper; } // namespace INSTANTIATE_TEST_CASE_P(CudaGdr, ChannelTestSuite, ::testing::Values(&helper)); INSTANTIATE_TEST_CASE_P( CudaGdr, CudaChannelTestSuite, ::testing::Values(&helper)); INSTANTIATE_TEST_CASE_P( CudaGdr, CudaMultiGPUChannelTestSuite, ::testing::Values(&helper)); ================================================ FILE: tensorpipe/test/channel/cuda_helpers.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include #include #include #include #include #include #include #include namespace tensorpipe { inline bool isContextOpenOnDevice(const NvmlLib& nvmlLib, nvmlDevice_t device) { unsigned int count = 0; std::vector processInfos; while (true) { nvmlReturn_t res = nvmlLib.deviceGetComputeRunningProcesses( device, &count, processInfos.data()); processInfos.resize(count); if (res == NVML_SUCCESS) { break; } if (res == NVML_ERROR_INSUFFICIENT_SIZE) { continue; } TP_NVML_CHECK(nvmlLib, res); } pid_t myPid = ::getpid(); for (const nvmlProcessInfo_t& processInfo : processInfos) { if (processInfo.pid == myPid) { return true; } } return false; } inline ::testing::AssertionResult initializedCudaContexts( const std::vector& expectedDeviceIndices) { // This check won't work when the test is running in a PID namespace, as NVML // will return the PIDs in the root namespace but it doesn't seem possible for // us to map them back to our namespace. Hence we use an env var to allow to // disable this check in such environments. char* shouldSkip = std::getenv("TP_SKIP_CHECK_OPEN_CUDA_CTXS"); if (shouldSkip != nullptr) { return ::testing::AssertionSuccess(); } Error error; CudaLib cudaLib; std::tie(error, cudaLib) = CudaLib::create(); TP_THROW_ASSERT_IF(error) << error.what(); NvmlLib nvmlLib; std::tie(error, nvmlLib) = NvmlLib::create(); TP_THROW_ASSERT_IF(error) << error.what(); std::vector uuids = getUuidsOfVisibleDevices(cudaLib); for (int deviceIdx = 0; deviceIdx < uuids.size(); deviceIdx++) { // NVML uses a different format for UUIDs. 
std::string nvmlUuid = "GPU-" + uuids[deviceIdx]; nvmlDevice_t nvmlDevice; TP_NVML_CHECK( nvmlLib, nvmlLib.deviceGetHandleByUUID(nvmlUuid.c_str(), &nvmlDevice)); bool actualHasCtx = isContextOpenOnDevice(nvmlLib, nvmlDevice); bool expectedHasCtx = std::find( expectedDeviceIndices.begin(), expectedDeviceIndices.end(), deviceIdx) != expectedDeviceIndices.end(); if (actualHasCtx && !expectedHasCtx) { return ::testing::AssertionFailure() << "a CUDA context was initialized on device #" << deviceIdx << " but that shouldn't have happened"; } if (!actualHasCtx && expectedHasCtx) { return ::testing::AssertionFailure() << "a CUDA context should have been initialized on device #" << deviceIdx << " but that didn't happen"; } } return ::testing::AssertionSuccess(); } } // namespace tensorpipe ================================================ FILE: tensorpipe/test/channel/cuda_ipc/cuda_ipc_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include namespace { class CudaIpcChannelTestHelper : public CudaChannelTestHelper { protected: std::shared_ptr makeContextInternal( std::string id) override { auto context = tensorpipe::channel::cuda_ipc::create(); context->setId(std::move(id)); return context; } public: std::shared_ptr makePeerGroup() override { return std::make_shared(); } }; CudaIpcChannelTestHelper helper; class CudaIpcChannelTestSuite : public ChannelTestSuite {}; } // namespace class CannotCommunicateInSameProcessTest : public ChannelTestCase { public: void run(ChannelTestHelper* /* unused */) override { ForkedThreadPeerGroup pg; pg.spawn( [&]() { auto ctx = tensorpipe::channel::cuda_ipc::create(); auto deviceDescriptors = ctx->deviceDescriptors(); EXPECT_GT(deviceDescriptors.size(), 0); auto descriptor = deviceDescriptors.begin()->second; // From within a given process, the device descriptors will be the // same. EXPECT_FALSE(ctx->canCommunicateWithRemote(descriptor, descriptor)); }, [&]() { // Do nothing. }); } }; CHANNEL_TEST(CudaIpcChannelTestSuite, CannotCommunicateInSameProcess); INSTANTIATE_TEST_CASE_P(CudaIpc, ChannelTestSuite, ::testing::Values(&helper)); INSTANTIATE_TEST_CASE_P( CudaIpc, CudaChannelTestSuite, ::testing::Values(&helper)); INSTANTIATE_TEST_CASE_P( CudaIpc, CudaMultiGPUChannelTestSuite, ::testing::Values(&helper)); INSTANTIATE_TEST_CASE_P( CudaIpc, CudaIpcChannelTestSuite, ::testing::Values(&helper)); ================================================ FILE: tensorpipe/test/channel/cuda_xth/cuda_xth_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include namespace { class CudaXthChannelTestHelper : public CudaChannelTestHelper { protected: std::shared_ptr makeContextInternal( std::string id) override { auto context = tensorpipe::channel::cuda_xth::create(); context->setId(std::move(id)); return context; } public: std::shared_ptr makePeerGroup() override { return std::make_shared(); } }; CudaXthChannelTestHelper helper; } // namespace INSTANTIATE_TEST_CASE_P(CudaXth, ChannelTestSuite, ::testing::Values(&helper)); INSTANTIATE_TEST_CASE_P( CudaXth, CudaChannelTestSuite, ::testing::Values(&helper)); INSTANTIATE_TEST_CASE_P( CudaXth, CudaMultiGPUChannelTestSuite, ::testing::Values(&helper)); ================================================ FILE: tensorpipe/test/channel/kernel.cu ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include __global__ void _slowKernel(char* ptr, int sz) { int idx = blockIdx.x * blockDim.x + threadIdx.x; for (; idx < sz; idx += (gridDim.x * blockDim.x)) { for (int i = 0; i < 100000; ++i) { ptr[idx] += ptr[(idx + 1007) % sz] + i; } } } void slowKernel(void* ptr, int kSize, cudaStream_t stream) { _slowKernel<<<128, 128, 0, stream>>>((char*)ptr, kSize); } ================================================ FILE: tensorpipe/test/channel/kernel.cuh ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include // This kernel takes time and puts garbage data in the buffer. It is used to // test proper synchronization in CUDA channels. 
void slowKernel(void* ptr, int kSize, cudaStream_t stream); ================================================ FILE: tensorpipe/test/channel/mpt/mpt_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include namespace { class MptChannelTestHelper : public CpuChannelTestHelper { protected: std::shared_ptr makeContextInternal( std::string id) override { std::vector> contexts = { tensorpipe::transport::uv::create(), tensorpipe::transport::uv::create(), tensorpipe::transport::uv::create()}; std::vector> listeners = { contexts[0]->listen("127.0.0.1"), contexts[1]->listen("127.0.0.1"), contexts[2]->listen("127.0.0.1")}; auto context = tensorpipe::channel::mpt::create( std::move(contexts), std::move(listeners)); context->setId(std::move(id)); return context; } }; MptChannelTestHelper helper; class MptChannelTestSuite : public ChannelTestSuite {}; } // namespace class ContextIsNotJoinedTest : public ChannelTestCase { // Because it's static we must define it out-of-line (until C++-17, where we // can mark this inline). 
static const std::string kReady; public: void run(ChannelTestHelper* helper) override { auto addr = "127.0.0.1"; helper_ = helper; peers_ = helper_->makePeerGroup(); peers_->spawn( [&] { auto context = tensorpipe::transport::uv::create(); context->setId("server_harness"); auto listener = context->listen(addr); std::promise> connectionProm; listener->accept( [&](const tensorpipe::Error& error, std::shared_ptr connection) { ASSERT_FALSE(error) << error.what(); connectionProm.set_value(std::move(connection)); }); peers_->send(PeerGroup::kClient, listener->addr()); server(connectionProm.get_future().get()); context->join(); }, [&] { auto context = tensorpipe::transport::uv::create(); context->setId("client_harness"); auto laddr = peers_->recv(PeerGroup::kClient); client(context->connect(laddr)); context->join(); }); } void server(std::shared_ptr conn) { std::shared_ptr context = this->helper_->makeContext("server"); this->peers_->send(PeerGroup::kClient, kReady); context->createChannel( {std::move(conn)}, tensorpipe::channel::Endpoint::kListen); } void client(std::shared_ptr conn) { std::shared_ptr context = this->helper_->makeContext("client"); EXPECT_EQ(kReady, this->peers_->recv(PeerGroup::kClient)); context->createChannel( {std::move(conn)}, tensorpipe::channel::Endpoint::kConnect); } protected: ChannelTestHelper* helper_; std::shared_ptr peers_; }; const std::string ContextIsNotJoinedTest::kReady = "ready"; CHANNEL_TEST(MptChannelTestSuite, ContextIsNotJoined); INSTANTIATE_TEST_CASE_P(Mpt, ChannelTestSuite, ::testing::Values(&helper)); INSTANTIATE_TEST_CASE_P(Mpt, CpuChannelTestSuite, ::testing::Values(&helper)); INSTANTIATE_TEST_CASE_P(Mpt, MptChannelTestSuite, ::testing::Values(&helper)); ================================================ FILE: tensorpipe/test/channel/xth/xth_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include namespace { class XthChannelTestHelper : public CpuChannelTestHelper { protected: std::shared_ptr makeContextInternal( std::string id) override { auto context = tensorpipe::channel::xth::create(); context->setId(std::move(id)); return context; } }; XthChannelTestHelper helper; } // namespace INSTANTIATE_TEST_CASE_P(Xth, ChannelTestSuite, ::testing::Values(&helper)); INSTANTIATE_TEST_CASE_P(Xth, CpuChannelTestSuite, ::testing::Values(&helper)); ================================================ FILE: tensorpipe/test/common/cuda_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include #include namespace { tensorpipe::CudaLib getCudaLib() { tensorpipe::Error error; tensorpipe::CudaLib cudaLib; std::tie(error, cudaLib) = tensorpipe::CudaLib::create(); EXPECT_FALSE(error) << error.what(); return cudaLib; } } // namespace // This tests whether we can retrieve the index of the device on which a pointer // resides under "normal" circumstances (in the same context where it was // allocated, or in a "fresh" thread). 
TEST(Cuda, DeviceForPointer) { if (TestEnvironment::numCudaDevices() < 2) { GTEST_SKIP() << "Skipping test requiring >=2 CUDA devices."; } ForkedThreadPeerGroup pg; pg.spawn( [&]() { TP_CUDA_CHECK(cudaSetDevice(1)); void* ptr; TP_CUDA_CHECK(cudaMalloc(&ptr, 1024)); EXPECT_EQ(tensorpipe::cudaDeviceForPointer(getCudaLib(), ptr), 1); std::string ptrStr( reinterpret_cast(&ptr), reinterpret_cast(&ptr) + sizeof(void*)); pg.send(PeerGroup::kClient, ptrStr); }, [&]() { std::string ptrStr = pg.recv(PeerGroup::kClient); void* ptr = *reinterpret_cast(&ptrStr[0]); EXPECT_EQ(tensorpipe::cudaDeviceForPointer(getCudaLib(), ptr), 1); }); } // This tests whether we can retrieve the index of the device on which a pointer // resided after we've explicitly set the current device to an invalid value. // This is known to cause problems in recent versions of CUDA, possibly because // of a bug. TEST(Cuda, DeviceForPointerAfterReset) { if (TestEnvironment::numCudaDevices() < 2) { GTEST_SKIP() << "Skipping test requiring >=2 CUDA devices."; } ForkedThreadPeerGroup pg; pg.spawn( [&]() { TP_CUDA_CHECK(cudaSetDevice(1)); void* ptr; TP_CUDA_CHECK(cudaMalloc(&ptr, 1024)); TP_CUDA_CHECK(cudaSetDevice(0)); EXPECT_EQ(tensorpipe::cudaDeviceForPointer(getCudaLib(), ptr), 1); std::string ptrStr( reinterpret_cast(&ptr), reinterpret_cast(&ptr) + sizeof(void*)); pg.send(PeerGroup::kClient, ptrStr); }, [&]() { std::string ptrStr = pg.recv(PeerGroup::kClient); void* ptr = *reinterpret_cast(&ptrStr[0]); TP_CUDA_CHECK(cudaSetDevice(0)); EXPECT_EQ(tensorpipe::cudaDeviceForPointer(getCudaLib(), ptr), 1); }); } ================================================ FILE: tensorpipe/test/common/defs_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include TEST(Defs, Exception) { EXPECT_THROW(TP_THROW_EINVAL(), std::invalid_argument); EXPECT_THROW(TP_THROW_EINVAL() << "hola", std::invalid_argument); EXPECT_THROW(TP_THROW_EINVAL() << "adioshola", std::invalid_argument); EXPECT_THROW(TP_THROW_SYSTEM(ENODATA) << "adioshola", std::system_error); EXPECT_THROW(TP_THROW_SYSTEM(EBUSY), std::system_error); EXPECT_THROW(TP_THROW_SYSTEM(EBUSY) << "my message", std::system_error); } ================================================ FILE: tensorpipe/test/common/epoll_loop_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include using namespace tensorpipe; namespace { class Handler : public EpollLoop::EventHandler { public: void handleEventsFromLoop(int events) override { std::unique_lock lock(m_); events_.push_back(events); cv_.notify_all(); } int nextEvents() { std::unique_lock lock(m_); cv_.wait(lock, [&]() { return !events_.empty(); }); int events = events_.front(); events_.pop_front(); return events; } private: std::mutex m_; std::condition_variable cv_; std::deque events_; }; // Monitor an fd for events and execute function when triggered. // // The lifetime of an instance dictates when the specified function // may be called. The function is guaranteed to not be called after // the monitor has been destructed. 
// class FunctionEventHandler : public EpollLoop::EventHandler, public std::enable_shared_from_this { public: using TFunction = std::function; FunctionEventHandler( DeferredExecutor& deferredExecutor, EpollLoop& loop, int fd, int event, TFunction fn); ~FunctionEventHandler() override; void start(); void cancel(); void handleEventsFromLoop(int events) override; private: DeferredExecutor& deferredExecutor_; EpollLoop& loop_; const int fd_; const int event_; TFunction fn_; std::mutex mutex_; bool cancelled_{false}; }; FunctionEventHandler::FunctionEventHandler( DeferredExecutor& deferredExecutor, EpollLoop& loop, int fd, int event, TFunction fn) : deferredExecutor_(deferredExecutor), loop_(loop), fd_(fd), event_(event), fn_(std::move(fn)) {} FunctionEventHandler::~FunctionEventHandler() { cancel(); } void FunctionEventHandler::start() { deferredExecutor_.runInLoop( [&]() { loop_.registerDescriptor(fd_, event_, shared_from_this()); }); } void FunctionEventHandler::cancel() { std::unique_lock lock(mutex_); if (!cancelled_) { deferredExecutor_.runInLoop([&]() { loop_.unregisterDescriptor(fd_); cancelled_ = true; }); } } void FunctionEventHandler::handleEventsFromLoop(int events) { if (events & event_) { fn_(*this); } } // Instantiates an event monitor for the specified fd. template std::shared_ptr createMonitor( DeferredExecutor& reactor, EpollLoop& loop, std::shared_ptr shared, int fd, int event, std::function fn) { auto handler = std::make_shared( reactor, loop, fd, event, [weak{std::weak_ptr{shared}}, fn{std::move(fn)}](FunctionEventHandler& handler) { auto shared = weak.lock(); if (shared) { fn(*shared, handler); } }); handler->start(); return handler; } } // namespace TEST(ShmLoop, RegisterUnregister) { OnDemandDeferredExecutor deferredExecutor; EpollLoop loop{deferredExecutor}; auto handler = std::make_shared(); auto efd = Fd(eventfd(0, EFD_NONBLOCK)); { // Test if writable (always). 
deferredExecutor.runInLoop([&]() { loop.registerDescriptor(efd.fd(), EPOLLOUT | EPOLLONESHOT, handler); }); ASSERT_EQ(handler->nextEvents(), EPOLLOUT); efd.writeOrThrow(1337); // Test if readable (only if previously written to). deferredExecutor.runInLoop([&]() { loop.registerDescriptor(efd.fd(), EPOLLIN | EPOLLONESHOT, handler); }); ASSERT_EQ(handler->nextEvents(), EPOLLIN); ASSERT_EQ(efd.readOrThrow(), 1337); // Test if we can unregister the descriptor. deferredExecutor.runInLoop([&]() { loop.unregisterDescriptor(efd.fd()); }); } loop.join(); } TEST(ShmLoop, Monitor) { OnDemandDeferredExecutor deferredExecutor; EpollLoop loop{deferredExecutor}; auto efd = Fd(eventfd(0, EFD_NONBLOCK)); constexpr uint64_t kValue = 1337; { std::mutex mutex; std::condition_variable cv; bool done = false; // Test if writable (always). auto shared = std::make_shared(1338); auto monitor = createMonitor( deferredExecutor, loop, shared, efd.fd(), EPOLLOUT, [&](int& i, FunctionEventHandler& handler) { EXPECT_EQ(i, 1338); efd.writeOrThrow(kValue); handler.cancel(); { std::unique_lock lock(mutex); done = true; cv.notify_all(); } }); // Wait for monitor to trigger and perform a write. std::unique_lock lock(mutex); cv.wait(lock, [&]() { return done; }); } { std::mutex mutex; std::condition_variable cv; bool done = false; uint64_t value = 0; // Test if readable (only if previously written to). auto shared = std::make_shared(1338); auto monitor = createMonitor( deferredExecutor, loop, shared, efd.fd(), EPOLLIN, [&](int& i, FunctionEventHandler& handler) { EXPECT_EQ(i, 1338); value = efd.readOrThrow(); handler.cancel(); { std::unique_lock lock(mutex); done = true; cv.notify_all(); } }); // Wait for monitor to trigger and perform a read. std::unique_lock lock(mutex); cv.wait(lock, [&]() { return done; }); // Verify we read the correct value. 
ASSERT_EQ(value, kValue); } loop.join(); } TEST(ShmLoop, Defer) { OnDemandDeferredExecutor deferredExecutor; auto promise = std::make_shared>(); auto future = promise->get_future(); deferredExecutor.deferToLoop([promise]() { promise->set_value(); }); future.wait(); ASSERT_TRUE(future.valid()); } ================================================ FILE: tensorpipe/test/common/ringbuffer_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include using namespace tensorpipe; struct TestData { uint16_t a; uint16_t b; uint16_t c; bool operator==(const TestData& other) const { return a == other.a && b == other.b && c == other.c; } }; constexpr static int kNumRingbufferRoles = 2; constexpr static int kConsumerRoleIdx = 0; constexpr static int kProducerRoleIdx = 1; using Consumer = RingBufferRole; using Producer = RingBufferRole; // Holds and owns the memory for the ringbuffer's header and data. class RingBufferStorage { public: explicit RingBufferStorage(size_t size) : header_(size) {} RingBuffer getRb() { return {&header_, data_.get()}; } private: RingBufferHeader header_; std::unique_ptr data_ = std::make_unique(header_.kDataPoolByteSize); }; size_t usedSize(RingBuffer& rb) { return rb.getHeader().template readMarker() - rb.getHeader().template readMarker(); } TEST(RingBuffer, WriteCopy) { EXPECT_EQ(sizeof(TestData), 6); // 16 bytes buffer. Fits two full TestData (each 6). size_t size = 1u << 4; RingBufferStorage storage(size); RingBuffer rb = storage.getRb(); // Make a producer. Producer p{rb}; // Make a consumer. 
Consumer c{rb}; EXPECT_EQ(usedSize(rb), 0); TestData d0{.a = 0xBA98, .b = 0x7654, .c = 0xA312}; TestData d1{.a = 0xA987, .b = 0x7777, .c = 0x2812}; TestData d2{.a = 0xFFFF, .b = 0x3333, .c = 0x1212}; { ssize_t ret = p.write(&d0, sizeof(d0)); EXPECT_EQ(ret, sizeof(TestData)); } EXPECT_EQ(usedSize(rb), 6); { ssize_t ret = p.write(&d1, sizeof(d1)); EXPECT_EQ(ret, sizeof(TestData)); } EXPECT_EQ(usedSize(rb), 12); { ssize_t ret = p.write(&d2, sizeof(d2)); EXPECT_EQ(ret, -ENODATA) << "Needs 2 more bytes to write the 6 required, " "because 12 out of 16 are used."; } TestData r; { ssize_t ret = c.read(&r, sizeof(r)); EXPECT_EQ(ret, sizeof(r)); EXPECT_EQ(r, d0); } { ssize_t ret = c.read(&r, sizeof(r)); EXPECT_EQ(ret, sizeof(r)); EXPECT_EQ(r, d1); } // It should be empty by now. EXPECT_EQ(usedSize(rb), 0); { ssize_t ret = p.write(&d2, sizeof(d2)); EXPECT_EQ(ret, sizeof(TestData)); } { ssize_t ret = c.read(&r, sizeof(r)); EXPECT_EQ(ret, sizeof(r)); EXPECT_EQ(r, d2); } // It should be empty by now. EXPECT_EQ(usedSize(rb), 0); } TEST(RingBuffer, ReadMultipleElems) { // 256 bytes buffer. size_t size = 1u << 8u; RingBufferStorage storage(size); RingBuffer rb = storage.getRb(); // Make a producer. Producer p{rb}; // Make a consumer. Consumer c{rb}; EXPECT_EQ(usedSize(rb), 0); uint16_t n = 0xACAC; // fits 128 times { for (int i = 0; i < 128; ++i) { ssize_t ret = p.write(&n, sizeof(n)); EXPECT_EQ(ret, sizeof(n)); } // It must be full by now. EXPECT_EQ(usedSize(rb), 256); ssize_t ret = p.write(&n, sizeof(n)); EXPECT_EQ(ret, -ENODATA); } { uint8_t b = 0xEE; ssize_t ret = p.write(&b, sizeof(b)); EXPECT_EQ(ret, -ENODATA) << "Needs an extra byte"; } { // read the three bytes at once. ssize_t ret; ret = c.startTx(); EXPECT_EQ(ret, 0); std::array r; ret = c.readInTx(r.data(), sizeof(r)); EXPECT_EQ(ret, 3); EXPECT_EQ(r[0], 0xAC); EXPECT_EQ(r[1], 0xAC); EXPECT_EQ(r[2], 0xAC); ret = c.commitTx(); EXPECT_EQ(ret, 0); } { // read 253 bytes at once. 
ssize_t ret; ret = c.startTx(); EXPECT_EQ(ret, 0); std::array r; ret = c.readInTx(r.data(), sizeof(r)); EXPECT_EQ(ret, 253); for (int i = 0; i < 253; ++i) { EXPECT_EQ(r[i], 0xAC); } ret = c.commitTx(); EXPECT_EQ(ret, 0); } { // No more elements ssize_t ret; ret = c.startTx(); EXPECT_EQ(ret, 0); uint8_t ch; ret = c.readInTx(&ch, sizeof(ch)); EXPECT_EQ(ret, -ENODATA); ret = c.cancelTx(); EXPECT_EQ(ret, 0); EXPECT_TRUE(!c.inTx()) << "Canceled transaction should've been canceled"; } } TEST(RingBuffer, CopyWrapping) { // 8 bytes buffer. size_t size = 1u << 3; RingBufferStorage storage(size); RingBuffer rb = storage.getRb(); // Make a producer. Producer p{rb}; // Make a consumer. Consumer c{rb}; EXPECT_EQ(usedSize(rb), 0); uint8_t ch = 0xA7; uint64_t n = 0xFFFFFFFFFFFFFFFF; // Put one byte. EXPECT_EQ(rb.getHeader().template readMarker(), 0); EXPECT_EQ(rb.getHeader().template readMarker(), 0); EXPECT_EQ(usedSize(rb), 0); ssize_t ret = p.write(&ch, sizeof(ch)); EXPECT_EQ(ret, sizeof(ch)); EXPECT_EQ(rb.getHeader().template readMarker(), 1); EXPECT_EQ(rb.getHeader().template readMarker(), 0); EXPECT_EQ(usedSize(rb), 1); // Next 8 bytes won't fit. ret = p.write(&n, sizeof(n)); EXPECT_EQ(ret, -ENODATA) << "Needs an extra byte to write the 8 bytes element. " "Capacity 8, used 1."; // Remove the one byte in, now head is one off. uint8_t cr; uint64_t nr; ret = c.read(&cr, sizeof(cr)); EXPECT_EQ(ret, sizeof(cr)); EXPECT_EQ(cr, ch); EXPECT_EQ(rb.getHeader().template readMarker(), 1); EXPECT_EQ(rb.getHeader().template readMarker(), 1); // Next 8 bytes will fit, but wrap. ret = p.write(&n, sizeof(n)); EXPECT_EQ(ret, sizeof(n)); EXPECT_EQ(rb.getHeader().template readMarker(), 9); EXPECT_EQ(rb.getHeader().template readMarker(), 1); ret = c.read(&nr, sizeof(nr)); EXPECT_EQ(ret, sizeof(nr)); EXPECT_EQ(nr, n); EXPECT_EQ(rb.getHeader().template readMarker(), 9); EXPECT_EQ(rb.getHeader().template readMarker(), 9); } TEST(RingBuffer, ReadTxWrappingOneCons) { // 8 bytes buffer. 
size_t size = 1u << 3; RingBufferStorage storage(size); RingBuffer rb = storage.getRb(); // Make a producer. Producer p{rb}; // Make a consumer. Consumer c1{rb}; EXPECT_EQ(usedSize(rb), 0); uint8_t ch = 0xA7; uint64_t n = 0xFFFFFFFFFFFFFFFF; // Put one byte. { EXPECT_EQ(rb.getHeader().template readMarker(), 0); EXPECT_EQ(rb.getHeader().template readMarker(), 0); EXPECT_EQ(usedSize(rb), 0); ssize_t ret = p.write(&ch, sizeof(ch)); EXPECT_EQ(ret, sizeof(ch)); EXPECT_EQ(rb.getHeader().template readMarker(), 1); EXPECT_EQ(rb.getHeader().template readMarker(), 0); EXPECT_EQ(usedSize(rb), 1); } // Next 8 bytes won't fit. { ssize_t ret = p.write(&n, sizeof(n)); EXPECT_EQ(ret, -ENODATA) << "Needs an extra byte to write the 8 bytes element. " "Capacity 8, used 1."; } // Remove the one byte in, now head is one off. EXPECT_FALSE(c1.inTx()); { // Start c1 read Tx ssize_t ret; ret = c1.startTx(); EXPECT_EQ(ret, 0); uint8_t rch; ret = c1.readInTx(&rch, sizeof(rch)); EXPECT_EQ(ret, sizeof(uint8_t)); EXPECT_EQ(rch, ch); EXPECT_EQ(rb.getHeader().template readMarker(), 1); EXPECT_EQ(rb.getHeader().template readMarker(), 0); EXPECT_TRUE(c1.inTx()); } { // Complete c1's Tx. ssize_t ret = c1.commitTx(); EXPECT_EQ(ret, 0); EXPECT_EQ(rb.getHeader().template readMarker(), 1); EXPECT_EQ(rb.getHeader().template readMarker(), 1); } { // Retrying to commit should fail. ssize_t ret = c1.commitTx(); EXPECT_EQ(ret, -EINVAL); } { // Next 8 bytes will fit, but wrap. ssize_t ret = p.write(&n, sizeof(n)); EXPECT_EQ(ret, sizeof(n)); EXPECT_EQ(rb.getHeader().template readMarker(), 9); EXPECT_EQ(rb.getHeader().template readMarker(), 1); } { // Start c1 read Tx again. ssize_t ret; ret = c1.startTx(); EXPECT_EQ(ret, 0); uint64_t rn; ret = c1.readInTx(&rn, sizeof(rn)); EXPECT_EQ(ret, sizeof(uint64_t)); EXPECT_EQ(rn, n); EXPECT_EQ(rb.getHeader().template readMarker(), 9); EXPECT_EQ(rb.getHeader().template readMarker(), 1); EXPECT_TRUE(c1.inTx()); } { // Complete c1. 
ssize_t ret = c1.commitTx(); EXPECT_EQ(ret, 0); ret = c1.commitTx(); EXPECT_EQ(ret, -EINVAL); } { // Next 8 bytes will fit, but wrap. ssize_t ret = p.write(&n, sizeof(n)); EXPECT_EQ(ret, sizeof(n)); EXPECT_EQ(rb.getHeader().template readMarker(), 17); EXPECT_EQ(rb.getHeader().template readMarker(), 9); } { ssize_t ret; ret = c1.startTx(); EXPECT_EQ(ret, 0); uint64_t rn; ret = c1.readInTx(&rn, sizeof(rn)); EXPECT_EQ(ret, sizeof(uint64_t)); EXPECT_EQ(rn, n); EXPECT_EQ(rb.getHeader().template readMarker(), 17); EXPECT_EQ(rb.getHeader().template readMarker(), 9); } { // Cancel tx, data should be readable again. ssize_t ret = c1.cancelTx(); EXPECT_EQ(ret, 0); } { // Now c1 can read. ssize_t ret; ret = c1.startTx(); EXPECT_EQ(ret, 0); uint64_t rn; ret = c1.readInTx(&rn, sizeof(rn)); EXPECT_EQ(ret, sizeof(uint64_t)); EXPECT_EQ(rn, n); EXPECT_EQ(rb.getHeader().template readMarker(), 17); EXPECT_EQ(rb.getHeader().template readMarker(), 9); } { // Commit succeds. ssize_t ret = c1.commitTx(); EXPECT_EQ(ret, 0); EXPECT_FALSE(c1.inTx()); } } TEST(RingBuffer, ReadTxWrapping) { // 8 bytes buffer. size_t size = 1u << 3; RingBufferStorage storage(size); RingBuffer rb = storage.getRb(); // Make a producer. Producer p{rb}; // Make consumers. Consumer c1{rb}; Consumer c2{rb}; EXPECT_EQ(usedSize(rb), 0); uint8_t ch = 0xA7; uint64_t n = 0x3333333333333333; // Put one byte. { EXPECT_EQ(rb.getHeader().template readMarker(), 0); EXPECT_EQ(rb.getHeader().template readMarker(), 0); EXPECT_EQ(usedSize(rb), 0); ssize_t ret = p.write(&ch, sizeof(ch)); EXPECT_EQ(ret, sizeof(ch)); EXPECT_EQ(rb.getHeader().template readMarker(), 1); EXPECT_EQ(rb.getHeader().template readMarker(), 0); EXPECT_EQ(usedSize(rb), 1); } // Next 8 bytes won't fit. { ssize_t ret = p.write(&n, sizeof(n)); EXPECT_EQ(ret, -ENODATA) << "Needs an extra byte to write the 8 bytes element. " "Capacity 8, used 1."; } // Remove the one byte in, now head is one off. 
EXPECT_FALSE(c1.inTx()); EXPECT_FALSE(c2.inTx()); { // Start c1 read Tx ssize_t ret; ret = c1.startTx(); EXPECT_EQ(ret, 0); uint8_t rch; ret = c1.readInTx(&rch, sizeof(rch)); EXPECT_EQ(ret, sizeof(uint8_t)); EXPECT_EQ(rch, ch); EXPECT_EQ(rb.getHeader().template readMarker(), 1); EXPECT_EQ(rb.getHeader().template readMarker(), 0); EXPECT_TRUE(c1.inTx()); } { // Complete c1's Tx. ssize_t ret = c1.commitTx(); EXPECT_EQ(ret, 0); EXPECT_EQ(rb.getHeader().template readMarker(), 1); EXPECT_EQ(rb.getHeader().template readMarker(), 1); } { // Retrying to commit should fail. ssize_t ret = c1.commitTx(); EXPECT_EQ(ret, -EINVAL); } { // Next 8 bytes will fit, but wrap. ssize_t ret = p.write(&n, sizeof(n)); EXPECT_EQ(ret, sizeof(n)); EXPECT_EQ(rb.getHeader().template readMarker(), 9); EXPECT_EQ(rb.getHeader().template readMarker(), 1); } { // Start c1 read Tx again. ssize_t ret; ret = c1.startTx(); EXPECT_EQ(ret, 0); uint64_t rn; ret = c1.readInTx(&rn, sizeof(rn)); EXPECT_EQ(ret, sizeof(uint64_t)); EXPECT_EQ(rn, n); EXPECT_EQ(rb.getHeader().template readMarker(), 9); EXPECT_EQ(rb.getHeader().template readMarker(), 1); EXPECT_TRUE(c1.inTx()); } { // Try to start read tx before c1 completing and get -EAGAIN. ssize_t ret; ret = c2.startTx(); EXPECT_EQ(ret, -EAGAIN); } { // Complete c1. ssize_t ret = c1.commitTx(); EXPECT_EQ(ret, 0); ret = c1.commitTx(); EXPECT_EQ(ret, -EINVAL); } { // Next 8 bytes will fit, but wrap. ssize_t ret = p.write(&n, sizeof(n)); EXPECT_EQ(ret, sizeof(n)); EXPECT_EQ(rb.getHeader().template readMarker(), 17); EXPECT_EQ(rb.getHeader().template readMarker(), 9); } { ssize_t ret; ret = c2.startTx(); EXPECT_EQ(ret, 0); uint64_t rn; ret = c2.readInTx(&rn, sizeof(rn)); EXPECT_EQ(ret, sizeof(uint64_t)); EXPECT_EQ(rn, n); EXPECT_EQ(rb.getHeader().template readMarker(), 17); EXPECT_EQ(rb.getHeader().template readMarker(), 9); } { // Cancel tx, data should be readable again. ssize_t ret = c2.cancelTx(); EXPECT_EQ(ret, 0); } { // Now c1 can read. 
ssize_t ret; ret = c1.startTx(); EXPECT_EQ(ret, 0); uint64_t rn; ret = c1.readInTx(&rn, sizeof(rn)); EXPECT_EQ(ret, sizeof(uint64_t)); EXPECT_EQ(rn, n); EXPECT_EQ(rb.getHeader().template readMarker(), 17); EXPECT_EQ(rb.getHeader().template readMarker(), 9); } { // Commit succeds. ssize_t ret = c1.commitTx(); EXPECT_EQ(ret, 0); EXPECT_FALSE(c1.inTx()); EXPECT_FALSE(c2.inTx()); } } TEST(RingBuffer, accessContiguousInTx) { // 256 bytes buffer. size_t size = 1u << 8u; RingBufferStorage storage(size); RingBuffer rb = storage.getRb(); // Make a producer. Producer p{rb}; // Make a consumer. Consumer c{rb}; EXPECT_EQ(usedSize(rb), 0); // Use different values for the three writing passes to tell them apart. uint16_t value1 = 0xACAC; // fits 128 times uint16_t value2 = 0xDCDC; // fits 128 times uint16_t value3 = 0xEFEF; // fits 128 times { for (int i = 0; i < 128; ++i) { ssize_t ret = p.write(&value1, sizeof(value1)); EXPECT_EQ(ret, sizeof(value1)); } // It must be full by now. EXPECT_EQ(usedSize(rb), 256); uint8_t b = 0xEE; ssize_t ret = p.write(&b, sizeof(b)); EXPECT_EQ(ret, -ENODATA); } { // Read a 128-byte buffer that is left-aligned with the start. ssize_t ret; ret = c.startTx(); EXPECT_EQ(ret, 0); std::array buffers; std::tie(ret, buffers) = c.accessContiguousInTx(128); EXPECT_EQ(ret, 1); EXPECT_EQ(buffers[0].len, 128); for (int i = 0; i < 128; ++i) { EXPECT_EQ(buffers[0].ptr[i], 0xAC); } ret = c.commitTx(); EXPECT_EQ(ret, 0); EXPECT_EQ(usedSize(rb), 128); } { for (int i = 0; i < 64; ++i) { ssize_t ret = p.write(&value2, sizeof(value2)); EXPECT_EQ(ret, sizeof(value2)); } // It must be full again by now. EXPECT_EQ(usedSize(rb), 256); } { // Read a 256-byte buffer that wraps around halfway through. 
ssize_t ret; ret = c.startTx(); EXPECT_EQ(ret, 0); std::array buffers; std::tie(ret, buffers) = c.accessContiguousInTx(256); EXPECT_EQ(ret, 2); EXPECT_EQ(buffers[0].len, 128); for (int i = 0; i < 128; ++i) { EXPECT_EQ(buffers[0].ptr[i], 0xAC); } EXPECT_EQ(buffers[1].len, 128); for (int i = 0; i < 128; ++i) { EXPECT_EQ(buffers[1].ptr[i], 0xDC); } ret = c.commitTx(); EXPECT_EQ(ret, 0); EXPECT_EQ(usedSize(rb), 0); } { for (int i = 0; i < 64; ++i) { ssize_t ret = p.write(&value2, sizeof(value2)); EXPECT_EQ(ret, sizeof(value2)); } for (int i = 0; i < 64; ++i) { ssize_t ret = p.write(&value3, sizeof(value3)); EXPECT_EQ(ret, sizeof(value3)); } // It must be full again by now. EXPECT_EQ(usedSize(rb), 256); } { // Read a 128-byte buffer that is right-aligned with the end. ssize_t ret; ret = c.startTx(); EXPECT_EQ(ret, 0); std::array buffers; std::tie(ret, buffers) = c.accessContiguousInTx(128); EXPECT_EQ(ret, 1); EXPECT_EQ(buffers[0].len, 128); for (int i = 0; i < 128; ++i) { EXPECT_EQ(buffers[0].ptr[i], 0xDC); } ret = c.commitTx(); EXPECT_EQ(ret, 0); EXPECT_EQ(usedSize(rb), 128); } { for (int i = 0; i < 64; ++i) { ssize_t ret = p.write(&value3, sizeof(value3)); EXPECT_EQ(ret, sizeof(value3)); } // It must be full again by now. EXPECT_EQ(usedSize(rb), 256); } { // Reading the whole 256 bytes. ssize_t ret; ret = c.startTx(); EXPECT_EQ(ret, 0); std::array buffers; std::tie(ret, buffers) = c.accessContiguousInTx(256); EXPECT_EQ(ret, 1); EXPECT_EQ(buffers[0].len, 256); for (int i = 0; i < 256; ++i) { EXPECT_EQ(buffers[0].ptr[i], 0xEF); } ret = c.commitTx(); EXPECT_EQ(ret, 0); EXPECT_EQ(usedSize(rb), 0); } { // Attempt reading from empty buffer. 
ssize_t ret; ret = c.startTx(); EXPECT_EQ(ret, 0); std::array buffers; std::tie(ret, buffers) = c.accessContiguousInTx(200); EXPECT_EQ(ret, 0); ret = c.commitTx(); EXPECT_EQ(ret, 0); EXPECT_EQ(usedSize(rb), 0); } } ================================================ FILE: tensorpipe/test/common/shm_ringbuffer_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include #include #include #include #include #include using namespace tensorpipe; constexpr static int kNumRingbufferRoles = 2; using Consumer = RingBufferRole; using Producer = RingBufferRole; // Same process produces and consumes share memory through different mappings. TEST(ShmRingBuffer, SameProducerConsumer) { Fd headerFd; Fd dataFd; { // Producer part. // Buffer large enough to fit all data and persistent // (needs to be unlinked up manually). Error error; ShmSegment headerSegment; ShmSegment dataSegment; RingBuffer rb; std::tie(error, headerSegment, dataSegment, rb) = createShmRingBuffer(256 * 1024); Producer prod{rb}; // Producer loop. It all fits in buffer. int i = 0; while (i < 2000) { ssize_t ret = prod.write(&i, sizeof(i)); EXPECT_EQ(ret, sizeof(i)); ++i; } // Duplicate the file descriptors so that the shared memory remains alive // when the original fds are closed by the segments' destructors. headerFd = Fd(::dup(headerSegment.getFd())); dataFd = Fd(::dup(dataSegment.getFd())); } { // Consumer part. // Map file again (to a different address) and consume it. 
Error error; ShmSegment headerSegment; ShmSegment dataSegment; RingBuffer rb; std::tie(error, headerSegment, dataSegment, rb) = loadShmRingBuffer( std::move(headerFd), std::move(dataFd)); Consumer cons{rb}; int i = 0; while (i < 2000) { int value; ssize_t ret = cons.read(&value, sizeof(value)); EXPECT_EQ(ret, sizeof(value)); EXPECT_EQ(value, i); ++i; } } }; TEST(ShmRingBuffer, SingleProducer_SingleConsumer) { int sockFds[2]; { int rv = socketpair(AF_UNIX, SOCK_STREAM, 0, sockFds); if (rv != 0) { TP_THROW_SYSTEM(errno) << "Failed to create socket pair"; } } int eventFd = eventfd(0, 0); if (eventFd < 0) { TP_THROW_SYSTEM(errno) << "Failed to create event fd"; } int pid = fork(); if (pid < 0) { TP_THROW_SYSTEM(errno) << "Failed to fork"; } if (pid == 0) { // child, the producer // Make a scope so segments are destroyed even on exit(0). { Error error; ShmSegment headerSegment; ShmSegment dataSegment; RingBuffer rb; std::tie(error, headerSegment, dataSegment, rb) = createShmRingBuffer(1024); Producer prod{rb}; { auto err = sendFdsToSocket( sockFds[0], headerSegment.getFd(), dataSegment.getFd()); if (err) { TP_THROW_ASSERT() << err.what(); } } int i = 0; while (i < 2000) { ssize_t ret = prod.write(&i, sizeof(i)); if (ret == -ENODATA) { std::this_thread::yield(); continue; } EXPECT_EQ(ret, sizeof(i)); ++i; } // Because of buffer size smaller than amount of data written, // producer cannot have completed the loop before consumer // started consuming the data. { uint64_t c; ::read(eventFd, &c, sizeof(uint64_t)); } } // Child exits. Careful when calling exit() directly, because // it does not call destructors. We ensured shared_ptrs were // destroyed before by calling exit(0). exit(0); } // parent, the consumer // Wait for other process to create buffer. 
Fd headerFd; Fd dataFd; { auto err = recvFdsFromSocket(sockFds[1], headerFd, dataFd); if (err) { TP_THROW_ASSERT() << err.what(); } } Error error; ShmSegment headerSegment; ShmSegment dataSegment; RingBuffer rb; std::tie(error, headerSegment, dataSegment, rb) = loadShmRingBuffer( std::move(headerFd), std::move(dataFd)); Consumer cons{rb}; int i = 0; while (i < 2000) { int value; ssize_t ret = cons.read(&value, sizeof(value)); if (ret == -ENODATA) { std::this_thread::yield(); continue; } EXPECT_EQ(ret, sizeof(value)); EXPECT_EQ(value, i); ++i; } { uint64_t c = 1; ::write(eventFd, &c, sizeof(uint64_t)); } ::close(eventFd); ::close(sockFds[0]); ::close(sockFds[1]); // Wait for child to make gtest happy. ::wait(nullptr); }; ================================================ FILE: tensorpipe/test/common/shm_segment_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include #include using namespace tensorpipe; // Same process produces and consumes share memory through different mappings. TEST(ShmSegment, SameProducerConsumer_Scalar) { // Set affinity of producer to CPU zero so that consumer only has to read from // that one CPU's buffer. cpu_set_t cpuset; CPU_ZERO(&cpuset); CPU_SET(0, &cpuset); sched_setaffinity(0, sizeof(cpu_set_t), &cpuset); // This must stay alive for the file descriptor to remain open. Fd fd; { // Producer part. Error error; ShmSegment segment; int* myIntPtr; std::tie(error, segment, myIntPtr) = ShmSegment::create(); ASSERT_FALSE(error) << error.what(); int& myInt = *myIntPtr; myInt = 1000; // Duplicate the file descriptor so that the shared memory remains alive // when the original fd is closed by the segment's destructor. fd = Fd(::dup(segment.getFd())); } { // Consumer part. 
// Map file again (to a different address) and consume it. Error error; ShmSegment segment; int* myIntPtr; std::tie(error, segment, myIntPtr) = ShmSegment::load(std::move(fd)); ASSERT_FALSE(error) << error.what(); EXPECT_EQ(segment.getSize(), sizeof(int)); EXPECT_EQ(*myIntPtr, 1000); } }; TEST(ShmSegment, SingleProducer_SingleConsumer_Array) { size_t numFloats = 330000; int sockFds[2]; { int rv = socketpair(AF_UNIX, SOCK_STREAM, 0, sockFds); if (rv != 0) { TP_THROW_SYSTEM(errno) << "Failed to create socket pair"; } } int eventFd = eventfd(0, 0); if (eventFd < 0) { TP_THROW_SYSTEM(errno) << "Failed to create event fd"; } int pid = fork(); if (pid < 0) { TP_THROW_SYSTEM(errno) << "Failed to fork"; } if (pid == 0) { // child, the producer // Make a scope so shared_ptr's are released even on exit(0). { // use huge pages in creation and not in loading. This should only affects // TLB overhead. Error error; ShmSegment segment; float* myFloats; std::tie(error, segment, myFloats) = ShmSegment::create(numFloats); ASSERT_FALSE(error) << error.what(); for (int i = 0; i < numFloats; ++i) { myFloats[i] = i; } { auto err = sendFdsToSocket(sockFds[0], segment.getFd()); if (err) { TP_THROW_ASSERT() << err.what(); } } { uint64_t c; ::read(eventFd, &c, sizeof(uint64_t)); } } // Child exits. Careful when calling exit() directly, because // it does not call destructors. We ensured shared_ptrs were // destroyed before by calling exit(0). 
exit(0); } // parent, the consumer Fd segmentFd; { auto err = recvFdsFromSocket(sockFds[1], segmentFd); if (err) { TP_THROW_ASSERT() << err.what(); } } Error error; ShmSegment segment; float* myFloats; std::tie(error, segment, myFloats) = ShmSegment::load(std::move(segmentFd)); ASSERT_FALSE(error) << error.what(); EXPECT_EQ(numFloats * sizeof(float), segment.getSize()); for (int i = 0; i < numFloats; ++i) { EXPECT_EQ(myFloats[i], i); } { uint64_t c = 1; ::write(eventFd, &c, sizeof(uint64_t)); } ::close(eventFd); ::close(sockFds[0]); ::close(sockFds[1]); // Wait for child to make gtest happy. ::wait(nullptr); }; ================================================ FILE: tensorpipe/test/common/system_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include using namespace tensorpipe; TEST(Pow2, isPow2) { for (uint64_t i = 0; i < 63; ++i) { EXPECT_TRUE(isPow2(1ull << i)); } EXPECT_FALSE(isPow2(3)); EXPECT_FALSE(isPow2(5)); EXPECT_FALSE(isPow2(10)); EXPECT_FALSE(isPow2(15)); EXPECT_TRUE(isPow2(16)); EXPECT_FALSE(isPow2(17)); EXPECT_FALSE(isPow2(18)); EXPECT_FALSE(isPow2(25)); EXPECT_FALSE(isPow2(1028)); } TEST(Pow2, nextPow2) { for (uint64_t i = 0; i < 63; ++i) { uint64_t p2 = 1ull << i; uint64_t nextP2 = 1ull << (i + 1); EXPECT_EQ(nextPow2(p2), p2); EXPECT_EQ(nextPow2(p2 + 1), nextP2); } } ================================================ FILE: tensorpipe/test/core/context_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #include #include #include #include #include #if TP_USE_CUDA #include #include #endif // TP_USE_CUDA using namespace tensorpipe; namespace { ::testing::AssertionResult buffersAreEqual( const void* ptr1, const size_t len1, const void* ptr2, const size_t len2) { if (ptr1 == nullptr && ptr2 == nullptr) { if (len1 == 0 && len2 == 0) { return ::testing::AssertionSuccess(); } if (len1 != 0) { return ::testing::AssertionFailure() << "first pointer is null but length isn't 0"; } if (len1 != 0) { return ::testing::AssertionFailure() << "second pointer is null but length isn't 0"; } } if (ptr1 == nullptr) { return ::testing::AssertionFailure() << "first pointer is null but second one isn't"; } if (ptr2 == nullptr) { return ::testing::AssertionFailure() << "second pointer is null but first one isn't"; } if (len1 != len2) { return ::testing::AssertionFailure() << "first length is " << len1 << " but second one is " << len2; } if (std::memcmp(ptr1, ptr2, len1) != 0) { return ::testing::AssertionFailure() << "buffer contents aren't equal"; } return ::testing::AssertionSuccess(); } #if TP_USE_CUDA std::vector unwrapCudaBuffer(CudaBuffer b, size_t length) { std::vector result(length); TP_CUDA_CHECK(cudaStreamSynchronize(b.stream)); TP_CUDA_CHECK(cudaMemcpy(result.data(), b.ptr, length, cudaMemcpyDefault)); return result; } #endif // TP_USE_CUDA ::testing::AssertionResult descriptorAndAllocationMatchMessage( const Descriptor& descriptor, const Allocation& allocation, const Message& message) { EXPECT_EQ(descriptor.payloads.size(), allocation.payloads.size()); if (descriptor.payloads.size() != message.payloads.size()) { return ::testing::AssertionFailure() << "descriptor has " << descriptor.payloads.size() << " payloads but message has " << message.payloads.size(); } for (size_t idx = 0; idx < descriptor.payloads.size(); idx++) { EXPECT_TRUE(buffersAreEqual( allocation.payloads[idx].data, descriptor.payloads[idx].length, message.payloads[idx].data, 
message.payloads[idx].length)); } EXPECT_EQ(descriptor.tensors.size(), allocation.tensors.size()); if (descriptor.tensors.size() != message.tensors.size()) { return ::testing::AssertionFailure() << "descriptor has " << descriptor.tensors.size() << " tensors but message has " << message.tensors.size(); } for (size_t idx = 0; idx < descriptor.tensors.size(); idx++) { EXPECT_EQ( allocation.tensors[idx].buffer.device(), message.tensors[idx].buffer.device()); const std::string& deviceType = allocation.tensors[idx].buffer.device().type; if (deviceType == kCpuDeviceType) { EXPECT_TRUE(buffersAreEqual( allocation.tensors[idx].buffer.unwrap().ptr, descriptor.tensors[idx].length, message.tensors[idx].buffer.unwrap().ptr, message.tensors[idx].length)); #if TP_USE_CUDA } else if (deviceType == kCudaDeviceType) { std::vector buffer1 = unwrapCudaBuffer( allocation.tensors[idx].buffer.unwrap(), descriptor.tensors[idx].length); std::vector buffer2 = unwrapCudaBuffer( message.tensors[idx].buffer.unwrap(), message.tensors[idx].length); EXPECT_TRUE(buffersAreEqual( buffer1.data(), buffer1.size(), buffer2.data(), buffer2.size())); #endif // TP_USE_CUDA } else { ADD_FAILURE() << "Unexpected device type: " << deviceType; } } return ::testing::AssertionSuccess(); } #if TP_USE_CUDA struct CudaPointerDeleter { void operator()(void* ptr) { TP_CUDA_CHECK(cudaFree(ptr)); } }; std::unique_ptr makeCudaPointer(size_t length) { void* cudaPtr; TP_CUDA_CHECK(cudaMalloc(&cudaPtr, length)); return std::unique_ptr(cudaPtr); } #endif // TP_USE_CUDA // Having 4 payloads per message is arbitrary. constexpr int kNumPayloads = 4; // Having 4 tensors per message ensures there are 2 CPU tensors and 2 CUDA // tensors. 
constexpr int kNumTensors = 4;

// Inline data used to fill every payload and every CPU tensor.
std::string kPayloadData = "I'm a payload";
std::string kTensorData = "And I'm a tensor";

#if TP_USE_CUDA
const int kCudaTensorLength = 32;
const uint8_t kCudaTensorFillValue = 0x42;
#endif // TP_USE_CUDA

// Builds one tensor of a test message. With CUDA enabled, odd indices get a
// CUDA tensor backed by a shared device buffer filled with
// kCudaTensorFillValue; even indices — and all tensors in CPU-only builds —
// point at kTensorData.
// NOTE(review): several template argument lists in this block were stripped
// by the text extraction (e.g. on std::unique_ptr, reinterpret_cast,
// std::vector); tokens are preserved exactly as found — restore from
// upstream.
Message::Tensor makeTensor(int index) {
#if TP_USE_CUDA
  // One device buffer, created on first use and shared by all CUDA tensors.
  static std::unique_ptr kCudaTensorData = []() {
    auto cudaPtr = makeCudaPointer(kCudaTensorLength);
    TP_CUDA_CHECK(
        cudaMemset(cudaPtr.get(), kCudaTensorFillValue, kCudaTensorLength));
    return cudaPtr;
  }();
  if (index % 2 == 1) {
    return {
        .buffer =
            CudaBuffer{
                .ptr = kCudaTensorData.get(),
                .stream = cudaStreamDefault,
            }, // FIXME: Use non-blocking stream.
        .length = kCudaTensorLength,
    };
  }
#endif // TP_USE_CUDA
  return {
      .buffer =
          CpuBuffer{
              .ptr = reinterpret_cast(
                  const_cast(kTensorData.data())),
          },
      .length = kTensorData.length(),
  };
}

// Builds a message with the given number of payloads (all kPayloadData) and
// tensors (see makeTensor for the CPU/CUDA split).
Message makeMessage(int numPayloads, int numTensors) {
  Message message;
  for (int i = 0; i < numPayloads; i++) {
    Message::Payload payload;
    payload.data =
        reinterpret_cast(const_cast(kPayloadData.data()));
    payload.length = kPayloadData.length();
    message.payloads.push_back(std::move(payload));
  }
  for (int i = 0; i < numTensors; i++) {
    message.tensors.push_back(makeTensor(i));
  }
  return message;
}

// Allocates destination buffers (host or device, per each tensor's
// sourceDevice) for an incoming descriptor. Ownership of the allocations is
// parked in `buffers` so they outlive the read; the returned Allocation only
// holds raw pointers into them.
Allocation allocateForDescriptor(
    const Descriptor& descriptor,
    std::vector>& buffers) {
  Allocation allocation;
  for (const auto& payload : descriptor.payloads) {
    // FIXME: Changing this to a make_shared causes havoc.
    auto payloadData = std::unique_ptr>(
        new uint8_t[payload.length]);
    allocation.payloads.push_back({.data = payloadData.get()});
    buffers.push_back(std::move(payloadData));
  }
  for (const auto& tensor : descriptor.tensors) {
    if (tensor.sourceDevice.type == kCpuDeviceType) {
      auto tensorData = std::unique_ptr>(
          new uint8_t[tensor.length]);
      allocation.tensors.push_back({
          .buffer = CpuBuffer{.ptr = tensorData.get()},
      });
      buffers.push_back(std::move(tensorData));
#if TP_USE_CUDA
    } else if (tensor.sourceDevice.type == kCudaDeviceType) {
      auto tensorData = makeCudaPointer(tensor.length);
      allocation.tensors.push_back({
          .buffer =
              CudaBuffer{
                  .ptr = tensorData.get(),
                  // FIXME: Use non-blocking streams.
                  .stream = cudaStreamDefault,
              },
      });
      buffers.push_back(std::move(tensorData));
#endif // TP_USE_CUDA
    } else {
      ADD_FAILURE() << "Unrecognized device type: " << tensor.sourceDevice.type;
    }
  }
  return allocation;
}

// Reassembles a Message (metadata and lengths from the descriptor, data
// pointers from the allocation) so a received message can be written back
// verbatim.
Message messageFromAllocation(
    const Descriptor& descriptor,
    const Allocation& allocation) {
  Message message;
  message.metadata = descriptor.metadata;
  for (int payloadIdx = 0; payloadIdx < descriptor.payloads.size();
       ++payloadIdx) {
    message.payloads.emplace_back();
    Message::Payload& payload = message.payloads.back();
    payload.metadata = descriptor.payloads[payloadIdx].metadata;
    payload.length = descriptor.payloads[payloadIdx].length;
    payload.data = allocation.payloads[payloadIdx].data;
  }
  for (int tensorIdx = 0; tensorIdx < descriptor.tensors.size(); ++tensorIdx) {
    message.tensors.emplace_back();
    Message::Tensor& tensor = message.tensors.back();
    tensor.metadata = descriptor.tensors[tensorIdx].metadata;
    tensor.length = descriptor.tensors[tensorIdx].length;
    tensor.buffer = allocation.tensors[tensorIdx].buffer;
  }
  return message;
}

// Listen URLs to exercise: shm (when built in) plus loopback TCP via libuv.
std::vector genUrls() {
  std::vector res;
#if TENSORPIPE_HAS_SHM_TRANSPORT
  res.push_back("shm://");
#endif // TENSORPIPE_HAS_SHM_TRANSPORT
  res.push_back("uv://127.0.0.1");
  return res;
}

// Creates a Context with every transport/channel available in this build
// registered (the numeric argument is the registration priority).
std::shared_ptr makeContext() {
  auto context = std::make_shared();
context->registerTransport(0, "uv", transport::uv::create()); #if TENSORPIPE_HAS_SHM_TRANSPORT context->registerTransport(1, "shm", transport::shm::create()); #endif // TENSORPIPE_HAS_SHM_TRANSPORT context->registerChannel(0, "basic", channel::basic::create()); #if TENSORPIPE_HAS_CMA_CHANNEL context->registerChannel(1, "cma", channel::cma::create()); #endif // TENSORPIPE_HAS_CMA_CHANNEL #if TP_USE_CUDA context->registerChannel( 10, "cuda_basic", channel::cuda_basic::create(channel::basic::create())); #if TENSORPIPE_HAS_CUDA_IPC_CHANNEL context->registerChannel(11, "cuda_ipc", channel::cuda_ipc::create()); #endif // TENSORPIPE_HAS_CUDA_IPC_CHANNEL context->registerChannel(12, "cuda_xth", channel::cuda_xth::create()); #endif // TP_USE_CUDA return context; } } // namespace TEST(Context, ClientPingSerial) { ForkedThreadPeerGroup pg; pg.spawn( [&]() { std::vector> buffers; std::promise> serverPipePromise; std::promise readDescriptorPromise; std::promise readMessagePromise; auto context = makeContext(); auto listener = context->listen(genUrls()); pg.send(PeerGroup::kClient, listener->url("uv")); listener->accept([&](const Error& error, std::shared_ptr pipe) { if (error) { serverPipePromise.set_exception( std::make_exception_ptr(std::runtime_error(error.what()))); } else { serverPipePromise.set_value(std::move(pipe)); } }); std::shared_ptr serverPipe = serverPipePromise.get_future().get(); serverPipe->readDescriptor( [&readDescriptorPromise]( const Error& error, Descriptor descriptor) { if (error) { readDescriptorPromise.set_exception( std::make_exception_ptr(std::runtime_error(error.what()))); } else { readDescriptorPromise.set_value(std::move(descriptor)); } }); Descriptor descriptor = readDescriptorPromise.get_future().get(); Allocation allocation = allocateForDescriptor(descriptor, buffers); serverPipe->read(allocation, [&readMessagePromise](const Error& error) { if (error) { readMessagePromise.set_exception( std::make_exception_ptr(std::runtime_error(error.what()))); 
} else { readMessagePromise.set_value(); } }); readMessagePromise.get_future().get(); EXPECT_TRUE(descriptorAndAllocationMatchMessage( descriptor, allocation, makeMessage(kNumPayloads, kNumTensors))); pg.done(PeerGroup::kServer); pg.join(PeerGroup::kServer); context->join(); }, [&]() { std::promise writtenMessagePromise; auto context = makeContext(); auto url = pg.recv(PeerGroup::kClient); auto clientPipe = context->connect(url); clientPipe->write( makeMessage(kNumPayloads, kNumTensors), [&writtenMessagePromise](const Error& error) { if (error) { writtenMessagePromise.set_exception( std::make_exception_ptr(std::runtime_error(error.what()))); } else { writtenMessagePromise.set_value(); } }); writtenMessagePromise.get_future().get(); pg.done(PeerGroup::kClient); pg.join(PeerGroup::kClient); context->join(); }); } TEST(Context, ClientPingInline) { ForkedThreadPeerGroup pg; pg.spawn( [&]() { std::vector> buffers; std::promise> serverPipePromise; std::promise readCompletedProm; auto context = makeContext(); auto listener = context->listen(genUrls()); pg.send(PeerGroup::kClient, listener->url("uv")); listener->accept([&](const Error& error, std::shared_ptr pipe) { if (error) { serverPipePromise.set_exception( std::make_exception_ptr(std::runtime_error(error.what()))); } else { serverPipePromise.set_value(std::move(pipe)); } }); std::shared_ptr serverPipe = serverPipePromise.get_future().get(); serverPipe->readDescriptor([&serverPipe, &readCompletedProm, &buffers]( const Error& error, Descriptor descriptor) { if (error) { ADD_FAILURE() << error.what(); readCompletedProm.set_value(); return; } Allocation allocation = allocateForDescriptor(descriptor, buffers); serverPipe->read( allocation, [&readCompletedProm, descriptor{std::move(descriptor)}, allocation](const Error& error) { if (error) { readCompletedProm.set_exception(std::make_exception_ptr( std::runtime_error(error.what()))); } else { EXPECT_TRUE(descriptorAndAllocationMatchMessage( descriptor, allocation, 
makeMessage(kNumPayloads, kNumTensors))); readCompletedProm.set_value(); } }); }); readCompletedProm.get_future().get(); pg.done(PeerGroup::kServer); pg.join(PeerGroup::kServer); context->join(); }, [&]() { std::promise writeCompletedProm; auto context = makeContext(); auto url = pg.recv(PeerGroup::kClient); auto clientPipe = context->connect(url); clientPipe->write( makeMessage(kNumPayloads, kNumTensors), [&writeCompletedProm](const Error& error) { if (error) { writeCompletedProm.set_exception( std::make_exception_ptr(std::runtime_error(error.what()))); } else { writeCompletedProm.set_value(); } }); writeCompletedProm.get_future().get(); pg.done(PeerGroup::kClient); pg.join(PeerGroup::kClient); context->join(); }); } TEST(Context, ServerPingPongTwice) { ForkedThreadPeerGroup pg; pg.spawn( [&]() { std::vector> buffers; std::promise> serverPipePromise; std::promise pingCompletedProm; auto context = makeContext(); auto listener = context->listen(genUrls()); pg.send(PeerGroup::kClient, listener->url("uv")); listener->accept([&](const Error& error, std::shared_ptr pipe) { if (error) { serverPipePromise.set_exception( std::make_exception_ptr(std::runtime_error(error.what()))); } else { serverPipePromise.set_value(std::move(pipe)); } }); std::shared_ptr serverPipe = serverPipePromise.get_future().get(); int numPingsGoneThrough = 0; for (int i = 0; i < 2; i++) { serverPipe->write( makeMessage(kNumPayloads, kNumTensors), [&serverPipe, &pingCompletedProm, &buffers, &numPingsGoneThrough, i](const Error& error) { if (error) { ADD_FAILURE() << error.what(); pingCompletedProm.set_value(); return; } serverPipe->readDescriptor( [&serverPipe, &pingCompletedProm, &buffers, &numPingsGoneThrough, i](const Error& error, Descriptor descriptor) { if (error) { ADD_FAILURE() << error.what(); pingCompletedProm.set_value(); return; } Allocation allocation = allocateForDescriptor(descriptor, buffers); serverPipe->read( allocation, [&pingCompletedProm, &numPingsGoneThrough, 
descriptor{std::move(descriptor)}, allocation, i](const Error& error) { if (error) { ADD_FAILURE() << error.what(); pingCompletedProm.set_value(); return; } EXPECT_TRUE(descriptorAndAllocationMatchMessage( descriptor, allocation, makeMessage(kNumPayloads, kNumTensors))); EXPECT_EQ(numPingsGoneThrough, i); numPingsGoneThrough++; if (numPingsGoneThrough == 2) { pingCompletedProm.set_value(); } }); }); }); } pingCompletedProm.get_future().get(); pg.done(PeerGroup::kServer); pg.join(PeerGroup::kServer); context->join(); }, [&]() { std::vector> buffers; std::promise pongCompletedProm; auto context = makeContext(); auto url = pg.recv(PeerGroup::kClient); auto clientPipe = context->connect(url); int numPongsGoneThrough = 0; for (int i = 0; i < 2; i++) { clientPipe->readDescriptor([&clientPipe, &pongCompletedProm, &buffers, &numPongsGoneThrough, i](const Error& error, Descriptor descriptor) { if (error) { ADD_FAILURE() << error.what(); pongCompletedProm.set_value(); return; } Allocation allocation = allocateForDescriptor(descriptor, buffers); clientPipe->read( allocation, [&clientPipe, &pongCompletedProm, &numPongsGoneThrough, descriptor{std::move(descriptor)}, allocation, i](const Error& error) { if (error) { ADD_FAILURE() << error.what(); pongCompletedProm.set_value(); return; } // Copy received message to send it back. 
Message message = messageFromAllocation(descriptor, allocation); clientPipe->write( std::move(message), [&pongCompletedProm, &numPongsGoneThrough, i]( const Error& error) { if (error) { ADD_FAILURE() << error.what(); pongCompletedProm.set_value(); return; } EXPECT_EQ(numPongsGoneThrough, i); numPongsGoneThrough++; if (numPongsGoneThrough == 2) { pongCompletedProm.set_value(); } }); }); }); } pongCompletedProm.get_future().get(); pg.done(PeerGroup::kClient); pg.join(PeerGroup::kClient); context->join(); }); } static void pipeRead( std::shared_ptr& pipe, std::vector>& buffers, std::function fn) { pipe->readDescriptor([&pipe, &buffers, fn{std::move(fn)}]( const Error& error, Descriptor descriptor) mutable { ASSERT_FALSE(error); Allocation allocation = allocateForDescriptor(descriptor, buffers); pipe->read( allocation, [fn{std::move(fn)}, descriptor{std::move(descriptor)}, allocation]( const Error& error) mutable { fn(error, std::move(descriptor), std::move(allocation)); }); }); } TEST(Context, MixedTensorMessage) { constexpr int kNumMessages = 2; ForkedThreadPeerGroup pg; pg.spawn( [&]() { std::vector> buffers; std::promise> serverPipePromise; std::promise readCompletedProm; auto context = makeContext(); auto listener = context->listen(genUrls()); pg.send(PeerGroup::kClient, listener->url("uv")); listener->accept([&](const Error& error, std::shared_ptr pipe) { if (error) { serverPipePromise.set_exception( std::make_exception_ptr(std::runtime_error(error.what()))); } else { serverPipePromise.set_value(std::move(pipe)); } }); std::shared_ptr serverPipe = serverPipePromise.get_future().get(); std::atomic readNum(kNumMessages); pipeRead( serverPipe, buffers, [&readNum, &readCompletedProm]( const Error& error, Descriptor descriptor, Allocation allocation) { ASSERT_FALSE(error); EXPECT_TRUE(descriptorAndAllocationMatchMessage( descriptor, allocation, makeMessage(kNumPayloads, kNumTensors))); if (--readNum == 0) { readCompletedProm.set_value(); } }); pipeRead( serverPipe, 
buffers, [&readNum, &readCompletedProm]( const Error& error, Descriptor descriptor, Allocation allocation) { ASSERT_FALSE(error); EXPECT_TRUE(descriptorAndAllocationMatchMessage( descriptor, allocation, makeMessage(0, 0))); if (--readNum == 0) { readCompletedProm.set_value(); } }); readCompletedProm.get_future().get(); pg.done(PeerGroup::kServer); pg.join(PeerGroup::kServer); context->join(); }, [&]() { std::promise writeCompletedProm; auto context = makeContext(); auto url = pg.recv(PeerGroup::kClient); auto clientPipe = context->connect(url); std::atomic writeNum(kNumMessages); clientPipe->write( makeMessage(kNumPayloads, kNumTensors), [&writeNum, &writeCompletedProm](const Error& error) { ASSERT_FALSE(error) << error.what(); if (--writeNum == 0) { writeCompletedProm.set_value(); } }); clientPipe->write( makeMessage(0, 0), [&writeNum, &writeCompletedProm](const Error& error) { ASSERT_FALSE(error) << error.what(); if (--writeNum == 0) { writeCompletedProm.set_value(); } }); writeCompletedProm.get_future().get(); pg.done(PeerGroup::kClient); pg.join(PeerGroup::kClient); context->join(); }); } ================================================ FILE: tensorpipe/test/core/listener_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #include #include #include #include using namespace tensorpipe; TEST(Listener, ClosingAbortsOperations) { auto context = std::make_shared(); context->registerTransport(0, "uv", transport::uv::create()); context->registerChannel(0, "basic", channel::basic::create()); { auto listener = context->listen({"uv://127.0.0.1"}); std::promise donePromise; listener->accept( [&](const Error& error, std::shared_ptr /* unused */) { EXPECT_TRUE(error); donePromise.set_value(); }); listener->close(); donePromise.get_future().get(); } context->join(); } ================================================ FILE: tensorpipe/test/core/pipe_cuda_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include using namespace tensorpipe; class CudaSimpleWriteReadWithAllTargetDevicesTest : public ClientServerPipeTestCase { InlineMessage imessage_ = { .payloads = { {.data = "payload #1", .metadata = "payload metadata #1"}, {.data = "payload #2", .metadata = "payload metadata #2"}, {.data = "payload #3", .metadata = "payload metadata #3"}, }, .tensors = { { .data = "tensor #1", .metadata = "tensor metadata #1", .device = Device{kCudaDeviceType, 0}, .targetDevice = Device{kCudaDeviceType, 0}, }, { .data = "tensor #2", .metadata = "tensor metadata #2", .device = Device{kCpuDeviceType, 0}, .targetDevice = Device{kCudaDeviceType, 0}, }, { .data = "tensor #3", .metadata = "tensor metadata #3", .device = Device{kCudaDeviceType, 0}, .targetDevice = Device{kCpuDeviceType, 0}, }, { .data = "tensor #4", .metadata = "tensor metadata #4", .device = Device{kCpuDeviceType, 0}, .targetDevice = Device{kCpuDeviceType, 0}, }, }, .metadata = "pipe metadata", }; public: void server(Pipe& pipe) override { Message message; Storage storage; std::tie(message, storage) = 
makeMessage(imessage_); auto future = pipeWriteWithFuture(pipe, message); future.get(); } void client(Pipe& pipe) override { Descriptor descriptor; Storage storage; auto future = pipeReadWithFuture( pipe, /*targetDevices=*/ { Device{kCudaDeviceType, 0}, Device{kCudaDeviceType, 0}, Device{kCpuDeviceType, 0}, Device{kCpuDeviceType, 0}, }); std::tie(descriptor, storage) = future.get(); expectDescriptorAndStorageMatchMessage(descriptor, storage, imessage_); } }; TEST(Pipe, CudaSimpleWriteReadWithAllTargetDevices) { CudaSimpleWriteReadWithAllTargetDevicesTest test; test.run(); } class CudaSimpleWriteReadWithSomeTargetDevicesTest : public ClientServerPipeTestCase { InlineMessage imessage_ = { .payloads = { {.data = "payload #1", .metadata = "payload metadata #1"}, {.data = "payload #2", .metadata = "payload metadata #2"}, {.data = "payload #3", .metadata = "payload metadata #3"}, }, .tensors = { { .data = "tensor #1", .metadata = "tensor metadata #1", .device = Device{kCudaDeviceType, 0}, .targetDevice = Device{kCudaDeviceType, 0}, }, { .data = "tensor #2", .metadata = "tensor metadata #2", .device = Device{kCudaDeviceType, 0}, }, { .data = "tensor #3", .metadata = "tensor metadata #3", .device = Device{kCpuDeviceType, 0}, }, }, .metadata = "pipe metadata", }; public: void server(Pipe& pipe) override { Message message; Storage storage; std::tie(message, storage) = makeMessage(imessage_); auto future = pipeWriteWithFuture(pipe, message); future.get(); } void client(Pipe& pipe) override { Descriptor descriptor; Storage storage; auto future = pipeReadWithFuture( pipe, /*targetDevices=*/ { Device{kCudaDeviceType, 0}, Device{kCpuDeviceType, 0}, Device{kCpuDeviceType, 0}, }); std::tie(descriptor, storage) = future.get(); expectDescriptorAndStorageMatchMessage(descriptor, storage, imessage_); } }; TEST(Pipe, CudaSimpleWriteReadWithSomeTargetDevices) { CudaSimpleWriteReadWithSomeTargetDevicesTest test; test.run(); } class CudaSimpleWriteReadWithoutTargetDeviceTest : public 
ClientServerPipeTestCase { InlineMessage imessage_ = { .payloads = { {.data = "payload #1", .metadata = "payload metadata #1"}, {.data = "payload #2", .metadata = "payload metadata #2"}, {.data = "payload #3", .metadata = "payload metadata #3"}, }, .tensors = { { .data = "tensor #1", .metadata = "tensor metadata #1", .device = Device{kCpuDeviceType, 0}, }, { .data = "tensor #2", .metadata = "tensor metadata #2", .device = Device{kCpuDeviceType, 0}, }, { .data = "tensor #3", .metadata = "tensor metadata #3", .device = Device{kCudaDeviceType, 0}, }, { .data = "tensor #4", .metadata = "tensor metadata #4", .device = Device{kCudaDeviceType, 0}, }, }, .metadata = "pipe metadata", }; public: void server(Pipe& pipe) override { Message message; Storage storage; std::tie(message, storage) = makeMessage(imessage_); auto future = pipeWriteWithFuture(pipe, message); future.get(); } void client(Pipe& pipe) override { Descriptor descriptor; Storage storage; auto future = pipeReadWithFuture( pipe, /*targetDevices=*/ { Device{kCpuDeviceType, 0}, Device{kCudaDeviceType, 0}, Device{kCpuDeviceType, 0}, Device{kCudaDeviceType, 0}, }); std::tie(descriptor, storage) = future.get(); expectDescriptorAndStorageMatchMessage(descriptor, storage, imessage_); } }; TEST(Pipe, CudaSimpleWriteReadWithoutTargetDevice) { CudaSimpleWriteReadWithoutTargetDeviceTest test; test.run(); } ================================================ FILE: tensorpipe/test/core/pipe_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include using namespace tensorpipe; class SimpleWriteReadTest : public ClientServerPipeTestCase { InlineMessage imessage_ = { .payloads = { {.data = "payload #1", .metadata = "payload metadata #1"}, {.data = "payload #2", .metadata = "payload metadata #2"}, {.data = "payload #3", .metadata = "payload metadata #3"}, }, .tensors = { { .data = "tensor #1", .metadata = "tensor metadata #1", .device = Device{kCpuDeviceType, 0}, }, { .data = "tensor #2", .metadata = "tensor metadata #2", .device = Device{kCpuDeviceType, 0}, }, { .data = "tensor #3", .metadata = "tensor metadata #3", .device = Device{kCpuDeviceType, 0}, }, }, .metadata = "pipe metadata", }; public: void server(Pipe& pipe) override { Message message; Storage storage; std::tie(message, storage) = makeMessage(imessage_); auto future = pipeWriteWithFuture(pipe, message); future.get(); } void client(Pipe& pipe) override { Descriptor descriptor; Storage storage; auto future = pipeReadWithFuture( pipe, /*targetDevices=*/ { Device{kCpuDeviceType, 0}, Device{kCpuDeviceType, 0}, Device{kCpuDeviceType, 0}, }); std::tie(descriptor, storage) = future.get(); expectDescriptorAndStorageMatchMessage(descriptor, storage, imessage_); } }; TEST(Pipe, SimpleWriteRead) { SimpleWriteReadTest test; test.run(); } class SimpleWriteReadPayloadsOnlyTest : public ClientServerPipeTestCase { InlineMessage imessage_ = { .payloads = { {.data = "payload #1", .metadata = "payload metadata #1"}, {.data = "payload #2", .metadata = "payload metadata #2"}, {.data = "payload #3", .metadata = "payload metadata #3"}, }, .tensors = {}, .metadata = "pipe metadata", }; public: void server(Pipe& pipe) override { Message message; Storage storage; std::tie(message, storage) = makeMessage(imessage_); auto future = pipeWriteWithFuture(pipe, message); future.get(); } void client(Pipe& pipe) override { Descriptor descriptor; Storage storage; auto future = pipeReadWithFuture(pipe, /*targetDevices=*/{}); std::tie(descriptor, storage) = future.get(); 
expectDescriptorAndStorageMatchMessage(descriptor, storage, imessage_); } }; TEST(Pipe, SimpleWriteReadPayloadsOnly) { SimpleWriteReadPayloadsOnlyTest test; test.run(); } class SimpleWriteReadTensorsOnlyTest : public ClientServerPipeTestCase { InlineMessage imessage_ = { .payloads = {}, .tensors = { { .data = "tensor #1", .metadata = "tensor metadata #1", .device = Device{kCpuDeviceType, 0}, }, { .data = "tensor #2", .metadata = "tensor metadata #2", .device = Device{kCpuDeviceType, 0}, }, { .data = "tensor #3", .metadata = "tensor metadata #3", .device = Device{kCpuDeviceType, 0}, }, }, .metadata = "pipe metadata", }; public: void server(Pipe& pipe) override { Message message; Storage storage; std::tie(message, storage) = makeMessage(imessage_); auto future = pipeWriteWithFuture(pipe, message); future.get(); } void client(Pipe& pipe) override { Descriptor descriptor; Storage storage; auto future = pipeReadWithFuture( pipe, /*targetDevices=*/ { Device{kCpuDeviceType, 0}, Device{kCpuDeviceType, 0}, Device{kCpuDeviceType, 0}, }); std::tie(descriptor, storage) = future.get(); expectDescriptorAndStorageMatchMessage(descriptor, storage, imessage_); } }; TEST(Pipe, SimpleWriteReadTensorsOnly) { SimpleWriteReadTensorsOnlyTest test; test.run(); } class SimpleWriteReadWithAllTargetDevicesTest : public ClientServerPipeTestCase { InlineMessage imessage_ = { .payloads = { {.data = "payload #1", .metadata = "payload metadata #1"}, {.data = "payload #2", .metadata = "payload metadata #2"}, {.data = "payload #3", .metadata = "payload metadata #3"}, }, .tensors = { { .data = "tensor #1", .metadata = "tensor metadata #1", .device = Device{kCpuDeviceType, 0}, .targetDevice = Device{kCpuDeviceType, 0}, }, { .data = "tensor #2", .metadata = "tensor metadata #2", .device = Device{kCpuDeviceType, 0}, .targetDevice = Device{kCpuDeviceType, 0}, }, { .data = "tensor #3", .metadata = "tensor metadata #3", .device = Device{kCpuDeviceType, 0}, .targetDevice = Device{kCpuDeviceType, 0}, }, }, 
.metadata = "pipe metadata", }; public: void server(Pipe& pipe) override { Message message; Storage storage; std::tie(message, storage) = makeMessage(imessage_); auto future = pipeWriteWithFuture(pipe, message); future.get(); } void client(Pipe& pipe) override { Descriptor descriptor; Storage storage; auto future = pipeReadWithFuture( pipe, /*targetDevices=*/ { Device{kCpuDeviceType, 0}, Device{kCpuDeviceType, 0}, Device{kCpuDeviceType, 0}, }); std::tie(descriptor, storage) = future.get(); expectDescriptorAndStorageMatchMessage(descriptor, storage, imessage_); } }; TEST(Pipe, SimpleWriteReadWithAllTargetDevices) { SimpleWriteReadWithAllTargetDevicesTest test; test.run(); } class SimpleWriteReadWithSomeTargetDevicesTest : public ClientServerPipeTestCase { InlineMessage imessage_ = { .payloads = { {.data = "payload #1", .metadata = "payload metadata #1"}, {.data = "payload #2", .metadata = "payload metadata #2"}, {.data = "payload #3", .metadata = "payload metadata #3"}, }, .tensors = { { .data = "tensor #1", .metadata = "tensor metadata #1", .device = Device{kCpuDeviceType, 0}, .targetDevice = Device{kCpuDeviceType, 0}, }, { .data = "tensor #2", .metadata = "tensor metadata #2", .device = Device{kCpuDeviceType, 0}, }, { .data = "tensor #3", .metadata = "tensor metadata #3", .device = Device{kCpuDeviceType, 0}, .targetDevice = Device{kCpuDeviceType, 0}, }, }, .metadata = "pipe metadata", }; public: void server(Pipe& pipe) override { Message message; Storage storage; std::tie(message, storage) = makeMessage(imessage_); auto future = pipeWriteWithFuture(pipe, message); future.get(); } void client(Pipe& pipe) override { Descriptor descriptor; Storage storage; auto future = pipeReadWithFuture( pipe, /*targetDevices=*/ { Device{kCpuDeviceType, 0}, Device{kCpuDeviceType, 0}, Device{kCpuDeviceType, 0}, }); std::tie(descriptor, storage) = future.get(); expectDescriptorAndStorageMatchMessage(descriptor, storage, imessage_); } }; TEST(Pipe, 
SimpleWriteReadWithSomeTargetDevices) { SimpleWriteReadWithSomeTargetDevicesTest test; test.run(); } class MultipleWriteReadTest : public ClientServerPipeTestCase { InlineMessage imessage1_ = { .payloads = { {.data = "payload #1.1", .metadata = "payload metadata #1.1"}, }, .tensors = { { .data = "tensor #1.1", .metadata = "tensor metadata #1.1", .device = Device{kCpuDeviceType, 0}, }, }, .metadata = "message metadata", }; InlineMessage imessage2_ = { .payloads = { {.data = "payload #2.1", .metadata = "payload metadata #2.1"}, }, .tensors = { { .data = "tensor #2.1", .metadata = "tensor metadata #2.1", .device = Device{kCpuDeviceType, 0}, }, }, .metadata = "message metadata", }; public: void server(Pipe& pipe) override { Message message1; Storage storage1; std::tie(message1, storage1) = makeMessage(imessage1_); auto future1 = pipeWriteWithFuture(pipe, message1); Message message2; Storage storage2; std::tie(message2, storage2) = makeMessage(imessage2_); auto future2 = pipeWriteWithFuture(pipe, message2); future1.get(); future2.get(); } void client(Pipe& pipe) override { auto future1 = pipeReadWithFuture( pipe, /*targetDevices=*/ { Device{kCpuDeviceType, 0}, }); auto future2 = pipeReadWithFuture( pipe, /*targetDevices=*/ { Device{kCpuDeviceType, 0}, }); Descriptor descriptor1; Storage storage1; std::tie(descriptor1, storage1) = future1.get(); expectDescriptorAndStorageMatchMessage(descriptor1, storage1, imessage1_); Descriptor descriptor2; Storage storage2; std::tie(descriptor2, storage2) = future2.get(); expectDescriptorAndStorageMatchMessage(descriptor2, storage2, imessage2_); } }; TEST(Pipe, MultipleWriteRead) { MultipleWriteReadTest test; test.run(); } class MultipleWriteReadWithSomeTargetDevicesTest : public ClientServerPipeTestCase { InlineMessage imessage1_ = { .payloads = { {.data = "payload #1.1", .metadata = "payload metadata #1.1"}, }, .tensors = { { .data = "tensor #1.1", .metadata = "tensor metadata #1.1", .device = Device{kCpuDeviceType, 0}, }, }, 
.metadata = "message metadata", }; InlineMessage imessage2_ = { .payloads = { {.data = "payload #2.1", .metadata = "payload metadata #2.1"}, }, .tensors = { { .data = "tensor #2.1", .metadata = "tensor metadata #2.1", .device = Device{kCpuDeviceType, 0}, .targetDevice = Device{kCpuDeviceType, 0}, }, }, .metadata = "message metadata", }; public: void server(Pipe& pipe) override { Message message1; Storage storage1; std::tie(message1, storage1) = makeMessage(imessage1_); auto future1 = pipeWriteWithFuture(pipe, message1); Message message2; Storage storage2; std::tie(message2, storage2) = makeMessage(imessage2_); auto future2 = pipeWriteWithFuture(pipe, message2); future1.get(); future2.get(); } void client(Pipe& pipe) override { auto future1 = pipeReadWithFuture( pipe, /*targetDevices=*/ { Device{kCpuDeviceType, 0}, }); auto future2 = pipeReadWithFuture( pipe, /*targetDevices=*/ { Device{kCpuDeviceType, 0}, }); Descriptor descriptor1; Storage storage1; std::tie(descriptor1, storage1) = future1.get(); expectDescriptorAndStorageMatchMessage(descriptor1, storage1, imessage1_); Descriptor descriptor2; Storage storage2; std::tie(descriptor2, storage2) = future2.get(); expectDescriptorAndStorageMatchMessage(descriptor2, storage2, imessage2_); } }; TEST(Pipe, MultipleWriteReadWithSomeTargetDevices) { MultipleWriteReadWithSomeTargetDevicesTest test; test.run(); } class WriteFromBothThenReadTest : public ClientServerPipeTestCase { InlineMessage imessage1_ = { .payloads = { {.data = "payload #1.1", .metadata = "payload metadata #1.1"}, }, .tensors = { { .data = "tensor #1.1", .metadata = "tensor metadata #1.1", .device = Device{kCpuDeviceType, 0}, }, }, .metadata = "message metadata", }; InlineMessage imessage2_ = { .payloads = { {.data = "payload #2.1", .metadata = "payload metadata #2.1"}, }, .tensors = { { .data = "tensor #2.1", .metadata = "tensor metadata #2.1", .device = Device{kCpuDeviceType, 0}, }, }, .metadata = "message metadata", }; public: void server(Pipe& pipe) 
override { Message message; Storage writeStorage; std::tie(message, writeStorage) = makeMessage(imessage1_); auto writeFuture = pipeWriteWithFuture(pipe, message); auto readFuture = pipeReadWithFuture( pipe, /*targetDevices=*/ { Device{kCpuDeviceType, 0}, }); writeFuture.get(); Descriptor descriptor; Storage readStorage; std::tie(descriptor, readStorage) = readFuture.get(); expectDescriptorAndStorageMatchMessage(descriptor, readStorage, imessage2_); } void client(Pipe& pipe) override { Message message; Storage writeStorage; std::tie(message, writeStorage) = makeMessage(imessage2_); auto writeFuture = pipeWriteWithFuture(pipe, message); auto readFuture = pipeReadWithFuture( pipe, /*targetDevices=*/ { Device{kCpuDeviceType, 0}, }); writeFuture.get(); Descriptor descriptor; Storage readStorage; std::tie(descriptor, readStorage) = readFuture.get(); expectDescriptorAndStorageMatchMessage(descriptor, readStorage, imessage1_); } }; TEST(Pipe, WriteFromBothThenRead) { WriteFromBothThenReadTest test; test.run(); } ================================================ FILE: tensorpipe/test/core/pipe_test.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include #include #include #include #include #include #if TP_USE_CUDA #include #include #endif // TP_USE_CUDA struct Storage { std::vector> payloads; std::vector, tensorpipe::Buffer>> tensors; }; struct InlineMessage { struct Payload { std::string data; std::string metadata; }; struct Tensor { std::string data; std::string metadata; tensorpipe::Device device; tensorpipe::optional targetDevice; }; std::vector payloads; std::vector tensors; std::string metadata; }; inline std::pair makeMessage( InlineMessage imessage) { tensorpipe::Message message; Storage storage; for (auto& payload : imessage.payloads) { size_t length = payload.data.length(); auto data = std::unique_ptr>( new uint8_t[length]); std::memcpy(data.get(), &payload.data[0], length); message.payloads.push_back({ .data = data.get(), .length = length, .metadata = payload.metadata, }); storage.payloads.push_back(std::move(data)); } for (auto& tensor : imessage.tensors) { size_t length = tensor.data.length(); tensorpipe::Buffer buffer; std::shared_ptr data; if (tensor.device.type == tensorpipe::kCpuDeviceType) { data = std::unique_ptr>( new uint8_t[length]); std::memcpy(data.get(), &tensor.data[0], length); buffer = tensorpipe::CpuBuffer{.ptr = data.get()}; #if TP_USE_CUDA } else if (tensor.device.type == tensorpipe::kCudaDeviceType) { void* cudaPtr; TP_CUDA_CHECK(cudaSetDevice(tensor.device.index)); TP_CUDA_CHECK(cudaMalloc(&cudaPtr, length)); data = std::unique_ptr>( cudaPtr, [](void* ptr) { TP_CUDA_CHECK(cudaFree(ptr)); }); // TODO: Properly dispose of stream when done. 
cudaStream_t stream; TP_CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); buffer = tensorpipe::CudaBuffer{ .ptr = data.get(), .stream = stream, }; TP_CUDA_CHECK(cudaMemcpyAsync( cudaPtr, &tensor.data[0], length, cudaMemcpyDefault, stream)); #endif // TP_USE_CUDA } else { ADD_FAILURE() << "Unexpected source device: " << tensor.device.toString(); } message.tensors.push_back({ .buffer = buffer, .length = length, .targetDevice = tensor.targetDevice, .metadata = tensor.metadata, }); storage.tensors.push_back({std::move(data), std::move(buffer)}); } message.metadata = imessage.metadata; return {std::move(message), std::move(storage)}; } inline std::pair makeAllocation( const tensorpipe::Descriptor& descriptor, const std::vector& devices) { tensorpipe::Allocation allocation; Storage storage; for (const auto& payload : descriptor.payloads) { auto data = std::unique_ptr>( new uint8_t[payload.length]); allocation.payloads.push_back({.data = data.get()}); storage.payloads.push_back(std::move(data)); } TP_DCHECK(devices.size() == descriptor.tensors.size()); for (size_t tensorIdx = 0; tensorIdx < descriptor.tensors.size(); ++tensorIdx) { const auto& tensor = descriptor.tensors[tensorIdx]; tensorpipe::Device targetDevice = devices[tensorIdx]; if (tensor.targetDevice.has_value()) { TP_DCHECK(targetDevice == *tensor.targetDevice); } if (targetDevice.type == tensorpipe::kCpuDeviceType) { auto data = std::unique_ptr>( new uint8_t[tensor.length]); tensorpipe::Buffer buffer = tensorpipe::CpuBuffer{.ptr = data.get()}; allocation.tensors.push_back({.buffer = buffer}); storage.tensors.push_back({std::move(data), std::move(buffer)}); #if TP_USE_CUDA } else if (targetDevice.type == tensorpipe::kCudaDeviceType) { void* cudaPtr; TP_CUDA_CHECK(cudaSetDevice(targetDevice.index)); TP_CUDA_CHECK(cudaMalloc(&cudaPtr, tensor.length)); auto data = std::unique_ptr>( cudaPtr, [](void* ptr) { TP_CUDA_CHECK(cudaFree(ptr)); }); // TODO: Properly dispose of stream when done. 
cudaStream_t stream; TP_CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); tensorpipe::Buffer buffer = tensorpipe::CudaBuffer{ .ptr = data.get(), .stream = stream, }; allocation.tensors.push_back({.buffer = buffer}); storage.tensors.push_back({std::move(data), std::move(buffer)}); #endif // TP_USE_CUDA } else { ADD_FAILURE() << "Unexpected target device: " << targetDevice.toString(); } } return {std::move(allocation), std::move(storage)}; } inline std::future pipeWriteWithFuture( tensorpipe::Pipe& pipe, tensorpipe::Message message) { auto promise = std::make_shared>(); auto future = promise->get_future(); pipe.write( std::move(message), [promise{std::move(promise)}](const tensorpipe::Error& error) { if (error) { promise->set_exception( std::make_exception_ptr(std::runtime_error(error.what()))); return; } promise->set_value(); }); return future; } inline std::future> pipeReadWithFuture( tensorpipe::Pipe& pipe, std::vector targetDevices) { auto promise = std::make_shared< std::promise>>(); auto future = promise->get_future(); pipe.readDescriptor([&pipe, promise{std::move(promise)}, targetDevices{std::move(targetDevices)}]( const tensorpipe::Error& error, tensorpipe::Descriptor descriptor) mutable { if (error) { promise->set_exception( std::make_exception_ptr(std::runtime_error(error.what()))); return; } tensorpipe::Allocation allocation; Storage storage; std::tie(allocation, storage) = makeAllocation(descriptor, targetDevices); pipe.read( std::move(allocation), [promise{std::move(promise)}, descriptor{std::move(descriptor)}, storage{std::move(storage)}](const tensorpipe::Error& error) mutable { if (error) { promise->set_exception( std::make_exception_ptr(std::runtime_error(error.what()))); return; } promise->set_value(std::make_tuple( std::move(descriptor), std::move(storage))); }); }); return future; } inline void expectDescriptorAndStorageMatchMessage( tensorpipe::Descriptor descriptor, Storage storage, InlineMessage imessage) { 
EXPECT_EQ(imessage.metadata, descriptor.metadata); EXPECT_EQ(descriptor.payloads.size(), storage.payloads.size()); EXPECT_EQ(imessage.payloads.size(), storage.payloads.size()); for (size_t idx = 0; idx < imessage.payloads.size(); ++idx) { EXPECT_EQ( imessage.payloads[idx].metadata, descriptor.payloads[idx].metadata); EXPECT_EQ( imessage.payloads[idx].data.length(), descriptor.payloads[idx].length); EXPECT_EQ( imessage.payloads[idx].data, std::string( static_cast(storage.payloads[idx].get()), descriptor.payloads[idx].length)); } EXPECT_EQ(descriptor.tensors.size(), storage.tensors.size()); EXPECT_EQ(imessage.tensors.size(), storage.tensors.size()); for (size_t idx = 0; idx < imessage.tensors.size(); ++idx) { EXPECT_TRUE( imessage.tensors[idx].device == descriptor.tensors[idx].sourceDevice); EXPECT_EQ(imessage.tensors[idx].metadata, descriptor.tensors[idx].metadata); EXPECT_EQ( imessage.tensors[idx].targetDevice, descriptor.tensors[idx].targetDevice); const tensorpipe::Device& device = storage.tensors[idx].second.device(); EXPECT_TRUE( !imessage.tensors[idx].targetDevice || imessage.tensors[idx].targetDevice == device); size_t length = descriptor.tensors[idx].length; EXPECT_EQ(imessage.tensors[idx].data.length(), length); if (device.type == tensorpipe::kCpuDeviceType) { const tensorpipe::CpuBuffer& buffer = storage.tensors[idx].second.unwrap(); EXPECT_EQ( imessage.tensors[idx].data, std::string(static_cast(buffer.ptr), length)); #if TP_USE_CUDA } else if (device.type == tensorpipe::kCudaDeviceType) { const tensorpipe::CudaBuffer& buffer = storage.tensors[idx].second.unwrap(); std::string data(length, 0x0); TP_CUDA_CHECK(cudaStreamSynchronize(buffer.stream)); TP_CUDA_CHECK( cudaMemcpy(&data[0], buffer.ptr, length, cudaMemcpyDefault)); EXPECT_EQ(imessage.tensors[idx].data, data.data()); #endif // TP_USE_CUDA } else { ADD_FAILURE() << "Unexpected target device: " << device.toString(); } } } inline std::vector genUrls() { std::vector res; #if TENSORPIPE_HAS_SHM_TRANSPORT 
res.push_back("shm://"); #endif // TENSORPIPE_HAS_SHM_TRANSPORT res.push_back("uv://127.0.0.1"); return res; } inline std::shared_ptr makeContext() { auto context = std::make_shared(); context->registerTransport(0, "uv", tensorpipe::transport::uv::create()); #if TENSORPIPE_HAS_SHM_TRANSPORT context->registerTransport(1, "shm", tensorpipe::transport::shm::create()); #endif // TENSORPIPE_HAS_SHM_TRANSPORT context->registerChannel(100, "basic", tensorpipe::channel::basic::create()); #if TENSORPIPE_HAS_CMA_CHANNEL context->registerChannel(101, "cma", tensorpipe::channel::cma::create()); #endif // TENSORPIPE_HAS_CMA_CHANNEL #if TP_USE_CUDA context->registerChannel( 10, "cuda_basic", tensorpipe::channel::cuda_basic::create( tensorpipe::channel::basic::create())); #if TENSORPIPE_HAS_CUDA_IPC_CHANNEL context->registerChannel( 11, "cuda_ipc", tensorpipe::channel::cuda_ipc::create()); #endif // TENSORPIPE_HAS_CUDA_IPC_CHANNEL context->registerChannel( 12, "cuda_xth", tensorpipe::channel::cuda_xth::create()); #endif // TP_USE_CUDA return context; } class ClientServerPipeTestCase { ForkedThreadPeerGroup pg_; public: void run() { pg_.spawn( [&]() { auto context = makeContext(); auto listener = context->listen(genUrls()); pg_.send(PeerGroup::kClient, listener->url("uv")); std::promise> promise; listener->accept([&](const tensorpipe::Error& error, std::shared_ptr pipe) { if (error) { promise.set_exception( std::make_exception_ptr(std::runtime_error(error.what()))); } else { promise.set_value(std::move(pipe)); } }); std::shared_ptr pipe = promise.get_future().get(); server(*pipe); pg_.done(PeerGroup::kServer); pg_.join(PeerGroup::kServer); context->join(); }, [&]() { auto context = makeContext(); auto url = pg_.recv(PeerGroup::kClient); auto pipe = context->connect(url); client(*pipe); pg_.done(PeerGroup::kClient); pg_.join(PeerGroup::kClient); context->join(); }); } virtual void client(tensorpipe::Pipe& pipe) = 0; virtual void server(tensorpipe::Pipe& pipe) = 0; virtual 
~ClientServerPipeTestCase() = default; }; ================================================ FILE: tensorpipe/test/peer_group.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include class PeerGroup { public: static constexpr int kNumPeers = 2; static constexpr int kServer = 0; static constexpr int kClient = 1; virtual ~PeerGroup() = default; // Send message to given peer. virtual void send(int receiverId, const std::string&) = 0; // Read next message for given peer. This method is blocking. virtual std::string recv(int receiverId) = 0; // Spawn two peers each running one of the provided functions. virtual void spawn(std::function, std::function) = 0; // Whether the two endpoints are two threads in the same process (as opposed // to two separate processes). virtual bool endpointsInSameProcess() const = 0; // Signal other peers that this peer is done. void done(int selfId) { send(1 - selfId, doneString_); std::unique_lock lock(m_); done_[selfId] = true; condVar_[selfId].notify_one(); } // Wait for all peers (including this one) to be done. void join(int selfId) { EXPECT_EQ(doneString_, recv(selfId)); std::unique_lock lock(m_); condVar_[selfId].wait(lock, [&] { return done_[selfId]; }); } private: // This should be static but then we need to define it out-of-line (or mark it // as inline once we can use C++-17). 
const std::string doneString_ = "done"; std::mutex m_; std::array done_{{false, false}}; std::array condVar_; }; class ThreadPeerGroup : public PeerGroup { public: void send(int receiverId, const std::string& str) override { q_[receiverId].push(str); } std::string recv(int receiverId) override { return q_[receiverId].pop(); } void spawn(std::function f1, std::function f2) override { std::array, kNumPeers> fns = { std::move(f1), std::move(f2)}; std::array ts; for (int peerId = 0; peerId < kNumPeers; ++peerId) { ts[peerId] = std::thread(fns[peerId]); } for (auto& t : ts) { t.join(); } } bool endpointsInSameProcess() const override { return true; } private: std::array, kNumPeers> q_; }; class ForkedThreadPeerGroup : public ThreadPeerGroup { public: void spawn(std::function f1, std::function f2) override { // Some tests modify the global state of the process (such as initializing // the CUDA context), which would cause other tests running as sub-processes // to fail. Here, we run all thread-based tests in a sub-process to avoid // this issue. pid_t pid = fork(); TP_THROW_SYSTEM_IF(pid < 0, errno) << "Failed to fork"; if (pid == 0) { ThreadPeerGroup::spawn(f1, f2); std::exit(((testing::Test::HasFailure()) ? 
1 : 0)); } int status; TP_THROW_SYSTEM_IF(waitpid(pid, &status, 0) < 0, errno) << "Failed to wait for child test process"; EXPECT_TRUE(WIFEXITED(status)); if (WIFSIGNALED(status)) { TP_LOG_WARNING() << "Test process terminated with signal " << WTERMSIG(status); } const int exitStatus = WEXITSTATUS(status); EXPECT_EQ(0, exitStatus); } }; class ProcessPeerGroup : public PeerGroup { public: void send(int receiverId, const std::string& str) override { uint64_t len = str.length(); int ret; ret = write(pipefd_[receiverId][kWriteEnd], &len, sizeof(len)); TP_THROW_SYSTEM_IF(ret < 0, errno) << "Failed to write to pipe"; EXPECT_EQ(sizeof(len), ret); ret = write(pipefd_[receiverId][kWriteEnd], str.data(), len); TP_THROW_SYSTEM_IF(ret < 0, errno) << "Failed to write to pipe"; EXPECT_EQ(len, ret); } std::string recv(int receiverId) override { int ret; uint64_t len; ret = read(pipefd_[receiverId][kReadEnd], &len, sizeof(len)); TP_THROW_SYSTEM_IF(ret < 0, errno) << "Failed to read from pipe"; EXPECT_EQ(sizeof(len), ret); std::string str(len, 0); ret = read(pipefd_[receiverId][kReadEnd], &str[0], len); TP_THROW_SYSTEM_IF(ret < 0, errno) << "Failed to read from pipe"; EXPECT_EQ(len, ret); return str; } void spawn(std::function f1, std::function f2) override { std::array, kNumPeers> fns = { std::move(f1), std::move(f2)}; std::array pids = {-1, -1}; for (int peerId = 0; peerId < kNumPeers; ++peerId) { TP_THROW_SYSTEM_IF(pipe(pipefd_[peerId].data()) < 0, errno) << "Failed to create pipe"; } for (int peerId = 0; peerId < kNumPeers; ++peerId) { pids[peerId] = fork(); TP_THROW_SYSTEM_IF(pids[peerId] < 0, errno) << "Failed to fork"; if (pids[peerId] == 0) { try { // Close writing end of our pipe. TP_THROW_SYSTEM_IF(close(pipefd_[peerId][kWriteEnd]) < 0, errno) << "Failed to close fd"; // Close reading end of other pipe. 
TP_THROW_SYSTEM_IF(close(pipefd_[1 - peerId][kReadEnd]) < 0, errno) << "Failed to close fd"; fns[peerId](); } catch (const std::exception& e) { TP_LOG_ERROR() << "Child #" << peerId << " (PID " << getpid() << ") encountered exception " << e.what(); std::exit(2); } catch (...) { std::exit(3); } std::exit(((testing::Test::HasFailure()) ? 1 : 0)); } } // Close all pipes in parent process. for (int peerId = 0; peerId < kNumPeers; ++peerId) { for (int pipeEnd = 0; pipeEnd < 2; ++pipeEnd) { TP_THROW_SYSTEM_IF(close(pipefd_[peerId][pipeEnd]) < 0, errno) << "Failed to close fd"; } } for (int peerId = 0; peerId < kNumPeers; ++peerId) { int status; TP_THROW_SYSTEM_IF(waitpid(-1, &status, 0) < 0, errno) << "Failed to wait for child process"; EXPECT_TRUE(WIFEXITED(status)); if (WIFSIGNALED(status)) { TP_LOG_WARNING() << "Peer process terminated with signal " << WTERMSIG(status); } const int exitStatus = WEXITSTATUS(status); EXPECT_EQ(0, exitStatus); } } bool endpointsInSameProcess() const override { return false; } private: static constexpr int kReadEnd = 0; static constexpr int kWriteEnd = 1; std::array, kNumPeers> pipefd_; }; ================================================ FILE: tensorpipe/test/python/tensorpipe.py ================================================ #!/usr/bin/env python3 # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
import threading import unittest import pytensorpipe as tp class TestTensorpipe(unittest.TestCase): def test_read_write(self): context = tp.Context() context.register_transport(0, "tcp", tp.create_uv_transport()) create_shm_transport = getattr(tp, "create_shm_transport", None) if create_shm_transport is not None: context.register_transport(-1, "shm", create_shm_transport()) context.register_channel(0, "basic", tp.create_basic_channel()) create_cma_channel = getattr(tp, "create_cma_channel", None) if create_cma_channel is not None: context.register_channel(-1, "cma", create_cma_channel()) # We must keep a reference to it, or it will be destroyed early. server_pipe = None listener: tp.Listener = context.listen(["tcp://127.0.0.1"]) write_completed = threading.Event() def on_connection(pipe: tp.Pipe) -> None: global server_pipe payload = tp.OutgoingPayload(b"Hello ", b"a greeting") tensor = tp.OutgoingTensor(b"World!", b"a place") message = tp.OutgoingMessage(b"metadata", [payload], [tensor]) pipe.write(message, on_write) server_pipe = pipe def on_write() -> None: write_completed.set() listener.listen(on_connection) client_pipe: tp.Pipe = context.connect(listener.get_url("tcp")) received_payloads = None received_tensors = None read_completed = threading.Event() def on_read_descriptor(message: tp.IncomingMessage) -> None: nonlocal received_payloads, received_tensors self.assertEqual(message.metadata, bytearray(b"metadata")) received_payloads = [] for payload in message.payloads: self.assertEqual(payload.metadata, bytearray(b"a greeting")) received_payloads.append(bytearray(payload.length)) payload.buffer = received_payloads[-1] received_tensors = [] for tensor in message.tensors: self.assertEqual(tensor.metadata, bytearray(b"a place")) received_tensors.append(bytearray(tensor.length)) tensor.buffer = received_tensors[-1] client_pipe.read(message, on_read) def on_read() -> None: read_completed.set() client_pipe.read_descriptor(on_read_descriptor) write_completed.wait() 
read_completed.wait() self.assertEqual(received_payloads, [bytearray(b"Hello ")]) self.assertEqual(received_tensors, [bytearray(b"World!")]) # Due to a current limitation we're not releasing the GIL when calling # the context's destructor, which implicitly calls join, which may fire # some callbacks that also try to acquire the GIL and thus deadlock. # So, for now, we must explicitly call join. # See https://github.com/pybind/pybind11/issues/1446. context.join() if __name__ == "__main__": unittest.main() ================================================ FILE: tensorpipe/test/test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include // One-time init to use EPIPE errors instead of SIGPIPE namespace { struct Initializer { explicit Initializer() { signal(SIGPIPE, SIG_IGN); } }; Initializer initializer; } // namespace ================================================ FILE: tensorpipe/test/test_environment.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #if TP_USE_CUDA #include #include #include #include #include #include #endif // TP_USE_CUDA int TestEnvironment::numCudaDevices() { static int count = -1; if (count == -1) { #if TP_USE_CUDA pid_t pid = fork(); TP_THROW_SYSTEM_IF(pid < 0, errno) << "Failed to fork"; if (pid == 0) { int res; TP_CUDA_CHECK(cudaGetDeviceCount(&res)); std::exit(res); } else { int status; TP_THROW_SYSTEM_IF(waitpid(pid, &status, 0) < 0, errno) << "Failed to wait for child process"; TP_THROW_ASSERT_IF(!WIFEXITED(status)); count = WEXITSTATUS(status); } #else // TP_USE_CUDA count = 0; #endif // TP_USE_CUDA } return count; } ================================================ FILE: tensorpipe/test/test_environment.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once class TestEnvironment { public: static int numCudaDevices(); }; ================================================ FILE: tensorpipe/test/transport/connection_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #include using namespace tensorpipe; using namespace tensorpipe::transport; TEST_P(TransportTest, Connection_Initialization) { constexpr size_t numBytes = 13; std::array garbage; testConnection( [&](std::shared_ptr conn) { doRead( conn, [&](const Error& error, const void* /* unused */, size_t len) { ASSERT_FALSE(error) << error.what(); ASSERT_EQ(len, garbage.size()); peers_->done(PeerGroup::kServer); }); peers_->join(PeerGroup::kServer); }, [&](std::shared_ptr conn) { doWrite(conn, garbage.data(), garbage.size(), [&](const Error& error) { ASSERT_FALSE(error) << error.what(); peers_->done(PeerGroup::kClient); }); peers_->join(PeerGroup::kClient); }); } TEST_P(TransportTest, Connection_InitializationError) { int numRequests = 10; testConnection( [&](std::shared_ptr /* unused */) { // Closes connection }, [&](std::shared_ptr conn) { for (int i = 0; i < numRequests; i++) { std::promise readCompletedProm; doRead( conn, [&, conn]( const Error& error, const void* /* unused */, size_t /* unused */) { ASSERT_TRUE(error); readCompletedProm.set_value(); }); readCompletedProm.get_future().wait(); } }); } // Disabled because no one really knows what this test was meant to check. TEST_P(TransportTest, DISABLED_Connection_DestroyConnectionFromCallback) { testConnection( [&](std::shared_ptr /* unused */) { // Closes connection }, [&](std::shared_ptr conn) { // This should be the only connection instance. EXPECT_EQ(conn.use_count(), 1); // Move connection instance to lambda scope, so we can destroy // the only instance we have from the callback itself. This // tests that the transport keeps the connection alive as long // as it's executing a callback. doRead( conn, [conn]( const Error& /* unused */, const void* /* unused */, size_t /* unused */) mutable { // Destroy connection from within callback. 
EXPECT_GT(conn.use_count(), 1); conn.reset(); }); }); } namespace { struct MyNopType { uint32_t myIntField; NOP_STRUCTURE(MyNopType, myIntField); }; } // namespace TEST_P(TransportTest, Connection_NopWrite) { constexpr size_t kSize = 0x42; testConnection( [&](std::shared_ptr conn) { auto holder = std::make_shared>(); MyNopType& object = holder->getObject(); conn->read(*holder, [&, conn, holder](const Error& error) { ASSERT_FALSE(error) << error.what(); ASSERT_EQ(object.myIntField, kSize); peers_->done(PeerGroup::kServer); }); peers_->join(PeerGroup::kServer); }, [&](std::shared_ptr conn) { auto holder = std::make_shared>(); MyNopType& object = holder->getObject(); object.myIntField = kSize; conn->write(*holder, [&, conn, holder](const Error& error) { ASSERT_FALSE(error) << error.what(); peers_->done(PeerGroup::kClient); }); peers_->join(PeerGroup::kClient); }); } TEST_P(TransportTest, Connection_QueueWritesBeforeReads) { constexpr int kMsgSize = 16 * 1024; constexpr int numMsg = 10; const std::string kReady = "ready"; std::string msg[numMsg]; for (int i = 0; i < numMsg; i++) { msg[i] = std::string(kMsgSize, static_cast(i)); } testConnection( [&](std::shared_ptr conn) { for (int i = 0; i < numMsg; i++) { doWrite( conn, msg[i].c_str(), msg[i].length(), [&, conn, i](const Error& error) { ASSERT_FALSE(error) << error.what(); if (i == numMsg - 1) { peers_->send(PeerGroup::kClient, kReady); peers_->done(PeerGroup::kServer); } }); } peers_->join(PeerGroup::kServer); }, [&](std::shared_ptr conn) { ASSERT_EQ(kReady, peers_->recv(PeerGroup::kClient)); for (int i = 0; i < numMsg; i++) { doRead( conn, [&, conn, i](const Error& error, const void* data, size_t len) { ASSERT_FALSE(error) << error.what(); ASSERT_EQ(len, msg[i].length()); const char* cdata = (const char*)data; for (int j = 0; j < len; ++j) { const char c = cdata[j]; ASSERT_EQ(c, msg[i][j]) << "Wrong value at position " << j << " of " << msg[i].length(); } if (i == numMsg - 1) { peers_->done(PeerGroup::kClient); } 
}); } peers_->join(PeerGroup::kClient); }); } // TODO: Enable this test when uv transport could handle TEST_P(TransportTest, DISABLED_Connection_EmptyBuffer) { constexpr size_t numBytes = 13; std::array garbage; int ioNum = 100; testConnection( [&](std::shared_ptr conn) { std::atomic n(ioNum); for (int i = 0; i < ioNum; i++) { if (i % 2 == 0) { // Empty buffer doRead( conn, nullptr, 0, [&, conn](const Error& error, const void* ptr, size_t len) { ASSERT_FALSE(error) << error.what(); ASSERT_EQ(len, 0); ASSERT_EQ(ptr, nullptr); if (--n == 0) { peers_->done(PeerGroup::kServer); } }); } else { // Garbage buffer doRead( conn, [&, conn]( const Error& error, const void* /* unused */, size_t len) { ASSERT_FALSE(error) << error.what(); ASSERT_EQ(len, garbage.size()); if (--n == 0) { peers_->done(PeerGroup::kServer); } }); } } peers_->join(PeerGroup::kServer); }, [&](std::shared_ptr conn) { std::atomic n(ioNum); for (int i = 0; i < ioNum; i++) { if ((i & 1) == 0) { // Empty buffer doWrite(conn, nullptr, 0, [&, conn](const Error& error) { ASSERT_FALSE(error) << error.what(); if (--n == 0) { peers_->done(PeerGroup::kClient); } }); } else { // Garbage buffer doWrite( conn, garbage.data(), garbage.size(), [&, conn](const Error& error) { ASSERT_FALSE(error) << error.what(); if (--n == 0) { peers_->done(PeerGroup::kClient); } }); } } peers_->join(PeerGroup::kClient); }); } ================================================ FILE: tensorpipe/test/transport/context_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include using namespace tensorpipe; using namespace tensorpipe::transport; TEST_P(TransportTest, Context_Basics) { auto context = GetParam()->getContext(); auto addr = GetParam()->defaultAddr(); { std::mutex mutex; std::condition_variable cv; std::vector> connections; // Listener runs callback for every new connection. auto listener = context->listen(addr); listener->accept( [&](const Error& error, std::shared_ptr connection) { ASSERT_FALSE(error) << error.what(); std::lock_guard lock(mutex); connections.push_back(std::move(connection)); cv.notify_one(); }); // Connect to listener. auto conn = context->connect(listener->addr()); // Wait for new connection { std::unique_lock lock(mutex); while (connections.empty()) { cv.wait(lock); } } } context->join(); } TEST_P(TransportTest, Context_DomainDescriptor) { auto context = GetParam()->getContext(); { const auto& domainDescriptor = context->domainDescriptor(); EXPECT_FALSE(domainDescriptor.empty()); } context->join(); } ================================================ FILE: tensorpipe/test/transport/ibv/connection_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include using namespace tensorpipe; using namespace tensorpipe::transport; namespace { class IbvTransportTest : public TransportTest {}; IbvTransportTestHelper helper; // This value is defined in tensorpipe/transport/ibv/connection.h static constexpr auto kBufferSize = 2 * 1024 * 1024; } // namespace TEST_P(IbvTransportTest, Chunking) { // This is larger than the default ring buffer size. 
const int kMsgSize = 5 * kBufferSize; std::string srcBuf(kMsgSize, 0x42); auto dstBuf = std::make_unique(kMsgSize); testConnection( [&](std::shared_ptr conn) { doRead( conn, dstBuf.get(), kMsgSize, [&, conn](const Error& error, const void* ptr, size_t len) { ASSERT_FALSE(error) << error.what(); ASSERT_EQ(len, kMsgSize); ASSERT_EQ(ptr, dstBuf.get()); for (int i = 0; i < kMsgSize; ++i) { ASSERT_EQ(dstBuf[i], srcBuf[i]); } peers_->done(PeerGroup::kServer); }); peers_->join(PeerGroup::kServer); }, [&](std::shared_ptr conn) { doWrite( conn, srcBuf.c_str(), srcBuf.length(), [&, conn](const Error& error) { ASSERT_FALSE(error) << error.what(); peers_->done(PeerGroup::kClient); }); peers_->join(PeerGroup::kClient); }); } TEST_P(IbvTransportTest, ChunkingImplicitRead) { // This is larger than the default ring buffer size. const size_t kMsgSize = 5 * kBufferSize; std::string msg(kMsgSize, 0x42); testConnection( [&](std::shared_ptr conn) { doRead( conn, [&, conn](const Error& error, const void* ptr, size_t len) { ASSERT_FALSE(error) << error.what(); ASSERT_EQ(len, kMsgSize); for (int i = 0; i < kMsgSize; ++i) { ASSERT_EQ(static_cast(ptr)[i], msg[i]); } peers_->done(PeerGroup::kServer); }); peers_->join(PeerGroup::kServer); }, [&](std::shared_ptr conn) { doWrite(conn, msg.c_str(), msg.length(), [&, conn](const Error& error) { ASSERT_FALSE(error) << error.what(); peers_->done(PeerGroup::kClient); }); peers_->join(PeerGroup::kClient); }); } TEST_P(IbvTransportTest, QueueWrites) { // This is large enough that two of those will not fit in the ring buffer at // the same time. 
constexpr int numMsg = 2; constexpr size_t numBytes = (3 * kBufferSize) / 4; const std::string kReady = "ready"; std::array garbage; testConnection( [&](std::shared_ptr conn) { // Wait for peer to queue up writes before attempting to read EXPECT_EQ(kReady, peers_->recv(PeerGroup::kServer)); for (int i = 0; i < numMsg; ++i) { doRead( conn, [&, conn, i](const Error& error, const void* ptr, size_t len) { ASSERT_FALSE(error) << error.what(); ASSERT_EQ(len, numBytes); if (i == numMsg - 1) { peers_->done(PeerGroup::kServer); } }); } peers_->join(PeerGroup::kServer); }, [&](std::shared_ptr conn) { for (int i = 0; i < numMsg; ++i) { doWrite( conn, garbage.data(), garbage.size(), [&, conn, i](const Error& error) { ASSERT_FALSE(error) << error.what(); if (i == numMsg - 1) { peers_->done(PeerGroup::kClient); } }); } peers_->send(PeerGroup::kServer, kReady); peers_->join(PeerGroup::kClient); }); } namespace { struct MyNopType { std::string myStringField; NOP_STRUCTURE(MyNopType, myStringField); }; } // namespace TEST_P(IbvTransportTest, NopWriteWrapAround) { constexpr int numMsg = 2; constexpr size_t kSize = (3 * kBufferSize) / 4; testConnection( [&](std::shared_ptr conn) { for (int i = 0; i < numMsg; ++i) { auto holder = std::make_shared>(); conn->read(*holder, [&, conn, holder, i](const Error& error) { ASSERT_FALSE(error) << error.what(); ASSERT_EQ(holder->getObject().myStringField.length(), kSize); if (i == numMsg - 1) { peers_->done(PeerGroup::kServer); } }); } peers_->join(PeerGroup::kServer); }, [&](std::shared_ptr conn) { for (int i = 0; i < numMsg; ++i) { auto holder = std::make_shared>(); holder->getObject().myStringField = std::string(kSize, 'B'); conn->write(*holder, [&, conn, holder, i](const Error& error) { ASSERT_FALSE(error) << error.what(); if (i == numMsg - 1) { peers_->done(PeerGroup::kClient); } }); } peers_->join(PeerGroup::kClient); }); } INSTANTIATE_TEST_CASE_P(Ibv, IbvTransportTest, ::testing::Values(&helper)); 
================================================ FILE: tensorpipe/test/transport/ibv/context_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include namespace { class IbvTransportContextTest : public TransportTest {}; IbvTransportTestHelper helper; } // namespace using namespace tensorpipe; // Linux-only because OSX machines on CircleCI cannot resolve their hostname #ifdef __linux__ TEST_P(IbvTransportContextTest, LookupHostnameAddress) { Error error; std::string addr; std::tie(error, addr) = transport::ibv::lookupAddrForHostname(); EXPECT_FALSE(error) << error.what(); EXPECT_NE(addr, ""); } #endif // Interface name conventions change based on platform. Linux uses "lo", OSX // uses lo0, Windows uses integers. #ifdef __linux__ #define LOOPBACK_INTERFACE "lo" #elif __APPLE__ #define LOOPBACK_INTERFACE "lo0" #endif #ifdef LOOPBACK_INTERFACE TEST_P(IbvTransportContextTest, LookupInterfaceAddress) { Error error; std::string addr; std::tie(error, addr) = transport::ibv::lookupAddrForIface(LOOPBACK_INTERFACE); EXPECT_FALSE(error) << error.what(); EXPECT_NE(addr, ""); } #endif INSTANTIATE_TEST_CASE_P( Ibv, IbvTransportContextTest, ::testing::Values(&helper)); ================================================ FILE: tensorpipe/test/transport/ibv/ibv_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include namespace { IbvTransportTestHelper helper; } // namespace INSTANTIATE_TEST_CASE_P(Ibv, TransportTest, ::testing::Values(&helper)); ================================================ FILE: tensorpipe/test/transport/ibv/ibv_test.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include class IbvTransportTestHelper : public TransportTestHelper { protected: std::shared_ptr getContextInternal() override { return tensorpipe::transport::ibv::create(); } public: std::string defaultAddr() override { return "127.0.0.1"; } }; ================================================ FILE: tensorpipe/test/transport/ibv/sockaddr_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include using namespace tensorpipe::transport; namespace { int family(const ibv::Sockaddr& addr) { auto sockaddr = addr.addr(); return sockaddr->sa_family; } int port(const ibv::Sockaddr& addr) { auto sockaddr = addr.addr(); if (sockaddr->sa_family == AF_INET) { auto in = reinterpret_cast(sockaddr); return in->sin_port; } if (sockaddr->sa_family == AF_INET6) { auto in6 = reinterpret_cast(sockaddr); return in6->sin6_port; } return -1; } } // namespace TEST(IbvSockaddr, InetBadPort) { ASSERT_THROW( ibv::Sockaddr::createInetSockAddr("1.2.3.4:-1"), std::invalid_argument); ASSERT_THROW( ibv::Sockaddr::createInetSockAddr("1.2.3.4:65536"), std::invalid_argument); } TEST(IbvSockaddr, Inet) { { auto sa = ibv::Sockaddr::createInetSockAddr("1.2.3.4:5"); ASSERT_EQ(family(sa), AF_INET); ASSERT_EQ(port(sa), ntohs(5)); ASSERT_EQ(sa.str(), "1.2.3.4:5"); } { auto sa = ibv::Sockaddr::createInetSockAddr("1.2.3.4:0"); ASSERT_EQ(family(sa), AF_INET); ASSERT_EQ(port(sa), 0); ASSERT_EQ(sa.str(), "1.2.3.4:0"); } { auto sa = ibv::Sockaddr::createInetSockAddr("1.2.3.4"); ASSERT_EQ(family(sa), AF_INET); ASSERT_EQ(port(sa), 0); ASSERT_EQ(sa.str(), "1.2.3.4:0"); } } TEST(IbvSockaddr, Inet6BadPort) { ASSERT_THROW( ibv::Sockaddr::createInetSockAddr("[::1]:-1"), std::invalid_argument); ASSERT_THROW( ibv::Sockaddr::createInetSockAddr("[::1]:65536"), std::invalid_argument); ASSERT_THROW( ibv::Sockaddr::createInetSockAddr("]::1["), std::invalid_argument); } // Interface name conventions change based on platform. Linux uses "lo", OSX // uses lo0, Windows uses integers. 
#ifdef __linux__ #define LOOPBACK_INTERFACE "lo" #elif __APPLE__ #define LOOPBACK_INTERFACE "lo0" #endif TEST(IbvSockaddr, Inet6) { { auto sa = ibv::Sockaddr::createInetSockAddr("[::1]:5"); ASSERT_EQ(family(sa), AF_INET6); ASSERT_EQ(port(sa), ntohs(5)); ASSERT_EQ(sa.str(), "[::1]:5"); } { auto sa = ibv::Sockaddr::createInetSockAddr("[::1]:0"); ASSERT_EQ(family(sa), AF_INET6); ASSERT_EQ(port(sa), 0); ASSERT_EQ(sa.str(), "[::1]:0"); } { auto sa = ibv::Sockaddr::createInetSockAddr("::1"); ASSERT_EQ(family(sa), AF_INET6); ASSERT_EQ(port(sa), 0); ASSERT_EQ(sa.str(), "[::1]:0"); } #ifdef LOOPBACK_INTERFACE { auto sa = ibv::Sockaddr::createInetSockAddr("::1%" LOOPBACK_INTERFACE); ASSERT_EQ(family(sa), AF_INET6); ASSERT_EQ(port(sa), 0); ASSERT_EQ(sa.str(), "[::1%" LOOPBACK_INTERFACE "]:0"); } { sockaddr_in6 sa; std::memset(&sa, 0, sizeof(sa)); sa.sin6_family = AF_INET6; sa.sin6_port = ntohs(42); sa.sin6_flowinfo = 0; sa.sin6_addr.s6_addr[15] = 1; // Implicitly assuming that the loopback interface is the first one. sa.sin6_scope_id = 1; ibv::Sockaddr tpSa(reinterpret_cast(&sa), sizeof(sa)); ASSERT_EQ(tpSa.str(), "[::1%" LOOPBACK_INTERFACE "]:42"); } #endif } ================================================ FILE: tensorpipe/test/transport/listener_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include using namespace tensorpipe; using namespace tensorpipe::transport; TEST_P(TransportTest, Listener_Basics) { auto context = GetParam()->getContext(); auto addr = GetParam()->defaultAddr(); { std::mutex mutex; std::condition_variable cv; std::vector> connections; // Listener runs callback for every new connection. 
auto listener = context->listen(addr); listener->accept( [&](const Error& error, std::shared_ptr connection) { ASSERT_FALSE(error) << error.what(); std::lock_guard lock(mutex); connections.push_back(std::move(connection)); cv.notify_one(); }); // Connect to listener. auto connection = context->connect(listener->addr()); // Wait for new connection { std::unique_lock lock(mutex); while (connections.empty()) { cv.wait(lock); } } } context->join(); } TEST_P(TransportTest, Listener_AcceptCallbacksAreQueued) { auto context = GetParam()->getContext(); auto addr = GetParam()->defaultAddr(); { auto listener = context->listen(addr); int numAccepts = 0; std::promise donePromise; for (int i = 0; i < 10; ++i) { listener->accept( [&, i](const Error& error, std::shared_ptr /*unused*/) { if (error) { donePromise.set_exception( std::make_exception_ptr(std::runtime_error(error.what()))); } else { EXPECT_EQ(i, numAccepts); numAccepts++; if (numAccepts == 10) { donePromise.set_value(); } } }); } // Avoid connections to be destroyed before being established. std::vector> conns; for (int i = 0; i < 10; ++i) { auto c = context->connect(listener->addr()); conns.push_back(std::move(c)); } donePromise.get_future().get(); } context->join(); } TEST_P(TransportTest, Listener_IncomingConnectionsAreQueued) { auto context = GetParam()->getContext(); auto addr = GetParam()->defaultAddr(); { auto listener = context->listen(addr); int numAccepts = 0; std::promise donePromise; // Avoid connections to be destroyed before being established. 
std::vector> conns; for (int i = 0; i < 10; ++i) { auto c = context->connect(listener->addr()); conns.push_back(std::move(c)); } for (int i = 0; i < 10; ++i) { listener->accept( [&, i](const Error& error, std::shared_ptr /*unused*/) { if (error) { donePromise.set_exception( std::make_exception_ptr(std::runtime_error(error.what()))); } else { EXPECT_EQ(i, numAccepts); numAccepts++; if (numAccepts == 10) { donePromise.set_value(); } } }); } donePromise.get_future().get(); } context->join(); } TEST_P(TransportTest, Listener_CreateThenCloseAndThenGetAddress) { auto context = GetParam()->getContext(); auto listener = context->listen(GetParam()->defaultAddr()); listener->close(); auto addr = listener->addr(); std::promise acceptPromise; listener->accept( [&](const Error& error, std::shared_ptr /*unused*/) { if (error) { acceptPromise.set_exception( std::make_exception_ptr(std::runtime_error(error.what()))); } else { acceptPromise.set_value(); } }); auto connection = context->connect(addr); std::promise writePromise; connection->write(nullptr, 0, [&](const Error& error) { if (error) { writePromise.set_exception( std::make_exception_ptr(std::runtime_error(error.what()))); } else { writePromise.set_value(); } }); try { acceptPromise.get_future().get(); } catch (const std::runtime_error&) { // Expected } try { writePromise.get_future().get(); } catch (const std::runtime_error&) { // Expected } context->join(); } TEST_P(TransportTest, Listener_CreateAfterClosingContextAndThenGetAddress) { auto context = GetParam()->getContext(); // This means the listener will be created in an already-closed state. 
context->close(); auto listener = context->listen(GetParam()->defaultAddr()); auto addr = listener->addr(); std::promise acceptPromise; listener->accept( [&](const Error& error, std::shared_ptr /*unused*/) { if (error) { acceptPromise.set_exception( std::make_exception_ptr(std::runtime_error(error.what()))); } else { acceptPromise.set_value(); } }); auto connection = context->connect(addr); std::promise writePromise; connection->write(nullptr, 0, [&](const Error& error) { if (error) { writePromise.set_exception( std::make_exception_ptr(std::runtime_error(error.what()))); } else { writePromise.set_value(); } }); try { acceptPromise.get_future().get(); } catch (const std::runtime_error&) { // Expected } try { writePromise.get_future().get(); } catch (const std::runtime_error&) { // Expected } context->join(); } ================================================ FILE: tensorpipe/test/transport/shm/connection_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include using namespace tensorpipe; using namespace tensorpipe::transport; namespace { class ShmTransportTest : public TransportTest {}; SHMTransportTestHelper helper; // This value is defined in tensorpipe/transport/shm/connection.h static constexpr auto kBufferSize = 2 * 1024 * 1024; } // namespace TEST_P(ShmTransportTest, Chunking) { // This is larger than the default ring buffer size. 
const int kMsgSize = 5 * kBufferSize; std::string srcBuf(kMsgSize, 0x42); auto dstBuf = std::make_unique(kMsgSize); testConnection( [&](std::shared_ptr conn) { doRead( conn, dstBuf.get(), kMsgSize, [&, conn](const Error& error, const void* ptr, size_t len) { ASSERT_FALSE(error) << error.what(); ASSERT_EQ(len, kMsgSize); ASSERT_EQ(ptr, dstBuf.get()); for (int i = 0; i < kMsgSize; ++i) { ASSERT_EQ(dstBuf[i], srcBuf[i]); } peers_->done(PeerGroup::kServer); }); peers_->join(PeerGroup::kServer); }, [&](std::shared_ptr conn) { doWrite( conn, srcBuf.c_str(), srcBuf.length(), [&, conn](const Error& error) { ASSERT_FALSE(error) << error.what(); peers_->done(PeerGroup::kClient); }); peers_->join(PeerGroup::kClient); }); } TEST_P(ShmTransportTest, ChunkingImplicitRead) { // This is larger than the default ring buffer size. const size_t kMsgSize = 5 * kBufferSize; std::string msg(kMsgSize, 0x42); testConnection( [&](std::shared_ptr conn) { doRead( conn, [&, conn](const Error& error, const void* ptr, size_t len) { ASSERT_FALSE(error) << error.what(); ASSERT_EQ(len, kMsgSize); for (int i = 0; i < kMsgSize; ++i) { ASSERT_EQ(static_cast(ptr)[i], msg[i]); } peers_->done(PeerGroup::kServer); }); peers_->join(PeerGroup::kServer); }, [&](std::shared_ptr conn) { doWrite(conn, msg.c_str(), msg.length(), [&, conn](const Error& error) { ASSERT_FALSE(error) << error.what(); peers_->done(PeerGroup::kClient); }); peers_->join(PeerGroup::kClient); }); } TEST_P(ShmTransportTest, QueueWrites) { // This is large enough that two of those will not fit in the ring buffer at // the same time. 
constexpr int numMsg = 2; constexpr size_t numBytes = (3 * kBufferSize) / 4; const std::string kReady = "ready"; std::array garbage; testConnection( [&](std::shared_ptr conn) { // Wait for peer to queue up writes before attempting to read EXPECT_EQ(kReady, peers_->recv(PeerGroup::kServer)); for (int i = 0; i < numMsg; ++i) { doRead( conn, [&, conn, i](const Error& error, const void* ptr, size_t len) { ASSERT_FALSE(error) << error.what(); ASSERT_EQ(len, numBytes); if (i == numMsg - 1) { peers_->done(PeerGroup::kServer); } }); } peers_->join(PeerGroup::kServer); }, [&](std::shared_ptr conn) { for (int i = 0; i < numMsg; ++i) { doWrite( conn, garbage.data(), garbage.size(), [&, conn, i](const Error& error) { ASSERT_FALSE(error) << error.what(); if (i == numMsg - 1) { peers_->done(PeerGroup::kClient); } }); } peers_->send(PeerGroup::kServer, kReady); peers_->join(PeerGroup::kClient); }); } namespace { struct MyNopType { std::string myStringField; NOP_STRUCTURE(MyNopType, myStringField); }; } // namespace TEST_P(ShmTransportTest, NopWriteWrapAround) { constexpr int numMsg = 2; constexpr size_t kSize = (3 * kBufferSize) / 4; testConnection( [&](std::shared_ptr conn) { for (int i = 0; i < numMsg; ++i) { auto holder = std::make_shared>(); conn->read(*holder, [&, conn, holder, i](const Error& error) { ASSERT_FALSE(error) << error.what(); ASSERT_EQ(holder->getObject().myStringField.length(), kSize); if (i == numMsg - 1) { peers_->done(PeerGroup::kServer); } }); } peers_->join(PeerGroup::kServer); }, [&](std::shared_ptr conn) { for (int i = 0; i < numMsg; ++i) { auto holder = std::make_shared>(); holder->getObject().myStringField = std::string(kSize, 'B'); conn->write(*holder, [&, conn, holder, i](const Error& error) { ASSERT_FALSE(error) << error.what(); if (i == numMsg - 1) { peers_->done(PeerGroup::kClient); } }); } peers_->join(PeerGroup::kClient); }); } INSTANTIATE_TEST_CASE_P(Shm, ShmTransportTest, ::testing::Values(&helper)); 
================================================ FILE: tensorpipe/test/transport/shm/listener_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include #include #include using namespace tensorpipe; using namespace tensorpipe::transport; namespace { class ShmListenerTest : public TransportTest {}; SHMTransportTestHelper helper; std::string generateUniqueAddr() { const ::testing::TestInfo* const testInfo = ::testing::UnitTest::GetInstance()->current_test_info(); std::ostringstream ss; ss << "tensorpipe_test_" << testInfo->test_suite_name() << "." << testInfo->name() << "_" << ::getpid(); return ss.str(); } } // namespace TEST_P(ShmListenerTest, ExplicitAbstractSocketName) { std::string expectedAddr = generateUniqueAddr(); std::shared_ptr ctx = GetParam()->getContext(); std::shared_ptr listener = ctx->listen(expectedAddr); std::string actualAddr = listener->addr(); ASSERT_EQ(actualAddr, expectedAddr); std::shared_ptr outgoingConnection = ctx->connect(actualAddr); std::promise prom; listener->accept( [&](const Error& error, std::shared_ptr /* unused */) { EXPECT_FALSE(error) << error.what(); prom.set_value(); }); std::future_status res = prom.get_future().wait_for(std::chrono::seconds(1)); ASSERT_NE(res, std::future_status::timeout); } TEST_P(ShmListenerTest, AutobindAbstractSocketName) { std::shared_ptr ctx = GetParam()->getContext(); std::shared_ptr listener = ctx->listen(""); std::string addr = listener->addr(); ASSERT_NE(addr, ""); // Since Linux 2.3.15 (Aug 1999) the address is in this format, see unix(7). 
ASSERT_THAT(addr, ::testing::MatchesRegex("[0-9a-f]{5}")); std::shared_ptr outgoingConnection = ctx->connect(addr); std::promise prom; listener->accept( [&](const Error& error, std::shared_ptr /* unused */) { EXPECT_FALSE(error) << error.what(); prom.set_value(); }); std::future_status res = prom.get_future().wait_for(std::chrono::seconds(1)); ASSERT_NE(res, std::future_status::timeout); } INSTANTIATE_TEST_CASE_P(Shm, ShmListenerTest, ::testing::Values(&helper)); ================================================ FILE: tensorpipe/test/transport/shm/reactor_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include #include #include using namespace tensorpipe; using namespace tensorpipe::transport::shm; namespace { void run(std::function fn1, std::function fn2) { int fds[2]; { auto rv = socketpair(AF_UNIX, SOCK_STREAM, 0, fds); if (rv != 0) { TP_THROW_SYSTEM(errno) << "Failed to create socket pair"; } } { auto pid = fork(); TP_DCHECK_GE(pid, 0); if (pid == 0) { close(fds[0]); fn2(fds[1]); close(fds[1]); exit(0); } } close(fds[1]); fn1(fds[0]); close(fds[0]); wait(nullptr); } } // namespace TEST(ShmReactor, Basic) { run( [](int fd) { tensorpipe::Queue queue; auto reactor = std::make_shared(); auto token1 = reactor->add([&] { queue.push(1); }); auto token2 = reactor->add([&] { queue.push(2); }); // Share reactor fds and token with other process. { auto socket = Socket(fd); auto fds = reactor->fds(); auto error = socket.sendPayloadAndFds( token1, token2, std::get<0>(fds), std::get<1>(fds)); ASSERT_FALSE(error) << error.what(); } // Wait for other process to run trigger. 
ASSERT_EQ(queue.pop(), 1); ASSERT_EQ(queue.pop(), 2); reactor->remove(token1); reactor->remove(token2); }, [](int fd) { Reactor::TToken token1; Reactor::TToken token2; Fd header; Fd data; // Wait for other process to share reactor fds and token. { auto socket = Socket(fd); auto error = socket.recvPayloadAndFds(token1, token2, header, data); ASSERT_FALSE(error) << error.what(); } // Create and run trigger. This should wake up the other // process and run the registered function. Reactor::Trigger trigger(std::move(header), std::move(data)); trigger.run(token1); trigger.run(token2); }); } TEST(ShmReactor, TokenReuse) { tensorpipe::Queue queue(3); auto reactor = std::make_shared(); auto t1 = reactor->add([&] { queue.push(1); }); auto t2 = reactor->add([&] { queue.push(2); }); auto t3 = reactor->add([&] { queue.push(3); }); // Check that they're monotonically increasing. ASSERT_GT(t2, t1); ASSERT_GT(t3, t2); // Remove token and check that it is reused. reactor->remove(t1); auto t4 = reactor->add([&] { queue.push(4); }); ASSERT_EQ(t4, t1); // Remove multiple tokens and check that they're reused in order. reactor->remove(t2); reactor->remove(t3); auto t5 = reactor->add([&] { queue.push(5); }); auto t6 = reactor->add([&] { queue.push(6); }); ASSERT_EQ(t5, t2); ASSERT_EQ(t6, t3); reactor->remove(t4); reactor->remove(t5); reactor->remove(t6); } ================================================ FILE: tensorpipe/test/transport/shm/shm_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include namespace { SHMTransportTestHelper helper; } // namespace INSTANTIATE_TEST_CASE_P(Shm, TransportTest, ::testing::Values(&helper)); ================================================ FILE: tensorpipe/test/transport/shm/shm_test.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include class SHMTransportTestHelper : public TransportTestHelper { protected: std::shared_ptr getContextInternal() override { return tensorpipe::transport::shm::create(); } public: std::string defaultAddr() override { return ""; } }; ================================================ FILE: tensorpipe/test/transport/shm/sockaddr_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include using namespace tensorpipe::transport; TEST(ShmSockaddr, FromToString) { auto addr = shm::Sockaddr::createAbstractUnixAddr("foo"); ASSERT_EQ(addr.str(), std::string("foo")); } ================================================ FILE: tensorpipe/test/transport/transport_test.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include #include #include #include #include #include class TransportTestHelper { public: std::shared_ptr getContext( bool skipViabilityCheck = false) { std::shared_ptr ctx = getContextInternal(); if (!skipViabilityCheck) { EXPECT_TRUE(ctx->isViable()); } return ctx; } virtual std::string defaultAddr() = 0; virtual std::unique_ptr makePeerGroup() { return std::make_unique(); } virtual ~TransportTestHelper() = default; protected: virtual std::shared_ptr getContextInternal() = 0; }; class TransportTest : public ::testing::TestWithParam { protected: std::unique_ptr peers_; public: TransportTest() : peers_(GetParam()->makePeerGroup()) {} void testConnection( std::function)> listeningFn, std::function)> connectingFn) { using namespace tensorpipe::transport; peers_->spawn( [&] { auto ctx = GetParam()->getContext(); ctx->setId("server"); auto addr = GetParam()->defaultAddr(); auto listener = ctx->listen(addr); std::promise> connectionProm; listener->accept([&](const tensorpipe::Error& error, std::shared_ptr conn) { ASSERT_FALSE(error) << error.what(); connectionProm.set_value(std::move(conn)); }); peers_->send(PeerGroup::kClient, listener->addr()); listeningFn(connectionProm.get_future().get()); ctx->join(); }, [&] { auto ctx = GetParam()->getContext(); ctx->setId("client"); auto listenerAddr = peers_->recv(PeerGroup::kClient); connectingFn(ctx->connect(listenerAddr)); ctx->join(); }); } // Add to a closure to check the callback is called before being destroyed class Bomb { public: Bomb() = default; Bomb(const Bomb&) = delete; Bomb(Bomb&& b) { defused_ = b.defused_; b.defused_ = false; } Bomb& operator=(const Bomb&) = delete; Bomb& operator=(Bomb&&) = delete; void defuse() { defused_ = true; } ~Bomb() { EXPECT_TRUE(defused_); } private: bool defused_ = false; }; std::shared_ptr armBomb() { return std::make_shared(); } void doRead( std::shared_ptr conn, tensorpipe::transport::Connection::read_callback_fn fn) { auto mutex = std::make_shared(); 
std::lock_guard outerLock(*mutex); // We acquire the same mutex while calling read and inside its callback so // that we deadlock if the callback is invoked inline. conn->read( [fn{std::move(fn)}, mutex, bomb{armBomb()}]( const tensorpipe::Error& error, const void* ptr, size_t len) { std::lock_guard innerLock(*mutex); bomb->defuse(); fn(error, ptr, len); }); } void doRead( std::shared_ptr conn, void* ptr, size_t length, tensorpipe::transport::Connection::read_callback_fn fn) { auto mutex = std::make_shared(); std::lock_guard outerLock(*mutex); // We acquire the same mutex while calling read and inside its callback so // that we deadlock if the callback is invoked inline. conn->read( ptr, length, [fn{std::move(fn)}, mutex, bomb{armBomb()}]( const tensorpipe::Error& error, const void* ptr, size_t len) { std::lock_guard innerLock(*mutex); bomb->defuse(); fn(error, ptr, len); }); } void doWrite( std::shared_ptr conn, const void* ptr, size_t length, tensorpipe::transport::Connection::write_callback_fn fn) { auto mutex = std::make_shared(); // We acquire the same mutex while calling write and inside its callback // so that we deadlock if the callback is invoked inline. std::lock_guard outerLock(*mutex); conn->write( ptr, length, [fn{std::move(fn)}, mutex, bomb{armBomb()}]( const tensorpipe::Error& error) { std::lock_guard innerLock(*mutex); bomb->defuse(); fn(error); }); } }; ================================================ FILE: tensorpipe/test/transport/uv/connection_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include namespace { class UVTransportConnectionTest : public TransportTest {}; UVTransportTestHelper helper; } // namespace using namespace tensorpipe; using namespace tensorpipe::transport; TEST_P(UVTransportConnectionTest, LargeWrite) { constexpr int kMsgSize = 16 * 1024 * 1024; std::string msg(kMsgSize, 0x42); testConnection( [&](std::shared_ptr conn) { doWrite(conn, msg.c_str(), msg.length(), [&, conn](const Error& error) { ASSERT_FALSE(error) << error.what(); peers_->done(PeerGroup::kServer); }); peers_->join(PeerGroup::kServer); }, [&](std::shared_ptr conn) { doRead( conn, [&, conn](const Error& error, const void* data, size_t len) { ASSERT_FALSE(error) << error.what(); ASSERT_EQ(len, msg.length()); const char* cdata = (const char*)data; for (int i = 0; i < len; ++i) { const char c = cdata[i]; ASSERT_EQ(c, msg[i]) << "Wrong value at position " << i << " of " << msg.length(); } peers_->done(PeerGroup::kClient); }); peers_->join(PeerGroup::kClient); }); } INSTANTIATE_TEST_CASE_P( Uv, UVTransportConnectionTest, ::testing::Values(&helper)); ================================================ FILE: tensorpipe/test/transport/uv/context_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include namespace { class UVTransportContextTest : public TransportTest {}; UVTransportTestHelper helper; } // namespace using namespace tensorpipe; // Linux-only because OSX machines on CircleCI cannot resolve their hostname #ifdef __linux__ TEST_P(UVTransportContextTest, LookupHostnameAddress) { Error error; std::string addr; std::tie(error, addr) = transport::uv::lookupAddrForHostname(); EXPECT_FALSE(error) << error.what(); EXPECT_NE(addr, ""); } #endif // Interface name conventions change based on platform. 
Linux uses "lo", OSX // uses lo0, Windows uses integers. #ifdef __linux__ #define LOOPBACK_INTERFACE "lo" #elif __APPLE__ #define LOOPBACK_INTERFACE "lo0" #endif #ifdef LOOPBACK_INTERFACE TEST_P(UVTransportContextTest, LookupInterfaceAddress) { Error error; std::string addr; std::tie(error, addr) = transport::uv::lookupAddrForIface(LOOPBACK_INTERFACE); EXPECT_FALSE(error) << error.what(); EXPECT_NE(addr, ""); } #endif TEST_P(UVTransportContextTest, LookupAddressLikeNccl) { Error error; std::string addr; std::tie(error, addr) = transport::uv::lookupAddrLikeNccl(); EXPECT_FALSE(error) << error.what(); EXPECT_NE(addr, ""); } INSTANTIATE_TEST_CASE_P(Uv, UVTransportContextTest, ::testing::Values(&helper)); ================================================ FILE: tensorpipe/test/transport/uv/loop_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include using namespace tensorpipe::transport::uv; namespace test { namespace transport { namespace uv { TEST(UvLoop, Defer) { Loop loop; { // Defer function on event loop thread. std::promise prom; loop.deferToLoop([&] { prom.set_value(std::this_thread::get_id()); }); ASSERT_NE(std::this_thread::get_id(), prom.get_future().get()); } loop.join(); } } // namespace uv } // namespace transport } // namespace test ================================================ FILE: tensorpipe/test/transport/uv/sockaddr_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include using namespace tensorpipe::transport; namespace { int family(const uv::Sockaddr& addr) { auto sockaddr = addr.addr(); return sockaddr->sa_family; } int port(const uv::Sockaddr& addr) { auto sockaddr = addr.addr(); if (sockaddr->sa_family == AF_INET) { auto in = reinterpret_cast(sockaddr); return in->sin_port; } if (sockaddr->sa_family == AF_INET6) { auto in6 = reinterpret_cast(sockaddr); return in6->sin6_port; } return -1; } } // namespace TEST(UvSockaddr, InetBadPort) { ASSERT_THROW( uv::Sockaddr::createInetSockAddr("1.2.3.4:-1"), std::invalid_argument); ASSERT_THROW( uv::Sockaddr::createInetSockAddr("1.2.3.4:65536"), std::invalid_argument); } TEST(UvSockaddr, Inet) { { auto sa = uv::Sockaddr::createInetSockAddr("1.2.3.4:5"); ASSERT_EQ(family(sa), AF_INET); ASSERT_EQ(port(sa), ntohs(5)); ASSERT_EQ(sa.str(), "1.2.3.4:5"); } { auto sa = uv::Sockaddr::createInetSockAddr("1.2.3.4:0"); ASSERT_EQ(family(sa), AF_INET); ASSERT_EQ(port(sa), 0); ASSERT_EQ(sa.str(), "1.2.3.4:0"); } { auto sa = uv::Sockaddr::createInetSockAddr("1.2.3.4"); ASSERT_EQ(family(sa), AF_INET); ASSERT_EQ(port(sa), 0); ASSERT_EQ(sa.str(), "1.2.3.4:0"); } } TEST(UvSockaddr, Inet6BadPort) { ASSERT_THROW( uv::Sockaddr::createInetSockAddr("[::1]:-1"), std::invalid_argument); ASSERT_THROW( uv::Sockaddr::createInetSockAddr("[::1]:65536"), std::invalid_argument); ASSERT_THROW( uv::Sockaddr::createInetSockAddr("]::1["), std::invalid_argument); } // Interface name conventions change based on platform. Linux uses "lo", OSX // uses lo0, Windows uses integers. 
#ifdef __linux__ #define LOOPBACK_INTERFACE "lo" #elif __APPLE__ #define LOOPBACK_INTERFACE "lo0" #endif TEST(UvSockaddr, Inet6) { { auto sa = uv::Sockaddr::createInetSockAddr("[::1]:5"); ASSERT_EQ(family(sa), AF_INET6); ASSERT_EQ(port(sa), ntohs(5)); ASSERT_EQ(sa.str(), "[::1]:5"); } { auto sa = uv::Sockaddr::createInetSockAddr("[::1]:0"); ASSERT_EQ(family(sa), AF_INET6); ASSERT_EQ(port(sa), 0); ASSERT_EQ(sa.str(), "[::1]:0"); } { auto sa = uv::Sockaddr::createInetSockAddr("::1"); ASSERT_EQ(family(sa), AF_INET6); ASSERT_EQ(port(sa), 0); ASSERT_EQ(sa.str(), "[::1]:0"); } #ifdef LOOPBACK_INTERFACE { auto sa = uv::Sockaddr::createInetSockAddr("::1%" LOOPBACK_INTERFACE); ASSERT_EQ(family(sa), AF_INET6); ASSERT_EQ(port(sa), 0); ASSERT_EQ(sa.str(), "[::1%" LOOPBACK_INTERFACE "]:0"); } { sockaddr_in6 sa; std::memset(&sa, 0, sizeof(sa)); sa.sin6_family = AF_INET6; sa.sin6_port = ntohs(42); sa.sin6_flowinfo = 0; sa.sin6_addr.s6_addr[15] = 1; // Implicitly assuming that the loopback interface is the first one. sa.sin6_scope_id = 1; uv::Sockaddr tpSa(reinterpret_cast(&sa), sizeof(sa)); ASSERT_EQ(tpSa.str(), "[::1%" LOOPBACK_INTERFACE "]:42"); } #endif } ================================================ FILE: tensorpipe/test/transport/uv/uv_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include namespace { UVTransportTestHelper helper; } // namespace INSTANTIATE_TEST_CASE_P(Uv, TransportTest, ::testing::Values(&helper)); ================================================ FILE: tensorpipe/test/transport/uv/uv_test.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include class UVTransportTestHelper : public TransportTestHelper { protected: std::shared_ptr getContextInternal() override { return tensorpipe::transport::uv::create(); } public: std::string defaultAddr() override { return "127.0.0.1"; } }; ================================================ FILE: tensorpipe/transport/connection.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include namespace tensorpipe { namespace transport { class Connection { public: using read_callback_fn = std::function; virtual void read(read_callback_fn fn) = 0; virtual void read(void* ptr, size_t length, read_callback_fn fn) = 0; using write_callback_fn = std::function; virtual void write(const void* ptr, size_t length, write_callback_fn fn) = 0; // // Helper functions for reading/writing nop objects. // // Read and parse a nop object. // // This function may be overridden by a subclass. // // For example, the shm transport may be able to bypass reading into a // temporary buffer and instead instead read directly from its peer's // ring buffer. This saves an allocation and a memory copy. // using read_nop_callback_fn = std::function; virtual void read(AbstractNopHolder& object, read_nop_callback_fn fn) = 0; // Serialize and write nop object. // // This function may be overridden by a subclass. // // For example, the shm transport may be able to bypass serialization // into a temporary buffer and instead instead serialize directly into // its peer's ring buffer. This saves an allocation and a memory copy. 
// virtual void write(const AbstractNopHolder& object, write_callback_fn fn) = 0; // Tell the connection what its identifier is. // // This is only supposed to be called from the high-level pipe or from // channels. It will only used for logging and debugging purposes. virtual void setId(std::string id) = 0; virtual void close() = 0; virtual ~Connection() = default; }; } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/connection_boilerplate.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include namespace tensorpipe { namespace transport { template class ConnectionBoilerplate : public Connection { public: template ConnectionBoilerplate( typename ConnectionImplBoilerplate::ConstructorToken token, std::shared_ptr context, std::string id, Args... args); explicit ConnectionBoilerplate(std::shared_ptr connection); ConnectionBoilerplate(const ConnectionBoilerplate&) = delete; ConnectionBoilerplate(ConnectionBoilerplate&&) = delete; ConnectionBoilerplate& operator=(const ConnectionBoilerplate&) = delete; ConnectionBoilerplate& operator=(ConnectionBoilerplate&&) = delete; // Queue a read operation. void read(read_callback_fn fn) override; void read(AbstractNopHolder& object, read_nop_callback_fn fn) override; void read(void* ptr, size_t length, read_callback_fn fn) override; // Perform a write operation. void write(const void* ptr, size_t length, write_callback_fn fn) override; void write(const AbstractNopHolder& object, write_callback_fn fn) override; // Tell the connection what its identifier is. void setId(std::string id) override; // Shut down the connection and its resources. 
void close() override; ~ConnectionBoilerplate() override; protected: // Using a shared_ptr allows us to detach the lifetime of the implementation // from the public object's one and perform the destruction asynchronously. const std::shared_ptr impl_; }; template template ConnectionBoilerplate::ConnectionBoilerplate( typename ConnectionImplBoilerplate::ConstructorToken token, std::shared_ptr context, std::string id, Args... args) : impl_(std::make_shared( token, std::move(context), std::move(id), std::forward(args)...)) { static_assert( std::is_base_of, TConn>:: value, ""); impl_->init(); } template ConnectionBoilerplate::ConnectionBoilerplate( std::shared_ptr connection) : impl_(std::move(connection)) { static_assert( std::is_base_of, TConn>:: value, ""); } template void ConnectionBoilerplate::read(read_callback_fn fn) { if (unlikely(!impl_)) { // FIXME In C++-17 perhaps a global static inline variable would be better? static Error error = TP_CREATE_ERROR(ContextNotViableError); fn(error, nullptr, 0); return; } impl_->read(std::move(fn)); } template void ConnectionBoilerplate::read( AbstractNopHolder& object, read_nop_callback_fn fn) { if (unlikely(!impl_)) { // FIXME In C++-17 perhaps a global static inline variable would be better? static Error error = TP_CREATE_ERROR(ContextNotViableError); fn(error); return; } impl_->read(object, std::move(fn)); } template void ConnectionBoilerplate::read( void* ptr, size_t length, read_callback_fn fn) { if (unlikely(!impl_)) { // FIXME In C++-17 perhaps a global static inline variable would be better? static Error error = TP_CREATE_ERROR(ContextNotViableError); fn(error, ptr, length); return; } impl_->read(ptr, length, std::move(fn)); } template void ConnectionBoilerplate::write( const void* ptr, size_t length, write_callback_fn fn) { if (unlikely(!impl_)) { // FIXME In C++-17 perhaps a global static inline variable would be better? 
static Error error = TP_CREATE_ERROR(ContextNotViableError); fn(error); return; } impl_->write(ptr, length, std::move(fn)); } template void ConnectionBoilerplate::write( const AbstractNopHolder& object, write_callback_fn fn) { if (unlikely(!impl_)) { // FIXME In C++-17 perhaps a global static inline variable would be better? static Error error = TP_CREATE_ERROR(ContextNotViableError); fn(error); return; } impl_->write(object, std::move(fn)); } template void ConnectionBoilerplate::setId(std::string id) { if (unlikely(!impl_)) { return; } impl_->setId(std::move(id)); } template void ConnectionBoilerplate::close() { if (unlikely(!impl_)) { return; } impl_->close(); } template ConnectionBoilerplate::~ConnectionBoilerplate() { close(); } } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/connection_impl_boilerplate.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace transport { template class ContextImplBoilerplate; template class ListenerImplBoilerplate; template class ConnectionImplBoilerplate : public std::enable_shared_from_this { public: class ConstructorToken { public: ConstructorToken(const ConstructorToken&) = default; private: explicit ConstructorToken() {} friend ContextImplBoilerplate; friend ListenerImplBoilerplate; }; ConnectionImplBoilerplate( ConstructorToken token, std::shared_ptr context, std::string id); ConnectionImplBoilerplate(const ConnectionImplBoilerplate&) = delete; ConnectionImplBoilerplate(ConnectionImplBoilerplate&&) = delete; ConnectionImplBoilerplate& operator=(const ConnectionImplBoilerplate&) = delete; ConnectionImplBoilerplate& operator=(ConnectionImplBoilerplate&&) = delete; // Initialize member fields that need `shared_from_this`. void init(); // Queue a read operation. using read_callback_fn = Connection::read_callback_fn; using read_nop_callback_fn = Connection::read_nop_callback_fn; void read(read_callback_fn fn); void read(AbstractNopHolder& object, read_nop_callback_fn fn); void read(void* ptr, size_t length, read_callback_fn fn); // Perform a write operation. using write_callback_fn = Connection::write_callback_fn; void write(const void* ptr, size_t length, write_callback_fn fn); void write(const AbstractNopHolder& object, write_callback_fn fn); // Tell the connection what its identifier is. void setId(std::string id); // Shut down the connection and its resources. 
void close(); virtual ~ConnectionImplBoilerplate() = default; protected: virtual void initImplFromLoop() = 0; virtual void readImplFromLoop(read_callback_fn fn) = 0; virtual void readImplFromLoop( AbstractNopHolder& object, read_nop_callback_fn fn); virtual void readImplFromLoop( void* ptr, size_t length, read_callback_fn fn) = 0; virtual void writeImplFromLoop( const void* ptr, size_t length, write_callback_fn fn) = 0; virtual void writeImplFromLoop( const AbstractNopHolder& object, write_callback_fn fn); virtual void handleErrorImpl() = 0; void setError(Error error); const std::shared_ptr context_; Error error_{Error::kSuccess}; // An identifier for the connection, composed of the identifier for the // context or listener, combined with an increasing sequence number. It will // only be used for logging and debugging purposes. std::string id_; private: // Initialize member fields that need `shared_from_this`. void initFromLoop(); // Queue a read operation. void readFromLoop(read_callback_fn fn); void readFromLoop(AbstractNopHolder& object, read_nop_callback_fn fn); void readFromLoop(void* ptr, size_t length, read_callback_fn fn); // Perform a write operation. void writeFromLoop(const void* ptr, size_t length, write_callback_fn fn); void writeFromLoop(const AbstractNopHolder& object, write_callback_fn fn); void setIdFromLoop(std::string id); // Shut down the connection and its resources. void closeFromLoop(); // Deal with an error. void handleError(); // A sequence number for the calls to read and write. uint64_t nextBufferBeingRead_{0}; uint64_t nextBufferBeingWritten_{0}; // Contexts and listeners do sometimes need to call directly into initFromLoop // and closeFromLoop, in order to make sure that some of their operations can // happen "atomically" on the connection, without possibly other operations // occurring in between (e.g., an error). 
friend ContextImplBoilerplate; friend ListenerImplBoilerplate; }; template ConnectionImplBoilerplate::ConnectionImplBoilerplate( ConstructorToken /* unused */, std::shared_ptr context, std::string id) : context_(std::move(context)), id_(std::move(id)) {} template void ConnectionImplBoilerplate::init() { context_->deferToLoop( [impl{this->shared_from_this()}]() { impl->initFromLoop(); }); } template void ConnectionImplBoilerplate::initFromLoop() { if (context_->closed()) { // Set the error without calling setError because we do not want to invoke // the subclass's handleErrorImpl as it would find itself in a weird state // (since initFromLoop wouldn't have been called). error_ = TP_CREATE_ERROR(ConnectionClosedError); TP_VLOG(7) << "Connection " << id_ << " is closing (without initing)"; return; } initImplFromLoop(); } template void ConnectionImplBoilerplate::read(read_callback_fn fn) { context_->deferToLoop( [impl{this->shared_from_this()}, fn{std::move(fn)}]() mutable { impl->readFromLoop(std::move(fn)); }); } template void ConnectionImplBoilerplate::readFromLoop( read_callback_fn fn) { TP_DCHECK(context_->inLoop()); uint64_t sequenceNumber = nextBufferBeingRead_++; TP_VLOG(7) << "Connection " << id_ << " received a read request (#" << sequenceNumber << ")"; fn = [this, sequenceNumber, fn{std::move(fn)}]( const Error& error, const void* ptr, size_t length) { TP_VLOG(7) << "Connection " << id_ << " is calling a read callback (#" << sequenceNumber << ")"; fn(error, ptr, length); TP_VLOG(7) << "Connection " << id_ << " done calling a read callback (#" << sequenceNumber << ")"; }; if (error_) { fn(error_, nullptr, 0); return; } readImplFromLoop(std::move(fn)); } template void ConnectionImplBoilerplate::read( AbstractNopHolder& object, read_nop_callback_fn fn) { context_->deferToLoop( [impl{this->shared_from_this()}, &object, fn{std::move(fn)}]() mutable { impl->readFromLoop(object, std::move(fn)); }); } template void ConnectionImplBoilerplate::readFromLoop( 
AbstractNopHolder& object, read_nop_callback_fn fn) { TP_DCHECK(context_->inLoop()); uint64_t sequenceNumber = nextBufferBeingRead_++; TP_VLOG(7) << "Connection " << id_ << " received a nop object read request (#" << sequenceNumber << ")"; fn = [this, sequenceNumber, fn{std::move(fn)}](const Error& error) { TP_VLOG(7) << "Connection " << id_ << " is calling a nop object read callback (#" << sequenceNumber << ")"; fn(error); TP_VLOG(7) << "Connection " << id_ << " done calling a nop object read callback (#" << sequenceNumber << ")"; }; if (error_) { fn(error_); return; } readImplFromLoop(object, std::move(fn)); } template void ConnectionImplBoilerplate::readImplFromLoop( AbstractNopHolder& object, read_nop_callback_fn fn) { readImplFromLoop([&object, fn{std::move(fn)}]( const Error& error, const void* ptr, size_t len) { if (!error) { NopReader reader(reinterpret_cast(ptr), len); nop::Status status = object.read(reader); TP_THROW_ASSERT_IF(status.has_error()) << "Error reading nop object: " << status.GetErrorMessage(); } fn(error); }); } template void ConnectionImplBoilerplate::read( void* ptr, size_t length, read_callback_fn fn) { context_->deferToLoop([impl{this->shared_from_this()}, ptr, length, fn{std::move(fn)}]() mutable { impl->readFromLoop(ptr, length, std::move(fn)); }); } template void ConnectionImplBoilerplate::readFromLoop( void* ptr, size_t length, read_callback_fn fn) { TP_DCHECK(context_->inLoop()); uint64_t sequenceNumber = nextBufferBeingRead_++; TP_VLOG(7) << "Connection " << id_ << " received a read request (#" << sequenceNumber << ")"; fn = [this, sequenceNumber, fn{std::move(fn)}]( const Error& error, const void* ptr, size_t length) { TP_VLOG(7) << "Connection " << id_ << " is calling a read callback (#" << sequenceNumber << ")"; fn(error, ptr, length); TP_VLOG(7) << "Connection " << id_ << " done calling a read callback (#" << sequenceNumber << ")"; }; if (error_) { fn(error_, ptr, length); return; } readImplFromLoop(ptr, length, std::move(fn)); 
} template void ConnectionImplBoilerplate::write( const void* ptr, size_t length, write_callback_fn fn) { context_->deferToLoop([impl{this->shared_from_this()}, ptr, length, fn{std::move(fn)}]() mutable { impl->writeFromLoop(ptr, length, std::move(fn)); }); } template void ConnectionImplBoilerplate::writeFromLoop( const void* ptr, size_t length, write_callback_fn fn) { TP_DCHECK(context_->inLoop()); uint64_t sequenceNumber = nextBufferBeingWritten_++; TP_VLOG(7) << "Connection " << id_ << " received a write request (#" << sequenceNumber << ")"; fn = [this, sequenceNumber, fn{std::move(fn)}](const Error& error) { TP_VLOG(7) << "Connection " << id_ << " is calling a write callback (#" << sequenceNumber << ")"; fn(error); TP_VLOG(7) << "Connection " << id_ << " done calling a write callback (#" << sequenceNumber << ")"; }; if (error_) { fn(error_); return; } writeImplFromLoop(ptr, length, std::move(fn)); } template void ConnectionImplBoilerplate::write( const AbstractNopHolder& object, write_callback_fn fn) { context_->deferToLoop( [impl{this->shared_from_this()}, &object, fn{std::move(fn)}]() mutable { impl->writeFromLoop(object, std::move(fn)); }); } template void ConnectionImplBoilerplate::writeFromLoop( const AbstractNopHolder& object, write_callback_fn fn) { TP_DCHECK(context_->inLoop()); uint64_t sequenceNumber = nextBufferBeingWritten_++; TP_VLOG(7) << "Connection " << id_ << " received a nop object write request (#" << sequenceNumber << ")"; fn = [this, sequenceNumber, fn{std::move(fn)}](const Error& error) { TP_VLOG(7) << "Connection " << id_ << " is calling a nop object write callback (#" << sequenceNumber << ")"; fn(error); TP_VLOG(7) << "Connection " << id_ << " done calling a nop object write callback (#" << sequenceNumber << ")"; }; if (error_) { fn(error_); return; } writeImplFromLoop(object, std::move(fn)); } template void ConnectionImplBoilerplate::writeImplFromLoop( const AbstractNopHolder& object, write_callback_fn fn) { const size_t len = 
object.getSize(); // Using a shared_ptr instead of unique_ptr because if the lambda captures a // unique_ptr then it becomes non-copyable, which prevents it from being // converted to a function. In C++20 use std::make_shared(len). // // Note: this is a std::shared_ptr semantically. A shared_ptr // with array type is supported in C++17 and higher. // auto buf = std::shared_ptr( new uint8_t[len], std::default_delete()); auto ptr = buf.get(); NopWriter writer(ptr, len); nop::Status status = object.write(writer); TP_THROW_ASSERT_IF(status.has_error()) << "Error writing nop object: " << status.GetErrorMessage(); // Perform write and forward callback. writeImplFromLoop( ptr, len, [buf{std::move(buf)}, fn{std::move(fn)}](const Error& error) mutable { // The write has completed; destroy write buffer. buf.reset(); fn(error); }); } template void ConnectionImplBoilerplate::setId(std::string id) { context_->deferToLoop( [impl{this->shared_from_this()}, id{std::move(id)}]() mutable { impl->setIdFromLoop(std::move(id)); }); } template void ConnectionImplBoilerplate::setIdFromLoop( std::string id) { TP_DCHECK(context_->inLoop()); TP_VLOG(7) << "Connection " << id_ << " was renamed to " << id; id_ = std::move(id); } template void ConnectionImplBoilerplate::close() { context_->deferToLoop( [impl{this->shared_from_this()}]() { impl->closeFromLoop(); }); } template void ConnectionImplBoilerplate::closeFromLoop() { TP_DCHECK(context_->inLoop()); TP_VLOG(7) << "Connection " << id_ << " is closing"; setError(TP_CREATE_ERROR(ConnectionClosedError)); } template void ConnectionImplBoilerplate::setError(Error error) { // Don't overwrite an error that's already set. 
if (error_ || !error) { return; } error_ = std::move(error); handleError(); } template void ConnectionImplBoilerplate::handleError() { TP_DCHECK(context_->inLoop()); TP_VLOG(8) << "Connection " << id_ << " is handling error " << error_.what(); handleErrorImpl(); } } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/context.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include namespace tensorpipe { namespace transport { class Connection; class Listener; class Context { public: virtual std::shared_ptr connect(std::string addr) = 0; virtual std::shared_ptr listen(std::string addr) = 0; // Return whether the context is able to operate correctly. // // Some transport types may be unable to perform as intended under // some circumstances (e.g., specialized hardware unavailable, lack // of permissions). They can report it through this method in order // for the core context to avoid registering them in the first place. // virtual bool isViable() const = 0; // Return string to describe the domain for this context. // // Two processes with a context of the same type can connect to each // other if one side's domain descriptor is "accepted" by the other // one, using the canCommunicateWithRemote method below. That method // must be symmetric, and unless overridden defaults to string // comparison. // // For example, for a transport that leverages TCP/IP, this may be // as simple as the address family (assuming we can route between // any two processes). For a transport that leverages shared memory, // this descriptor must uniquely identify the machine, such that // only co-located processes generate the same domain descriptor. 
// virtual const std::string& domainDescriptor() const = 0; // Compare local and remote domain descriptor for compatibility. // // Determine whether a connection can be opened between this context // and a remote one that has the given domain descriptor. This // function needs to be symmetric: if we called this method on the // remote context with the local descriptor we should get the same // answer. Unless overridden it defaults to string comparison. // virtual bool canCommunicateWithRemote( const std::string& remoteDomainDescriptor) const { return domainDescriptor() == remoteDomainDescriptor; } // Tell the context what its identifier is. // // This is only supposed to be called from the high-level context or from // channel contexts. It will only used for logging and debugging purposes. virtual void setId(std::string id) = 0; virtual void close() = 0; virtual void join() = 0; virtual ~Context() = default; }; } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/context_boilerplate.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include namespace tensorpipe { namespace transport { template class ContextBoilerplate : public Context { public: template explicit ContextBoilerplate(Args&&... 
args); ContextBoilerplate(const ContextBoilerplate&) = delete; ContextBoilerplate(ContextBoilerplate&&) = delete; ContextBoilerplate& operator=(const ContextBoilerplate&) = delete; ContextBoilerplate& operator=(ContextBoilerplate&&) = delete; std::shared_ptr connect(std::string addr) override; std::shared_ptr listen(std::string addr) override; bool isViable() const override; const std::string& domainDescriptor() const override; void setId(std::string id) override; void close() override; void join() override; ~ContextBoilerplate() override; protected: // The implementation is managed by a shared_ptr because each child object // will also hold a shared_ptr to it (downcast as a shared_ptr to the private // interface). However, its lifetime is tied to the one of this public object, // since when the latter is destroyed the implementation is closed and joined. const std::shared_ptr impl_; }; template template ContextBoilerplate::ContextBoilerplate(Args&&... args) : impl_(TCtx::create(std::forward(args)...)) { static_assert( std::is_base_of, TCtx>::value, ""); if (unlikely(!impl_)) { return; } impl_->init(); } template std::shared_ptr ContextBoilerplate::connect( std::string addr) { if (unlikely(!impl_)) { return std::make_shared>(nullptr); } return impl_->connect(std::move(addr)); } template std::shared_ptr ContextBoilerplate::listen( std::string addr) { if (unlikely(!impl_)) { return std::make_shared>(nullptr); } return impl_->listen(std::move(addr)); } template bool ContextBoilerplate::isViable() const { return impl_ != nullptr; } template const std::string& ContextBoilerplate::domainDescriptor() const { if (unlikely(!impl_)) { // FIXME In C++-17 perhaps a global static inline variable would be better? 
static std::string empty = ""; return empty; } return impl_->domainDescriptor(); } template void ContextBoilerplate::setId(std::string id) { if (unlikely(!impl_)) { return; } impl_->setId(std::move(id)); } template void ContextBoilerplate::close() { if (unlikely(!impl_)) { return; } impl_->close(); } template void ContextBoilerplate::join() { if (unlikely(!impl_)) { return; } impl_->join(); } template ContextBoilerplate::~ContextBoilerplate() { join(); } } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/context_impl_boilerplate.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace transport { template class ContextImplBoilerplate : public virtual DeferredExecutor, public std::enable_shared_from_this { public: explicit ContextImplBoilerplate(std::string domainDescriptor); ContextImplBoilerplate(const ContextImplBoilerplate&) = delete; ContextImplBoilerplate(ContextImplBoilerplate&&) = delete; ContextImplBoilerplate& operator=(const ContextImplBoilerplate&) = delete; ContextImplBoilerplate& operator=(ContextImplBoilerplate&&) = delete; void init(); std::shared_ptr connect(std::string addr); std::shared_ptr listen(std::string addr); const std::string& domainDescriptor() const; // Enrolling dependent objects (listeners and connections) causes them to be // kept alive for as long as the context exists. These objects should enroll // themselves as soon as they're created (in their initImplFromLoop method) // and unenroll themselves after they've completed handling an error (either // right in the handleErrorImpl method or in a subsequent callback). 
The // context, on the other hand, should avoid terminating (i.e., complete // joining) until all objects have unenrolled themselves. void enroll(TList& listener); void enroll(TConn& connection); void unenroll(TList& listener); void unenroll(TConn& connection); // Return whether the context is in a closed state. To avoid race conditions, // this must be called from within the loop. bool closed(); void setId(std::string id); void close(); void join(); virtual ~ContextImplBoilerplate() = default; protected: virtual void initImplFromLoop() {} virtual void handleErrorImpl() = 0; virtual void joinImpl() = 0; void setError(Error error); Error error_{Error::kSuccess}; // An identifier for the context, composed of the identifier for the context, // combined with the transport's name. It will only be used for logging and // debugging purposes. std::string id_{"N/A"}; CallbackWrapper callbackWrapper_{*this, *this}; private: void initFromLoop(); void closeFromLoop(); void handleError(); std::atomic joined_{false}; const std::string domainDescriptor_; // Sequence numbers for the listeners and connections created by this context, // used to create their identifiers based off this context's identifier. They // will only be used for logging and debugging. std::atomic listenerCounter_{0}; std::atomic connectionCounter_{0}; // Store shared_ptrs to dependent objects that have enrolled themselves to // keep them alive. We use a map, indexed by raw pointers, rather than a set // of shared_ptrs so that we can erase objects without them having to create // a fresh shared_ptr just for that. std::unordered_map> listeners_; std::unordered_map> connections_; // For some odd reason it seems we need to use a qualified name here... 
template friend class tensorpipe::CallbackWrapper; }; template ContextImplBoilerplate::ContextImplBoilerplate( std::string domainDescriptor) : domainDescriptor_(std::move(domainDescriptor)) {} template void ContextImplBoilerplate::init() { deferToLoop([this]() { initFromLoop(); }); } template void ContextImplBoilerplate::initFromLoop() { TP_DCHECK(inLoop()); TP_DCHECK(!error_); initImplFromLoop(); } template std::shared_ptr ContextImplBoilerplate::connect( std::string addr) { std::string connectionId = id_ + ".c" + std::to_string(connectionCounter_++); TP_VLOG(7) << "Transport context " << id_ << " is opening connection " << connectionId << " to address " << addr; return std::make_shared>( typename ConnectionImplBoilerplate:: ConstructorToken(), this->shared_from_this(), std::move(connectionId), std::move(addr)); } template std::shared_ptr ContextImplBoilerplate::listen( std::string addr) { std::string listenerId = id_ + ".l" + std::to_string(listenerCounter_++); TP_VLOG(7) << "Transport context " << id_ << " is opening listener " << listenerId << " on address " << addr; return std::make_shared>( typename ListenerImplBoilerplate::ConstructorToken(), this->shared_from_this(), std::move(listenerId), std::move(addr)); } template const std::string& ContextImplBoilerplate:: domainDescriptor() const { return domainDescriptor_; } template void ContextImplBoilerplate::enroll(TList& listener) { TP_DCHECK(inLoop()); bool wasInserted; std::tie(std::ignore, wasInserted) = listeners_.emplace(&listener, listener.shared_from_this()); TP_DCHECK(wasInserted); } template void ContextImplBoilerplate::enroll(TConn& connection) { TP_DCHECK(inLoop()); bool wasInserted; std::tie(std::ignore, wasInserted) = connections_.emplace(&connection, connection.shared_from_this()); TP_DCHECK(wasInserted); } template void ContextImplBoilerplate::unenroll(TList& listener) { TP_DCHECK(inLoop()); auto numRemoved = listeners_.erase(&listener); TP_DCHECK_EQ(numRemoved, 1); } template void 
ContextImplBoilerplate::unenroll(TConn& connection) { TP_DCHECK(inLoop()); auto numRemoved = connections_.erase(&connection); TP_DCHECK_EQ(numRemoved, 1); } template bool ContextImplBoilerplate::closed() { TP_DCHECK(inLoop()); return error_; }; template void ContextImplBoilerplate::setId(std::string id) { TP_VLOG(7) << "Transport context " << id_ << " was renamed to " << id; id_ = std::move(id); } template void ContextImplBoilerplate::close() { deferToLoop([this]() { closeFromLoop(); }); } template void ContextImplBoilerplate::closeFromLoop() { TP_DCHECK(inLoop()); TP_VLOG(7) << "Transport context " << id_ << " is closing"; setError(TP_CREATE_ERROR(ContextClosedError)); TP_VLOG(7) << "Transport context " << id_ << " done closing"; } template void ContextImplBoilerplate::setError(Error error) { // Don't overwrite an error that's already set. if (error_ || !error) { return; } error_ = std::move(error); handleError(); } template void ContextImplBoilerplate::handleError() { TP_DCHECK(inLoop()); TP_VLOG(8) << "Transport context " << id_ << " is handling error " << error_.what(); // Make a copy as they could unenroll themselves inline. auto listenersCopy = listeners_; auto connectionsCopy = connections_; // We call closeFromLoop, rather than just close, because we need these // objects to transition _immediately_ to error, "atomically". If we just // deferred closing to later, this could come after some already-enqueued // operations that could try to access the context, which would be closed, // and this could fail. for (auto& iter : listenersCopy) { iter.second->closeFromLoop(); } for (auto& iter : connectionsCopy) { iter.second->closeFromLoop(); } handleErrorImpl(); } template void ContextImplBoilerplate::join() { close(); if (!joined_.exchange(true)) { TP_VLOG(7) << "Transport context " << id_ << " is joining"; // As closing is deferred to the loop, we must wait for closeImpl to be // actually called before we call joinImpl, to avoid race conditions. 
For // this, we defer another task to the loop, which we know will run after the // closing, and then we wait for that task to be run. std::promise hasClosed; deferToLoop([&]() { hasClosed.set_value(); }); hasClosed.get_future().wait(); joinImpl(); TP_VLOG(7) << "Transport context " << id_ << " done joining"; TP_DCHECK(listeners_.empty()); TP_DCHECK(connections_.empty()); } } } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/error.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include namespace tensorpipe { namespace transport { std::string ContextClosedError::what() const { return "context closed"; } std::string ListenerClosedError::what() const { return "listener closed"; } std::string ConnectionClosedError::what() const { return "connection closed"; } std::string ContextNotViableError::what() const { return "context not viable"; } } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/error.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include namespace tensorpipe { namespace transport { class ContextClosedError final : public BaseError { public: ContextClosedError() {} std::string what() const override; }; class ListenerClosedError final : public BaseError { public: ListenerClosedError() {} std::string what() const override; }; class ConnectionClosedError final : public BaseError { public: ConnectionClosedError() {} std::string what() const override; }; class ContextNotViableError final : public BaseError { public: ContextNotViableError() {} std::string what() const override; }; } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/ibv/connection_impl.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace transport { namespace ibv { namespace { // The data that each queue pair endpoint needs to send to the other endpoint in // order to set up the queue pair itself. This data is transferred over a TCP // connection. 
struct Exchange { IbvSetupInformation setupInfo; uint64_t memoryRegionPtr; uint32_t memoryRegionKey; }; } // namespace ConnectionImpl::ConnectionImpl( ConstructorToken token, std::shared_ptr context, std::string id, Socket socket) : ConnectionImplBoilerplate( token, std::move(context), std::move(id)), socket_(std::move(socket)) {} ConnectionImpl::ConnectionImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::string addr) : ConnectionImplBoilerplate( token, std::move(context), std::move(id)), sockaddr_(Sockaddr::createInetSockAddr(addr)) {} void ConnectionImpl::initImplFromLoop() { context_->enroll(*this); Error error; // The connection either got a socket or an address, but not both. TP_DCHECK(socket_.hasValue() ^ sockaddr_.has_value()); if (!socket_.hasValue()) { std::tie(error, socket_) = Socket::createForFamily(sockaddr_->addr()->sa_family); if (error) { setError(std::move(error)); return; } error = socket_.reuseAddr(true); if (error) { setError(std::move(error)); return; } error = socket_.connect(sockaddr_.value()); if (error) { setError(std::move(error)); return; } } // Ensure underlying control socket is non-blocking such that it // works well with event driven I/O. error = socket_.block(false); if (error) { setError(std::move(error)); return; } // Create ringbuffer for inbox. std::tie(error, inboxBuf_) = MmappedPtr::create( kBufferSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1); TP_THROW_ASSERT_IF(error) << "Couldn't allocate ringbuffer for connection inbox: " << error.what(); inboxRb_ = RingBuffer(&inboxHeader_, inboxBuf_.ptr()); inboxMr_ = createIbvMemoryRegion( context_->getReactor().getIbvLib(), context_->getReactor().getIbvPd(), inboxBuf_.ptr(), kBufferSize, IbvLib::ACCESS_LOCAL_WRITE | IbvLib::ACCESS_REMOTE_WRITE); // Create ringbuffer for outbox. 
std::tie(error, outboxBuf_) = MmappedPtr::create( kBufferSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1); TP_THROW_ASSERT_IF(error) << "Couldn't allocate ringbuffer for connection outbox: " << error.what(); outboxRb_ = RingBuffer(&outboxHeader_, outboxBuf_.ptr()); outboxMr_ = createIbvMemoryRegion( context_->getReactor().getIbvLib(), context_->getReactor().getIbvPd(), outboxBuf_.ptr(), kBufferSize, 0); // Create and init queue pair. { IbvLib::qp_init_attr initAttr; std::memset(&initAttr, 0, sizeof(initAttr)); initAttr.qp_type = IbvLib::QPT_RC; initAttr.send_cq = context_->getReactor().getIbvCq().get(); initAttr.recv_cq = context_->getReactor().getIbvCq().get(); initAttr.cap.max_send_wr = kSendQueueSize; initAttr.cap.max_send_sge = 1; initAttr.srq = context_->getReactor().getIbvSrq().get(); initAttr.sq_sig_all = 1; qp_ = createIbvQueuePair( context_->getReactor().getIbvLib(), context_->getReactor().getIbvPd(), initAttr); } transitionIbvQueuePairToInit( context_->getReactor().getIbvLib(), qp_, context_->getReactor().getIbvAddress()); // Register methods to be called when our peer writes to our inbox and reads // from our outbox. context_->getReactor().registerQp(qp_->qp_num, shared_from_this()); // We're sending address first, so wait for writability. state_ = SEND_ADDR; context_->registerDescriptor(socket_.fd(), EPOLLOUT, shared_from_this()); } void ConnectionImpl::readImplFromLoop(read_callback_fn fn) { readOperations_.emplace_back(std::move(fn)); // If the inbox already contains some data, we may be able to process this // operation right away. processReadOperationsFromLoop(); } void ConnectionImpl::readImplFromLoop( AbstractNopHolder& object, read_nop_callback_fn fn) { readOperations_.emplace_back( &object, [fn{std::move(fn)}]( const Error& error, const void* /* unused */, size_t /* unused */) { fn(error); }); // If the inbox already contains some data, we may be able to process this // operation right away. 
processReadOperationsFromLoop(); } void ConnectionImpl::readImplFromLoop( void* ptr, size_t length, read_callback_fn fn) { readOperations_.emplace_back(ptr, length, std::move(fn)); // If the inbox already contains some data, we may be able to process this // operation right away. processReadOperationsFromLoop(); } void ConnectionImpl::writeImplFromLoop( const void* ptr, size_t length, write_callback_fn fn) { writeOperations_.emplace_back(ptr, length, std::move(fn)); // If the outbox has some free space, we may be able to process this operation // right away. processWriteOperationsFromLoop(); } void ConnectionImpl::writeImplFromLoop( const AbstractNopHolder& object, write_callback_fn fn) { writeOperations_.emplace_back(&object, std::move(fn)); // If the outbox has some free space, we may be able to process this operation // right away. processWriteOperationsFromLoop(); } void ConnectionImpl::handleEventsFromLoop(int events) { TP_DCHECK(context_->inLoop()); TP_VLOG(9) << "Connection " << id_ << " is handling an event on its socket (" << EpollLoop::formatEpollEvents(events) << ")"; // Handle only one of the events in the mask. Events on the control // file descriptor are rare enough for the cost of having epoll call // into this function multiple times to not matter. The benefit is // that every handler can close and unregister the control file // descriptor from the event loop, without worrying about the next // handler trying to do so as well. // In some cases the socket could be in a state where it's both in an error // state and readable/writable. If we checked for EPOLLIN or EPOLLOUT first // and then returned after handling them, we would keep doing so forever and // never reach the error handling. So we should keep the error check first. 
if (events & EPOLLERR) { int error; socklen_t errorlen = sizeof(error); int rv = getsockopt( socket_.fd(), SOL_SOCKET, SO_ERROR, reinterpret_cast(&error), &errorlen); if (rv == -1) { setError(TP_CREATE_ERROR(SystemError, "getsockopt", rv)); } else { setError(TP_CREATE_ERROR(SystemError, "async error on socket", error)); } return; } if (events & EPOLLIN) { handleEventInFromLoop(); return; } if (events & EPOLLOUT) { handleEventOutFromLoop(); return; } // Check for hangup last, as there could be cases where we get EPOLLHUP but // there's still data to be read from the socket, so we want to deal with that // before dealing with the hangup. if (events & EPOLLHUP) { setError(TP_CREATE_ERROR(EOFError)); return; } } void ConnectionImpl::handleEventInFromLoop() { TP_DCHECK(context_->inLoop()); if (state_ == RECV_ADDR) { struct Exchange ex; auto err = socket_.read(&ex, sizeof(ex)); // Crossing our fingers that the exchange information is small enough that // it can be read in a single chunk. if (err != sizeof(ex)) { setError(TP_CREATE_ERROR(ShortReadError, sizeof(ex), err)); return; } transitionIbvQueuePairToReadyToReceive( context_->getReactor().getIbvLib(), qp_, context_->getReactor().getIbvAddress(), ex.setupInfo); transitionIbvQueuePairToReadyToSend( context_->getReactor().getIbvLib(), qp_); peerInboxKey_ = ex.memoryRegionKey; peerInboxPtr_ = ex.memoryRegionPtr; // The connection is usable now. state_ = ESTABLISHED; processWriteOperationsFromLoop(); // Trigger read operations in case a pair of local read() and remote // write() happened before connection is established. Otherwise read() // callback would lose if it's the only read() request. processReadOperationsFromLoop(); return; } if (state_ == ESTABLISHED) { // We don't expect to read anything on this socket once the // connection has been established. If we do, assume it's a // zero-byte read indicating EOF. 
setError(TP_CREATE_ERROR(EOFError)); return; } TP_THROW_ASSERT() << "EPOLLIN event not handled in state " << state_; } void ConnectionImpl::handleEventOutFromLoop() { TP_DCHECK(context_->inLoop()); if (state_ == SEND_ADDR) { Exchange ex; ex.setupInfo = makeIbvSetupInformation(context_->getReactor().getIbvAddress(), qp_); ex.memoryRegionPtr = reinterpret_cast(inboxBuf_.ptr()); ex.memoryRegionKey = inboxMr_->rkey; auto err = socket_.write(reinterpret_cast(&ex), sizeof(ex)); // Crossing our fingers that the exchange information is small enough that // it can be written in a single chunk. if (err != sizeof(ex)) { setError(TP_CREATE_ERROR(ShortWriteError, sizeof(ex), err)); return; } // Sent our address. Wait for address from peer. state_ = RECV_ADDR; context_->registerDescriptor(socket_.fd(), EPOLLIN, shared_from_this()); return; } TP_THROW_ASSERT() << "EPOLLOUT event not handled in state " << state_; } void ConnectionImpl::processReadOperationsFromLoop() { TP_DCHECK(context_->inLoop()); // Process all read read operations that we can immediately serve, only // when connection is established. 
if (state_ != ESTABLISHED) { return; } // Serve read operations InboxConsumer inboxConsumer(inboxRb_); while (!readOperations_.empty()) { RingbufferReadOperation& readOperation = readOperations_.front(); ssize_t len = readOperation.handleRead(inboxConsumer); if (len > 0) { Reactor::AckInfo info; info.length = len; TP_VLOG(9) << "Connection " << id_ << " is posting a send request (acknowledging " << info.length << " bytes) on QP " << qp_->qp_num; context_->getReactor().postAck(qp_, info); numAcksInFlight_++; } if (readOperation.completed()) { readOperations_.pop_front(); } else { break; } } } void ConnectionImpl::processWriteOperationsFromLoop() { TP_DCHECK(context_->inLoop()); if (state_ != ESTABLISHED) { return; } OutboxProducer outboxProducer(outboxRb_); while (!writeOperations_.empty()) { RingbufferWriteOperation& writeOperation = writeOperations_.front(); ssize_t len = writeOperation.handleWrite(outboxProducer); if (len > 0) { ssize_t ret; OutboxIbvWriter outboxConsumer(outboxRb_); ret = outboxConsumer.startTx(); TP_THROW_SYSTEM_IF(ret < 0, -ret); ssize_t numBuffers; std::array buffers; std::tie(numBuffers, buffers) = outboxConsumer.accessContiguousInTx(len); TP_THROW_SYSTEM_IF(numBuffers < 0, -numBuffers); for (int bufferIdx = 0; bufferIdx < numBuffers; bufferIdx++) { Reactor::WriteInfo info; info.addr = buffers[bufferIdx].ptr; info.length = buffers[bufferIdx].len; info.lkey = outboxMr_->lkey; uint64_t peerInboxOffset = peerInboxHead_ & (kBufferSize - 1); peerInboxHead_ += buffers[bufferIdx].len; info.remoteAddr = peerInboxPtr_ + peerInboxOffset; info.rkey = peerInboxKey_; TP_VLOG(9) << "Connection " << id_ << " is posting a RDMA write request (transmitting " << info.length << " bytes) on QP " << qp_->qp_num; context_->getReactor().postWrite(qp_, info); numWritesInFlight_++; } ret = outboxConsumer.commitTx(); TP_THROW_SYSTEM_IF(ret < 0, -ret); } if (writeOperation.completed()) { writeOperations_.pop_front(); } else { break; } } } void 
ConnectionImpl::onRemoteProducedData(uint32_t length) { TP_DCHECK(context_->inLoop()); TP_VLOG(9) << "Connection " << id_ << " was signalled that " << length << " bytes were written to its inbox on QP " << qp_->qp_num; ssize_t ret; InboxIbvRecver inboxProducer(inboxRb_); ret = inboxProducer.startTx(); TP_THROW_SYSTEM_IF(ret < 0, -ret); ret = inboxProducer.incMarkerInTx(length); TP_THROW_SYSTEM_IF(ret < 0, -ret); ret = inboxProducer.commitTx(); TP_THROW_SYSTEM_IF(ret < 0, -ret); processReadOperationsFromLoop(); } void ConnectionImpl::onRemoteConsumedData(uint32_t length) { TP_DCHECK(context_->inLoop()); TP_VLOG(9) << "Connection " << id_ << " was signalled that " << length << " bytes were read from its outbox on QP " << qp_->qp_num; ssize_t ret; OutboxIbvAcker outboxConsumer(outboxRb_); ret = outboxConsumer.startTx(); TP_THROW_SYSTEM_IF(ret < 0, -ret); ret = outboxConsumer.incMarkerInTx(length); TP_THROW_SYSTEM_IF(ret < 0, -ret); ret = outboxConsumer.commitTx(); TP_THROW_SYSTEM_IF(ret < 0, -ret); processWriteOperationsFromLoop(); } void ConnectionImpl::onWriteCompleted() { TP_DCHECK(context_->inLoop()); TP_VLOG(9) << "Connection " << id_ << " done posting a RDMA write request on QP " << qp_->qp_num; numWritesInFlight_--; tryCleanup(); } void ConnectionImpl::onAckCompleted() { TP_DCHECK(context_->inLoop()); TP_VLOG(9) << "Connection " << id_ << " done posting a send request on QP " << qp_->qp_num; numAcksInFlight_--; tryCleanup(); } void ConnectionImpl::onError(IbvLib::wc_status status, uint64_t wrId) { TP_DCHECK(context_->inLoop()); setError(TP_CREATE_ERROR( IbvError, context_->getReactor().getIbvLib().wc_status_str(status))); if (wrId == kWriteRequestId) { onWriteCompleted(); } else if (wrId == kAckRequestId) { onAckCompleted(); } } void ConnectionImpl::handleErrorImpl() { for (auto& readOperation : readOperations_) { readOperation.handleError(error_); } readOperations_.clear(); for (auto& writeOperation : writeOperations_) { writeOperation.handleError(error_); } 
writeOperations_.clear(); transitionIbvQueuePairToError(context_->getReactor().getIbvLib(), qp_); tryCleanup(); if (socket_.hasValue()) { if (state_ > INITIALIZING) { context_->unregisterDescriptor(socket_.fd()); } socket_.reset(); } context_->unenroll(*this); } void ConnectionImpl::tryCleanup() { TP_DCHECK(context_->inLoop()); // Setting the queue pair to an error state will cause all its work requests // (both those that had started being served, and those that hadn't; including // those from a shared receive queue) to be flushed. We need to wait for the // completion events of all those requests to be retrieved from the completion // queue before we can destroy the queue pair. We can do so by deferring the // destruction to the loop, since the reactor will only proceed to invoke // deferred functions once it doesn't have any completion events to handle. // However the RDMA writes and the sends may be queued up inside the reactor // and thus may not have even been scheduled yet, so we explicitly wait for // them to complete. if (error_) { if (numWritesInFlight_ == 0 && numAcksInFlight_ == 0) { TP_VLOG(8) << "Connection " << id_ << " is ready to clean up"; context_->deferToLoop([impl{shared_from_this()}]() { impl->cleanup(); }); } else { TP_VLOG(9) << "Connection " << id_ << " cannot proceed to cleanup because it has " << numWritesInFlight_ << " pending RDMA write requests and " << numAcksInFlight_ << " pending send requests on QP " << qp_->qp_num; } } } void ConnectionImpl::cleanup() { TP_DCHECK(context_->inLoop()); TP_VLOG(8) << "Connection " << id_ << " is cleaning up"; context_->getReactor().unregisterQp(qp_->qp_num); qp_.reset(); inboxMr_.reset(); inboxBuf_.reset(); outboxMr_.reset(); outboxBuf_.reset(); } } // namespace ibv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/ibv/connection_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. 
and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace transport { namespace ibv { class ContextImpl; class ListenerImpl; class ConnectionImpl final : public ConnectionImplBoilerplate< ContextImpl, ListenerImpl, ConnectionImpl>, public EpollLoop::EventHandler, public IbvEventHandler { constexpr static size_t kBufferSize = 2 * 1024 * 1024; constexpr static int kNumOutboxRingbufferRoles = 3; using OutboxIbvAcker = RingBufferRole; using OutboxIbvWriter = RingBufferRole; using OutboxProducer = RingBufferRole; constexpr static int kNumInboxRingbufferRoles = 2; using InboxConsumer = RingBufferRole; using InboxIbvRecver = RingBufferRole; enum State { INITIALIZING = 1, SEND_ADDR, RECV_ADDR, ESTABLISHED, }; public: // Create a connection that is already connected (e.g. from a listener). ConnectionImpl( ConstructorToken token, std::shared_ptr context, std::string id, Socket socket); // Create a connection that connects to the specified address. ConnectionImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::string addr); // Implementation of EventHandler. void handleEventsFromLoop(int events) override; // Implementation of IbvEventHandler. void onRemoteProducedData(uint32_t length) override; void onRemoteConsumedData(uint32_t length) override; void onWriteCompleted() override; void onAckCompleted() override; void onError(IbvLib::wc_status status, uint64_t wrId) override; protected: // Implement the entry points called by ConnectionImplBoilerplate. 
void initImplFromLoop() override; void readImplFromLoop(read_callback_fn fn) override; void readImplFromLoop(AbstractNopHolder& object, read_nop_callback_fn fn) override; void readImplFromLoop(void* ptr, size_t length, read_callback_fn fn) override; void writeImplFromLoop(const void* ptr, size_t length, write_callback_fn fn) override; void writeImplFromLoop(const AbstractNopHolder& object, write_callback_fn fn) override; void handleErrorImpl() override; private: // Handle events of type EPOLLIN on the UNIX domain socket. // // The only data that is expected on that socket is the address and other // setup information for the other side's queue pair and inbox. void handleEventInFromLoop(); // Handle events of type EPOLLOUT on the UNIX domain socket. // // Once the socket is writable we send the address and other setup information // for this side's queue pair and inbox. void handleEventOutFromLoop(); State state_{INITIALIZING}; Socket socket_; optional sockaddr_; IbvQueuePair qp_; // Inbox. // Initialize header during construction because it isn't assignable. RingBufferHeader inboxHeader_{kBufferSize}; // Use mmapped memory so it's page-aligned (and, one day, to use huge pages). MmappedPtr inboxBuf_; RingBuffer inboxRb_; IbvMemoryRegion inboxMr_; // Outbox. // Initialize header during construction because it isn't assignable. RingBufferHeader outboxHeader_{kBufferSize}; // Use mmapped memory so it's page-aligned (and, one day, to use huge pages). MmappedPtr outboxBuf_; RingBuffer outboxRb_; IbvMemoryRegion outboxMr_; // Peer inbox key, pointer and head. uint32_t peerInboxKey_{0}; uint64_t peerInboxPtr_{0}; uint64_t peerInboxHead_{0}; // The connection performs two types of send requests: writing to the remote // inbox, or acknowledging a write into its own inbox. These send operations // could be delayed and stalled by the reactor as only a limited number of // work requests can be outstanding at the same time globally. 
Thus we keep // count of how many we have pending to make sure they have all completed or // flushed when we close, and that none is stuck in the pipeline. uint32_t numWritesInFlight_{0}; uint32_t numAcksInFlight_{0}; // Pending read operations. std::deque readOperations_; // Pending write operations. std::deque writeOperations_; // Process pending read operations if in an operational state. // // This may be triggered by the other side of the connection (by pushing this // side's inbox token to the reactor) when it has written some new data to its // outbox (which is this side's inbox). It is also called by this connection // when it moves into an established state or when a new read operation is // queued, in case data was already available before this connection was ready // to consume it. void processReadOperationsFromLoop(); // Process pending write operations if in an operational state. // // This may be triggered by the other side of the connection (by pushing this // side's outbox token to the reactor) when it has read some data from its // inbox (which is this side's outbox). This is important when some of this // side's writes couldn't complete because the outbox was full, and thus they // needed to wait for some of its data to be read. This method is also called // by this connection when it moves into an established state, in case some // writes were queued before the connection was ready to process them, or when // a new write operation is queued. void processWriteOperationsFromLoop(); void tryCleanup(); void cleanup(); }; } // namespace ibv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/ibv/constants.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include namespace { // We should probably allow these to be user-configured. But, for now, we'll set // them to the lowest value they can have, the rationale being that this way // they will always be valid. constexpr uint8_t kPortNum = 1; constexpr uint8_t kGlobalIdentifierIndex = 0; // FIXME Instead of hardcoding the next three values, we could use // ibv_query_device to obtain max_cqe, max_qp_wr and max_srq_wr and deduce from // them the maximum allowed values for these parameters. // How many simultaneous receive requests to keep queued on the shared receive // queue. Incoming RDMA writes and sends will consume one such request. The // reactor loop will fill the SRQ back up to this value once some requests // complete. So this number should just be large enough to accommodate all the // requests that could finish between two reactor loop iterations. And, even if // this number ends up being too low, the excess incoming requests will just // retry, causing a performance penalty but not a failure. constexpr uint32_t kNumPendingRecvReqs = 1024; // How many RDMA write requests can be pending at the same time across all // connections. We need to put a limit on them because they all use the same // global completion queue which has a fixed capacity and if it overruns it will // enter an unrecoverable error state. This value is also set as the capacity of // the send queue of each queue pair. constexpr uint32_t kNumPendingWriteReqs = 1024; // How many send requests (used by the receiver to acknowledge the RDMA writes // from the sender) can be pending at the same time across all connections. constexpr uint32_t kNumPendingAckReqs = 1024; // How many elements the completion queue should be able to hold. These elements // will be either the completed receive requests of the SRQ, or the completed // send requests from a connection's queue pair. We can bound the former value // but not the latter, so we try to add some margin. 
constexpr int kCompletionQueueSize = kNumPendingRecvReqs + kNumPendingWriteReqs + kNumPendingAckReqs; // How many pending outgoing work requests each send queue should be able to // hold. The operations we post on a send queue are the RDMA_WRITEs to send // outgoing data and the SENDs to acknowledge incoming data, hence we size the // queue to the sum of the maximum amount of these two ops. constexpr int kSendQueueSize = kNumPendingWriteReqs + kNumPendingAckReqs; // How many work completions to poll from the completion queue at each reactor // iteration. constexpr int kNumPolledWorkCompletions = 32; // When the connection gets closed, to avoid leaks, it needs to "reclaim" all // the work requests that it had posted, by waiting for their completion. They // may however complete with error, which makes it harder to identify and // distinguish them from failing incoming requests because, in principle, we // cannot access the opcode field of a failed work completion. Therefore, we // assign a special ID to those types of requests, to match them later on. constexpr uint64_t kWriteRequestId = 1; constexpr uint64_t kAckRequestId = 2; } // namespace ================================================ FILE: tensorpipe/transport/ibv/context_impl.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include namespace tensorpipe { namespace transport { namespace ibv { namespace { // Prepend descriptor with transport name so it's easy to // disambiguate descriptors when debugging. const std::string kDomainDescriptorPrefix{"ibv:"}; std::string generateDomainDescriptor() { // It would be very cool if we could somehow obtain an "identifier" for the // InfiniBand subnet that our device belongs to, but nothing of that sort // seems to be available. 
So instead we say that if the user is trying to // connect two processes which both have access to an InfiniBand device then // they must know what they are doing and probably must have set up things // properly. return kDomainDescriptorPrefix + "*"; } } // namespace std::shared_ptr ContextImpl::create() { Error error; IbvLib ibvLib; std::tie(error, ibvLib) = IbvLib::create(); if (error) { TP_VLOG(7) << "IBV transport is not viable because libibverbs couldn't be loaded: " << error.what(); return nullptr; } IbvDeviceList deviceList; std::tie(error, deviceList) = IbvDeviceList::create(ibvLib); if (error && error.isOfType() && error.castToType()->errorCode() == ENOSYS) { TP_VLOG(7) << "IBV transport is not viable because it couldn't get list of " << "InfiniBand devices because the kernel module isn't loaded"; return nullptr; } TP_THROW_ASSERT_IF(error) << "Couldn't get list of InfiniBand devices: " << error.what(); if (deviceList.size() == 0) { TP_VLOG(7) << "IBV transport is not viable because it couldn't find any " << "InfiniBand NICs"; return nullptr; } return std::make_shared( std::move(ibvLib), std::move(deviceList)); } ContextImpl::ContextImpl(IbvLib ibvLib, IbvDeviceList deviceList) : ContextImplBoilerplate( generateDomainDescriptor()), reactor_(std::move(ibvLib), std::move(deviceList)) {} void ContextImpl::handleErrorImpl() { loop_.close(); reactor_.close(); } void ContextImpl::joinImpl() { loop_.join(); reactor_.join(); } bool ContextImpl::inLoop() const { return reactor_.inLoop(); }; void ContextImpl::deferToLoop(std::function fn) { reactor_.deferToLoop(std::move(fn)); }; void ContextImpl::registerDescriptor( int fd, int events, std::shared_ptr h) { loop_.registerDescriptor(fd, events, std::move(h)); } void ContextImpl::unregisterDescriptor(int fd) { loop_.unregisterDescriptor(fd); } Reactor& ContextImpl::getReactor() { return reactor_; } } // namespace ibv } // namespace transport } // namespace tensorpipe ================================================ 
FILE: tensorpipe/transport/ibv/context_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include namespace tensorpipe { namespace transport { namespace ibv { class ConnectionImpl; class ListenerImpl; class ContextImpl final : public ContextImplBoilerplate { public: static std::shared_ptr create(); ContextImpl(IbvLib ibvLib, IbvDeviceList deviceList); // Implement the DeferredExecutor interface. bool inLoop() const override; void deferToLoop(std::function fn) override; void registerDescriptor( int fd, int events, std::shared_ptr h); void unregisterDescriptor(int fd); Reactor& getReactor(); protected: // Implement the entry points called by ContextImplBoilerplate. void handleErrorImpl() override; void joinImpl() override; private: Reactor reactor_; EpollLoop loop_{this->reactor_}; }; } // namespace ibv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/ibv/error.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #include namespace tensorpipe { namespace transport { namespace ibv { std::string IbvError::what() const { return error_; } std::string GetaddrinfoError::what() const { std::ostringstream ss; ss << "getaddrinfo: " << gai_strerror(error_); return ss.str(); } std::string NoAddrFoundError::what() const { return "no address found"; } } // namespace ibv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/ibv/error.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include namespace tensorpipe { namespace transport { namespace ibv { class IbvError final : public BaseError { public: explicit IbvError(std::string error) : error_(error) {} std::string what() const override; private: std::string error_; }; class GetaddrinfoError final : public BaseError { public: explicit GetaddrinfoError(int error) : error_(error) {} std::string what() const override; private: int error_; }; class NoAddrFoundError final : public BaseError { public: NoAddrFoundError() {} std::string what() const override; }; } // namespace ibv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/ibv/factory.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #include #include namespace tensorpipe { namespace transport { namespace ibv { std::shared_ptr create() { return std::make_shared< ContextBoilerplate>(); } } // namespace ibv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/ibv/factory.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include namespace tensorpipe { namespace transport { namespace ibv { std::shared_ptr create(); } // namespace ibv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/ibv/listener_impl.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace transport { namespace ibv { ListenerImpl::ListenerImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::string addr) : ListenerImplBoilerplate( token, std::move(context), std::move(id)), sockaddr_(Sockaddr::createInetSockAddr(addr)) {} void ListenerImpl::initImplFromLoop() { context_->enroll(*this); Error error; TP_DCHECK(!socket_.hasValue()); std::tie(error, socket_) = Socket::createForFamily(sockaddr_.addr()->sa_family); if (error) { setError(std::move(error)); return; } error = socket_.reuseAddr(true); if (error) { setError(std::move(error)); return; } error = socket_.bind(sockaddr_); if (error) { setError(std::move(error)); return; } error = socket_.block(false); if (error) { setError(std::move(error)); return; } error = socket_.listen(128); if (error) { setError(std::move(error)); return; } struct sockaddr_storage addr; socklen_t addrlen; std::tie(error, addr, addrlen) = socket_.getSockName(); if (error) { setError(std::move(error)); return; } sockaddr_ = Sockaddr(reinterpret_cast(&addr), addrlen); } void ListenerImpl::handleErrorImpl() { if (!fns_.empty()) { context_->unregisterDescriptor(socket_.fd()); } socket_.reset(); for (auto& fn : fns_) { fn(error_, std::shared_ptr()); } fns_.clear(); context_->unenroll(*this); } void ListenerImpl::acceptImplFromLoop(accept_callback_fn fn) { fns_.push_back(std::move(fn)); // Only register if we go from 0 to 1 pending callbacks. In other cases we // already had a pending callback and thus we were already registered. if (fns_.size() == 1) { // Register with loop for readability events. 
context_->registerDescriptor(socket_.fd(), EPOLLIN, shared_from_this()); } } std::string ListenerImpl::addrImplFromLoop() const { return sockaddr_.str(); } void ListenerImpl::handleEventsFromLoop(int events) { TP_DCHECK(context_->inLoop()); TP_VLOG(9) << "Listener " << id_ << " is handling an event on its socket (" << EpollLoop::formatEpollEvents(events) << ")"; if (events & EPOLLERR) { int error; socklen_t errorlen = sizeof(error); int rv = getsockopt( socket_.fd(), SOL_SOCKET, SO_ERROR, reinterpret_cast(&error), &errorlen); if (rv == -1) { setError(TP_CREATE_ERROR(SystemError, "getsockopt", rv)); } else { setError(TP_CREATE_ERROR(SystemError, "async error on socket", error)); } return; } if (events & EPOLLHUP) { setError(TP_CREATE_ERROR(EOFError)); return; } TP_ARG_CHECK_EQ(events, EPOLLIN); Error error; Socket socket; std::tie(error, socket) = socket_.accept(); if (error) { setError(std::move(error)); return; } TP_DCHECK(!fns_.empty()) << "when the callback is disarmed the listener's descriptor is supposed " << "to be unregistered"; auto fn = std::move(fns_.front()); fns_.pop_front(); if (fns_.empty()) { context_->unregisterDescriptor(socket_.fd()); } fn(Error::kSuccess, createAndInitConnection(std::move(socket))); } } // namespace ibv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/ibv/listener_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace transport { namespace ibv { class ConnectionImpl; class ContextImpl; class ListenerImpl final : public ListenerImplBoilerplate, public EpollLoop::EventHandler { public: // Create a listener that listens on the specified address. ListenerImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::string addr); // Implementation of EventHandler. void handleEventsFromLoop(int events) override; protected: // Implement the entry points called by ListenerImplBoilerplate. void initImplFromLoop() override; void acceptImplFromLoop(accept_callback_fn fn) override; std::string addrImplFromLoop() const override; void handleErrorImpl() override; private: Socket socket_; Sockaddr sockaddr_; std::deque fns_; }; } // namespace ibv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/ibv/reactor.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include namespace tensorpipe { namespace transport { namespace ibv { Reactor::Reactor(IbvLib ibvLib, IbvDeviceList deviceList) : ibvLib_(std::move(ibvLib)) { TP_DCHECK_GE(deviceList.size(), 1); ctx_ = createIbvContext(getIbvLib(), deviceList[0]); pd_ = createIbvProtectionDomain(getIbvLib(), ctx_); cq_ = createIbvCompletionQueue( getIbvLib(), ctx_, kCompletionQueueSize, /*cq_context=*/nullptr, /*channel=*/nullptr, /*comp_vector=*/0); IbvLib::srq_init_attr srqInitAttr; std::memset(&srqInitAttr, 0, sizeof(srqInitAttr)); srqInitAttr.attr.max_wr = kNumPendingRecvReqs; srq_ = createIbvSharedReceiveQueue(getIbvLib(), pd_, srqInitAttr); addr_ = makeIbvAddress(getIbvLib(), ctx_, kPortNum, kGlobalIdentifierIndex); postRecvRequestsOnSRQ(kNumPendingRecvReqs); startThread("TP_IBV_reactor"); } void Reactor::postRecvRequestsOnSRQ(int num) { while (num > 0) { IbvLib::recv_wr* badRecvWr = nullptr; std::array wrs; std::memset(wrs.data(), 0, sizeof(wrs)); for (int i = 0; i < std::min(num, kNumPolledWorkCompletions) - 1; i++) { wrs[i].next = &wrs[i + 1]; } int rv = getIbvLib().post_srq_recv(srq_.get(), wrs.data(), &badRecvWr); TP_THROW_SYSTEM_IF(rv != 0, errno); TP_THROW_ASSERT_IF(badRecvWr != nullptr); num -= std::min(num, kNumPolledWorkCompletions); } } void Reactor::setId(std::string id) { id_ = std::move(id); } void Reactor::close() { if (!closed_.exchange(true)) { stopBusyPolling(); } } void Reactor::join() { close(); if (!joined_.exchange(true)) { joinThread(); } } Reactor::~Reactor() { join(); } bool Reactor::pollOnce() { std::array wcs; auto rv = getIbvLib().poll_cq(cq_.get(), wcs.size(), wcs.data()); if (rv == 0) { return false; } TP_THROW_SYSTEM_IF(rv < 0, errno); int numRecvs = 0; int numWrites = 0; int numAcks = 0; for (int wcIdx = 0; wcIdx < rv; wcIdx++) { IbvLib::wc& wc = wcs[wcIdx]; TP_VLOG(9) << "Transport context " << id_ << " got work completion for request " << wc.wr_id << " for QP " << wc.qp_num << " with status " << 
getIbvLib().wc_status_str(wc.status) << " and opcode " << ibvWorkCompletionOpcodeToStr(wc.opcode) << " (byte length: " << wc.byte_len << ", immediate data: " << wc.imm_data << ")"; auto iter = queuePairEventHandler_.find(wc.qp_num); TP_THROW_ASSERT_IF(iter == queuePairEventHandler_.end()) << "Got work completion for unknown queue pair " << wc.qp_num; if (wc.status != IbvLib::WC_SUCCESS) { iter->second->onError(wc.status, wc.wr_id); continue; } switch (wc.opcode) { case IbvLib::WC_RECV_RDMA_WITH_IMM: TP_THROW_ASSERT_IF(!(wc.wc_flags & IbvLib::WC_WITH_IMM)); iter->second->onRemoteProducedData(wc.imm_data); numRecvs++; break; case IbvLib::WC_RECV: TP_THROW_ASSERT_IF(!(wc.wc_flags & IbvLib::WC_WITH_IMM)); iter->second->onRemoteConsumedData(wc.imm_data); numRecvs++; break; case IbvLib::WC_RDMA_WRITE: iter->second->onWriteCompleted(); numWrites++; break; case IbvLib::WC_SEND: iter->second->onAckCompleted(); numAcks++; break; default: TP_THROW_ASSERT() << "Unknown opcode: " << wc.opcode; } } postRecvRequestsOnSRQ(numRecvs); numAvailableWrites_ += numWrites; while (!pendingQpWrites_.empty() && numAvailableWrites_ > 0) { postWrite( std::get<0>(pendingQpWrites_.front()), std::get<1>(pendingQpWrites_.front())); pendingQpWrites_.pop_front(); } numAvailableAcks_ += numAcks; while (!pendingQpAcks_.empty() && numAvailableAcks_ > 0) { postAck( std::get<0>(pendingQpAcks_.front()), std::get<1>(pendingQpAcks_.front())); pendingQpAcks_.pop_front(); } return true; } bool Reactor::readyToClose() { return queuePairEventHandler_.size() == 0; } void Reactor::registerQp( uint32_t qpn, std::shared_ptr eventHandler) { queuePairEventHandler_.emplace(qpn, std::move(eventHandler)); } void Reactor::unregisterQp(uint32_t qpn) { queuePairEventHandler_.erase(qpn); } void Reactor::postWrite(IbvQueuePair& qp, WriteInfo info) { if (numAvailableWrites_ > 0) { IbvLib::sge list; list.addr = reinterpret_cast(info.addr); list.length = info.length; list.lkey = info.lkey; IbvLib::send_wr wr; std::memset(&wr, 
0, sizeof(wr)); wr.wr_id = kWriteRequestId; wr.sg_list = &list; wr.num_sge = 1; wr.opcode = IbvLib::WR_RDMA_WRITE_WITH_IMM; wr.imm_data = info.length; wr.wr.rdma.remote_addr = info.remoteAddr; wr.wr.rdma.rkey = info.rkey; IbvLib::send_wr* badWr = nullptr; TP_VLOG(9) << "Transport context " << id_ << " posting RDMA write for QP " << qp->qp_num; TP_CHECK_IBV_INT(getIbvLib().post_send(qp.get(), &wr, &badWr)); TP_THROW_ASSERT_IF(badWr != nullptr); numAvailableWrites_--; } else { TP_VLOG(9) << "Transport context " << id_ << " queueing up RDMA write for QP " << qp->qp_num; pendingQpWrites_.emplace_back(qp, info); } } void Reactor::postAck(IbvQueuePair& qp, AckInfo info) { if (numAvailableAcks_ > 0) { IbvLib::send_wr wr; std::memset(&wr, 0, sizeof(wr)); wr.wr_id = kAckRequestId; wr.opcode = IbvLib::WR_SEND_WITH_IMM; wr.imm_data = info.length; IbvLib::send_wr* badWr = nullptr; TP_VLOG(9) << "Transport context " << id_ << " posting send for QP " << qp->qp_num; TP_CHECK_IBV_INT(getIbvLib().post_send(qp.get(), &wr, &badWr)); TP_THROW_ASSERT_IF(badWr != nullptr); numAvailableAcks_--; } else { TP_VLOG(9) << "Transport context " << id_ << " queueing send for QP " << qp->qp_num; pendingQpAcks_.emplace_back(qp, info); } } } // namespace ibv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/ibv/reactor.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace transport { namespace ibv { class IbvEventHandler { public: virtual void onRemoteProducedData(uint32_t length) = 0; virtual void onRemoteConsumedData(uint32_t length) = 0; virtual void onWriteCompleted() = 0; virtual void onAckCompleted() = 0; virtual void onError(IbvLib::wc_status status, uint64_t wrId) = 0; virtual ~IbvEventHandler() = default; }; // Reactor loop. // // Companion class to the event loop in `loop.h` that executes // functions on triggers. The triggers are posted to a shared memory // ring buffer, so this can be done by other processes on the same // machine. It uses extra data in the ring buffer header to store a // mutex and condition variable to avoid a busy loop. // class Reactor final : public BusyPollingLoop { public: Reactor(IbvLib ibvLib, IbvDeviceList deviceList); const IbvLib& getIbvLib() { return ibvLib_; } IbvProtectionDomain& getIbvPd() { return pd_; } IbvCompletionQueue& getIbvCq() { return cq_; } IbvSharedReceiveQueue& getIbvSrq() { return srq_; } const IbvAddress& getIbvAddress() { return addr_; } void registerQp(uint32_t qpn, std::shared_ptr eventHandler); void unregisterQp(uint32_t qpn); struct WriteInfo { void* addr; size_t length; uint32_t lkey; uint64_t remoteAddr; uint32_t rkey; }; void postWrite(IbvQueuePair& qp, WriteInfo info); struct AckInfo { size_t length; }; void postAck(IbvQueuePair& qp, AckInfo info); void setId(std::string id); void close(); void join(); ~Reactor(); protected: bool pollOnce() override; bool readyToClose() override; private: // InfiniBand stuff const IbvLib ibvLib_; IbvContext ctx_; IbvProtectionDomain pd_; IbvCompletionQueue cq_; IbvSharedReceiveQueue srq_; IbvAddress addr_; void postRecvRequestsOnSRQ(int num); std::atomic closed_{false}; std::atomic joined_{false}; // An identifier for the context, composed of the identifier 
for the context, // combined with the transport's name. It will only be used for logging and // debugging purposes. std::string id_{"N/A"}; // The registered event handlers for each queue pair. std::unordered_map> queuePairEventHandler_; uint32_t numAvailableWrites_{kNumPendingWriteReqs}; uint32_t numAvailableAcks_{kNumPendingAckReqs}; std::deque> pendingQpWrites_; std::deque> pendingQpAcks_; }; } // namespace ibv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/ibv/sockaddr.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include #include #include #include namespace tensorpipe { namespace transport { namespace ibv { Sockaddr Sockaddr::createInetSockAddr(const std::string& str) { int port = 0; std::string addrStr; std::string portStr; // If the input string is an IPv6 address with port, the address // itself must be wrapped with brackets. if (addrStr.empty()) { auto start = str.find("["); auto stop = str.find("]"); if (start < stop && start != std::string::npos && stop != std::string::npos) { addrStr = str.substr(start + 1, stop - (start + 1)); if (stop + 1 < str.size() && str[stop + 1] == ':') { portStr = str.substr(stop + 2); } } } // If the input string is an IPv4 address with port, we expect // at least a single period and a single colon in the string. if (addrStr.empty()) { auto period = str.find("."); auto colon = str.find(":"); if (period != std::string::npos && colon != std::string::npos) { addrStr = str.substr(0, colon); portStr = str.substr(colon + 1); } } // Fallback to using entire input string as address without port. if (addrStr.empty()) { addrStr = str; } // Parse port number if specified. 
if (!portStr.empty()) { port = std::stoi(portStr); if (port < 0 || port > std::numeric_limits::max()) { TP_THROW_EINVAL() << str; } } // Try to convert an IPv4 address. { struct sockaddr_in addr; std::memset(&addr, 0, sizeof(addr)); auto rv = inet_pton(AF_INET, addrStr.c_str(), &addr.sin_addr); TP_THROW_SYSTEM_IF(rv < 0, errno); if (rv == 1) { addr.sin_family = AF_INET; addr.sin_port = ntohs(port); return Sockaddr(reinterpret_cast(&addr), sizeof(addr)); } } // Try to convert an IPv6 address. { struct sockaddr_in6 addr; std::memset(&addr, 0, sizeof(addr)); auto interfacePos = addrStr.find('%'); if (interfacePos != std::string::npos) { addr.sin6_scope_id = if_nametoindex(addrStr.substr(interfacePos + 1).c_str()); addrStr = addrStr.substr(0, interfacePos); } auto rv = inet_pton(AF_INET6, addrStr.c_str(), &addr.sin6_addr); TP_THROW_SYSTEM_IF(rv < 0, errno); if (rv == 1) { addr.sin6_family = AF_INET6; addr.sin6_port = ntohs(port); return Sockaddr(reinterpret_cast(&addr), sizeof(addr)); } } // Invalid address. TP_THROW_EINVAL() << str; // Return bogus to silence "return from non-void function" warning. // Note: we don't reach this point per the throw above. 
return Sockaddr(nullptr, 0); } std::string Sockaddr::str() const { std::ostringstream oss; if (addr_.ss_family == AF_INET) { std::array buf; auto in = reinterpret_cast(&addr_); auto rv = inet_ntop(AF_INET, &in->sin_addr, buf.data(), buf.size()); TP_THROW_SYSTEM_IF(rv == nullptr, errno); oss << buf.data() << ":" << htons(in->sin_port); } else if (addr_.ss_family == AF_INET6) { std::array buf; auto in6 = reinterpret_cast(&addr_); auto rv = inet_ntop(AF_INET6, &in6->sin6_addr, buf.data(), buf.size()); TP_THROW_SYSTEM_IF(rv == nullptr, errno); oss << "[" << buf.data(); if (in6->sin6_scope_id > 0) { std::array scopeBuf; rv = if_indextoname(in6->sin6_scope_id, scopeBuf.data()); TP_THROW_SYSTEM_IF(rv == nullptr, errno); oss << "%" << scopeBuf.data(); } oss << "]:" << htons(in6->sin6_port); } else { TP_THROW_EINVAL() << "invalid address family: " << addr_.ss_family; } return oss.str(); } } // namespace ibv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/ibv/sockaddr.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include namespace tensorpipe { namespace transport { namespace ibv { class Sockaddr final : public tensorpipe::Sockaddr { public: static Sockaddr createInetSockAddr(const std::string& str); Sockaddr(const struct sockaddr* addr, socklen_t addrlen) { TP_ARG_CHECK(addr != nullptr); TP_ARG_CHECK_LE(addrlen, sizeof(addr_)); // Ensure the sockaddr_storage is zeroed, because we don't always // write to all fields in the `sockaddr_[in|in6]` structures. 
std::memset(&addr_, 0, sizeof(addr_)); std::memcpy(&addr_, addr, addrlen); addrlen_ = addrlen; } inline const struct sockaddr* addr() const override { return reinterpret_cast(&addr_); } inline struct sockaddr* addr() { return reinterpret_cast(&addr_); } inline socklen_t addrlen() const override { return addrlen_; } std::string str() const; private: struct sockaddr_storage addr_; socklen_t addrlen_; }; } // namespace ibv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/ibv/utility.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace transport { namespace ibv { namespace { struct InterfaceAddressesDeleter { void operator()(struct ifaddrs* ptr) { ::freeifaddrs(ptr); } }; using InterfaceAddresses = std::unique_ptr; std::tuple createInterfaceAddresses() { struct ifaddrs* ifaddrs; auto rv = ::getifaddrs(&ifaddrs); if (rv < 0) { return std::make_tuple( TP_CREATE_ERROR(SystemError, "getifaddrs", errno), InterfaceAddresses()); } return std::make_tuple(Error::kSuccess, InterfaceAddresses(ifaddrs)); } std::tuple getHostname() { std::array hostname; auto rv = ::gethostname(hostname.data(), hostname.size()); if (rv < 0) { return std::make_tuple( TP_CREATE_ERROR(SystemError, "gethostname", errno), std::string()); } return std::make_tuple(Error::kSuccess, std::string(hostname.data())); } struct AddressInfoDeleter { void operator()(struct addrinfo* ptr) { ::freeaddrinfo(ptr); } }; using AddressInfo = std::unique_ptr; std::tuple createAddressInfo(std::string host) { struct addrinfo hints; 
std::memset(&hints, 0, sizeof(hints)); hints.ai_family = AF_UNSPEC; hints.ai_socktype = SOCK_STREAM; hints.ai_protocol = IPPROTO_TCP; struct addrinfo* result; auto rv = ::getaddrinfo(host.c_str(), nullptr, &hints, &result); if (rv != 0) { return std::make_tuple( TP_CREATE_ERROR(GetaddrinfoError, rv), AddressInfo()); } return std::make_tuple(Error::kSuccess, AddressInfo(result)); } } // namespace std::tuple lookupAddrForIface(std::string iface) { Error error; InterfaceAddresses addresses; std::tie(error, addresses) = createInterfaceAddresses(); if (error) { return std::make_tuple(std::move(error), std::string()); } struct ifaddrs* ifa; for (ifa = addresses.get(); ifa != nullptr; ifa = ifa->ifa_next) { // Skip entry if ifa_addr is NULL (see getifaddrs(3)) if (ifa->ifa_addr == nullptr) { continue; } if (iface != ifa->ifa_name) { continue; } switch (ifa->ifa_addr->sa_family) { case AF_INET: return std::make_tuple( Error::kSuccess, Sockaddr(ifa->ifa_addr, sizeof(struct sockaddr_in)).str()); case AF_INET6: return std::make_tuple( Error::kSuccess, Sockaddr(ifa->ifa_addr, sizeof(struct sockaddr_in6)).str()); } } return std::make_tuple(TP_CREATE_ERROR(NoAddrFoundError), std::string()); } std::tuple lookupAddrForHostname() { Error error; std::string hostname; std::tie(error, hostname) = getHostname(); if (error) { return std::make_tuple(std::move(error), std::string()); } AddressInfo info; std::tie(error, info) = createAddressInfo(std::move(hostname)); if (error) { return std::make_tuple(std::move(error), std::string()); } Error firstError; for (struct addrinfo* rp = info.get(); rp != nullptr; rp = rp->ai_next) { TP_DCHECK(rp->ai_family == AF_INET || rp->ai_family == AF_INET6); TP_DCHECK_EQ(rp->ai_socktype, SOCK_STREAM); TP_DCHECK_EQ(rp->ai_protocol, IPPROTO_TCP); Sockaddr addr = Sockaddr(rp->ai_addr, rp->ai_addrlen); Socket socket; std::tie(error, socket) = Socket::createForFamily(rp->ai_family); if (!error) { error = socket.bind(addr); } if (error) { // Record the first 
binding error we encounter and return that in the end // if no working address is found, in order to help with debugging. if (!firstError) { firstError = error; } continue; } return std::make_tuple(Error::kSuccess, addr.str()); } if (firstError) { return std::make_tuple(std::move(firstError), std::string()); } else { return std::make_tuple(TP_CREATE_ERROR(NoAddrFoundError), std::string()); } } } // namespace ibv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/ibv/utility.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include namespace tensorpipe { namespace transport { namespace ibv { std::tuple lookupAddrForIface(std::string iface); std::tuple lookupAddrForHostname(); } // namespace ibv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/listener.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include namespace tensorpipe { namespace transport { class Listener { public: using accept_callback_fn = std::function< void(const Error& error, std::shared_ptr connection)>; virtual void accept(accept_callback_fn fn) = 0; // Return address that this listener is listening on. // This may be required if the listening address is not known up // front, or dynamically populated by the operating system (e.g. by // letting the operating system pick a TCP port to listen on). 
virtual std::string addr() const = 0; // Tell the listener what its identifier is. // // This is only supposed to be called from the high-level listener or from // channel contexts. It will only used for logging and debugging purposes. virtual void setId(std::string id) = 0; virtual void close() = 0; virtual ~Listener() = default; }; } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/listener_boilerplate.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include namespace tensorpipe { namespace transport { template class ListenerBoilerplate : public Listener { public: template ListenerBoilerplate( typename ListenerImplBoilerplate::ConstructorToken token, std::shared_ptr context, std::string id, Args... args); explicit ListenerBoilerplate(std::shared_ptr listener); ListenerBoilerplate(const ListenerBoilerplate&) = delete; ListenerBoilerplate(ListenerBoilerplate&&) = delete; ListenerBoilerplate& operator=(const ListenerBoilerplate&) = delete; ListenerBoilerplate& operator=(ListenerBoilerplate&&) = delete; // Queue a callback to be called when a connection comes in. void accept(accept_callback_fn fn) override; // Obtain the listener's address. std::string addr() const override; // Tell the listener what its identifier is. void setId(std::string id) override; // Shut down the connection and its resources. void close() override; ~ListenerBoilerplate() override; protected: // Using a shared_ptr allows us to detach the lifetime of the implementation // from the public object's one and perform the destruction asynchronously. 
const std::shared_ptr impl_; }; template template ListenerBoilerplate::ListenerBoilerplate( typename ListenerImplBoilerplate::ConstructorToken token, std::shared_ptr context, std::string id, Args... args) : impl_(std::make_shared( token, std::move(context), std::move(id), std::forward(args)...)) { static_assert( std::is_base_of, TList>:: value, ""); impl_->init(); } template ListenerBoilerplate::ListenerBoilerplate( std::shared_ptr listener) : impl_(std::move(listener)) { static_assert( std::is_base_of, TList>:: value, ""); } template void ListenerBoilerplate::accept(accept_callback_fn fn) { if (unlikely(!impl_)) { // FIXME In C++-17 perhaps a global static inline variable would be better? static Error error = TP_CREATE_ERROR(ContextNotViableError); fn(error, nullptr); return; } impl_->accept(std::move(fn)); } template std::string ListenerBoilerplate::addr() const { if (unlikely(!impl_)) { return ""; } return impl_->addr(); } template void ListenerBoilerplate::setId(std::string id) { if (unlikely(!impl_)) { return; } impl_->setId(std::move(id)); } template void ListenerBoilerplate::close() { if (unlikely(!impl_)) { return; } impl_->close(); } template ListenerBoilerplate::~ListenerBoilerplate() { close(); } } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/listener_impl_boilerplate.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace transport { template class ListenerImplBoilerplate : public std::enable_shared_from_this { public: class ConstructorToken { public: ConstructorToken(const ConstructorToken&) = default; private: explicit ConstructorToken() {} friend ContextImplBoilerplate; friend ListenerImplBoilerplate; }; ListenerImplBoilerplate( ConstructorToken token, std::shared_ptr context, std::string id); ListenerImplBoilerplate(const ListenerImplBoilerplate&) = delete; ListenerImplBoilerplate(ListenerImplBoilerplate&&) = delete; ListenerImplBoilerplate& operator=(const ListenerImplBoilerplate&) = delete; ListenerImplBoilerplate& operator=(ListenerImplBoilerplate&&) = delete; // Initialize member fields that need `shared_from_this`. void init(); // Queue a callback to be called when a connection comes in. using accept_callback_fn = Listener::accept_callback_fn; void accept(accept_callback_fn fn); // Obtain the listener's address. std::string addr() const; // Tell the listener what its identifier is. void setId(std::string id); // Shut down the listener and its resources. void close(); virtual ~ListenerImplBoilerplate() = default; protected: virtual void initImplFromLoop() = 0; virtual void acceptImplFromLoop(accept_callback_fn fn) = 0; virtual std::string addrImplFromLoop() const = 0; virtual void handleErrorImpl() = 0; void setError(Error error); const std::shared_ptr context_; Error error_{Error::kSuccess}; template std::shared_ptr createAndInitConnection(Args&&... args); // An identifier for the listener, composed of the identifier for the context, // combined with an increasing sequence number. It will be used as a prefix // for the identifiers of connections. All of them will only be used for // logging and debugging purposes. std::string id_; private: // Initialize member fields that need `shared_from_this`. 
void initFromLoop(); // Queue a callback to be called when a connection comes in. void acceptFromLoop(accept_callback_fn fn); // Obtain the listener's address. std::string addrFromLoop() const; void setIdFromLoop(std::string id); // Shut down the connection and its resources. void closeFromLoop(); // Deal with an error. void handleError(); // A sequence number for the calls to accept. uint64_t nextConnectionBeingAccepted_{0}; // Sequence numbers for the connections created by this listener, used to // create their identifiers based off this listener's identifier. They will // only be used for logging and debugging. std::atomic connectionCounter_{0}; // Contexts do sometimes need to call directly into closeFromLoop, in order to // make sure that some of their operations can happen "atomically" on the // connection, without possibly other operations occurring in between (e.g., // an error). friend ContextImplBoilerplate; }; template ListenerImplBoilerplate::ListenerImplBoilerplate( ConstructorToken /* unused */, std::shared_ptr context, std::string id) : context_(std::move(context)), id_(std::move(id)) {} template void ListenerImplBoilerplate::init() { context_->deferToLoop( [impl{this->shared_from_this()}]() { impl->initFromLoop(); }); } template void ListenerImplBoilerplate::initFromLoop() { if (context_->closed()) { // Set the error without calling setError because we do not want to invoke // the subclass's handleErrorImpl as it would find itself in a weird state // (since initFromLoop wouldn't have been called). 
error_ = TP_CREATE_ERROR(ListenerClosedError); TP_VLOG(7) << "Listener " << id_ << " is closing (without initing)"; return; } initImplFromLoop(); } template void ListenerImplBoilerplate::accept( accept_callback_fn fn) { context_->deferToLoop( [impl{this->shared_from_this()}, fn{std::move(fn)}]() mutable { impl->acceptFromLoop(std::move(fn)); }); } template void ListenerImplBoilerplate::acceptFromLoop( accept_callback_fn fn) { TP_DCHECK(context_->inLoop()); uint64_t sequenceNumber = nextConnectionBeingAccepted_++; TP_VLOG(7) << "Listener " << id_ << " received an accept request (#" << sequenceNumber << ")"; fn = [this, sequenceNumber, fn{std::move(fn)}]( const Error& error, std::shared_ptr connection) { TP_VLOG(7) << "Listener " << id_ << " is calling an accept callback (#" << sequenceNumber << ")"; fn(error, std::move(connection)); TP_VLOG(7) << "Listener " << id_ << " done calling an accept callback (#" << sequenceNumber << ")"; }; if (error_) { fn(error_, std::shared_ptr()); return; } acceptImplFromLoop(std::move(fn)); } template std::string ListenerImplBoilerplate::addr() const { std::string addr; context_->runInLoop([this, &addr]() { addr = addrFromLoop(); }); return addr; } template std::string ListenerImplBoilerplate::addrFromLoop() const { TP_DCHECK(context_->inLoop()); return addrImplFromLoop(); } template template std::shared_ptr ListenerImplBoilerplate:: createAndInitConnection(Args&&... 
args) { TP_DCHECK(context_->inLoop()); std::string connectionId = id_ + ".c" + std::to_string(connectionCounter_++); TP_VLOG(7) << "Listener " << id_ << " is opening connection " << connectionId; auto connection = std::make_shared( typename ConnectionImplBoilerplate:: ConstructorToken(), context_, std::move(connectionId), std::forward(args)...); // We initialize the connection from the loop immediately, inline, because the // initialization of a connection accepted by a listener typically happens // partly in the listener (e.g., opening and accepting the socket) and partly // in the connection's initFromLoop, and we need these two steps to happen // "atomicically" to make it impossible for an error to occur in between. connection->initFromLoop(); return std::make_shared>( std::move(connection)); } template void ListenerImplBoilerplate::setId(std::string id) { context_->deferToLoop( [impl{this->shared_from_this()}, id{std::move(id)}]() mutable { impl->setIdFromLoop(std::move(id)); }); } template void ListenerImplBoilerplate::setIdFromLoop( std::string id) { TP_DCHECK(context_->inLoop()); TP_VLOG(7) << "Listener " << id_ << " was renamed to " << id; id_ = std::move(id); } template void ListenerImplBoilerplate::close() { context_->deferToLoop( [impl{this->shared_from_this()}]() { impl->closeFromLoop(); }); } template void ListenerImplBoilerplate::closeFromLoop() { TP_DCHECK(context_->inLoop()); TP_VLOG(7) << "Listener " << id_ << " is closing"; setError(TP_CREATE_ERROR(ListenerClosedError)); } template void ListenerImplBoilerplate::setError(Error error) { // Don't overwrite an error that's already set. 
if (error_ || !error) { return; } error_ = std::move(error); handleError(); } template void ListenerImplBoilerplate::handleError() { TP_DCHECK(context_->inLoop()); TP_VLOG(8) << "Listener " << id_ << " is handling error " << error_.what(); handleErrorImpl(); } } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/shm/connection_impl.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace transport { namespace shm { ConnectionImpl::ConnectionImpl( ConstructorToken token, std::shared_ptr context, std::string id, Socket socket) : ConnectionImplBoilerplate( token, std::move(context), std::move(id)), socket_(std::move(socket)) {} ConnectionImpl::ConnectionImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::string addr) : ConnectionImplBoilerplate( token, std::move(context), std::move(id)), sockaddr_(Sockaddr::createAbstractUnixAddr(addr)) {} void ConnectionImpl::initImplFromLoop() { context_->enroll(*this); Error error; // The connection either got a socket or an address, but not both. TP_DCHECK(socket_.hasValue() ^ sockaddr_.has_value()); if (!socket_.hasValue()) { std::tie(error, socket_) = Socket::createForFamily(AF_UNIX); if (error) { setError(std::move(error)); return; } error = socket_.connect(sockaddr_.value()); if (error) { setError(std::move(error)); return; } } // Ensure underlying control socket is non-blocking such that it // works well with event driven I/O. error = socket_.block(false); if (error) { setError(std::move(error)); return; } // Create ringbuffer for inbox. 
std::tie(error, inboxHeaderSegment_, inboxDataSegment_, inboxRb_) = createShmRingBuffer(kBufferSize); TP_THROW_ASSERT_IF(error) << "Couldn't allocate ringbuffer for connection inbox: " << error.what(); // Register method to be called when our peer writes to our inbox. inboxReactorToken_ = context_->addReaction([this]() { TP_VLOG(9) << "Connection " << id_ << " is reacting to the peer writing to the inbox"; processReadOperationsFromLoop(); }); // Register method to be called when our peer reads from our outbox. outboxReactorToken_ = context_->addReaction([this]() { TP_VLOG(9) << "Connection " << id_ << " is reacting to the peer reading from the outbox"; processWriteOperationsFromLoop(); }); // We're sending file descriptors first, so wait for writability. state_ = SEND_FDS; context_->registerDescriptor(socket_.fd(), EPOLLOUT, shared_from_this()); } void ConnectionImpl::readImplFromLoop(read_callback_fn fn) { readOperations_.emplace_back(std::move(fn)); // If the inbox already contains some data, we may be able to process this // operation right away. processReadOperationsFromLoop(); } void ConnectionImpl::readImplFromLoop( AbstractNopHolder& object, read_nop_callback_fn fn) { readOperations_.emplace_back( &object, [fn{std::move(fn)}]( const Error& error, const void* /* unused */, size_t /* unused */) { fn(error); }); // If the inbox already contains some data, we may be able to process this // operation right away. processReadOperationsFromLoop(); } void ConnectionImpl::readImplFromLoop( void* ptr, size_t length, read_callback_fn fn) { readOperations_.emplace_back(ptr, length, std::move(fn)); // If the inbox already contains some data, we may be able to process this // operation right away. 
processReadOperationsFromLoop(); } void ConnectionImpl::writeImplFromLoop( const void* ptr, size_t length, write_callback_fn fn) { writeOperations_.emplace_back(ptr, length, std::move(fn)); // If the outbox has some free space, we may be able to process this operation // right away. processWriteOperationsFromLoop(); } void ConnectionImpl::writeImplFromLoop( const AbstractNopHolder& object, write_callback_fn fn) { writeOperations_.emplace_back(&object, std::move(fn)); // If the outbox has some free space, we may be able to process this operation // right away. processWriteOperationsFromLoop(); } void ConnectionImpl::handleEventsFromLoop(int events) { TP_DCHECK(context_->inLoop()); TP_VLOG(9) << "Connection " << id_ << " is handling an event on its socket (" << EpollLoop::formatEpollEvents(events) << ")"; // Handle only one of the events in the mask. Events on the control // file descriptor are rare enough for the cost of having epoll call // into this function multiple times to not matter. The benefit is // that every handler can close and unregister the control file // descriptor from the event loop, without worrying about the next // handler trying to do so as well. // In some cases the socket could be in a state where it's both in an error // state and readable/writable. If we checked for EPOLLIN or EPOLLOUT first // and then returned after handling them, we would keep doing so forever and // never reach the error handling. So we should keep the error check first. 
if (events & EPOLLERR) { int error; socklen_t errorlen = sizeof(error); int rv = getsockopt( socket_.fd(), SOL_SOCKET, SO_ERROR, reinterpret_cast(&error), &errorlen); if (rv == -1) { setError(TP_CREATE_ERROR(SystemError, "getsockopt", rv)); } else { setError(TP_CREATE_ERROR(SystemError, "async error on socket", error)); } return; } if (events & EPOLLIN) { handleEventInFromLoop(); return; } if (events & EPOLLOUT) { handleEventOutFromLoop(); return; } // Check for hangup last, as there could be cases where we get EPOLLHUP but // there's still data to be read from the socket, so we want to deal with that // before dealing with the hangup. if (events & EPOLLHUP) { setError(TP_CREATE_ERROR(EOFError)); return; } } void ConnectionImpl::handleEventInFromLoop() { TP_DCHECK(context_->inLoop()); if (state_ == RECV_FDS) { Fd reactorHeaderFd; Fd reactorDataFd; Fd outboxHeaderFd; Fd outboxDataFd; Reactor::TToken peerInboxReactorToken; Reactor::TToken peerOutboxReactorToken; // Receive the reactor token, reactor fds, and inbox fds. auto err = socket_.recvPayloadAndFds( peerInboxReactorToken, peerOutboxReactorToken, reactorHeaderFd, reactorDataFd, outboxHeaderFd, outboxDataFd); if (err) { setError(std::move(err)); return; } // Load ringbuffer for outbox. std::tie(err, outboxHeaderSegment_, outboxDataSegment_, outboxRb_) = loadShmRingBuffer( std::move(outboxHeaderFd), std::move(outboxDataFd)); TP_THROW_ASSERT_IF(err) << "Couldn't access ringbuffer of connection outbox: " << err.what(); // Initialize remote reactor trigger. peerReactorTrigger_.emplace( std::move(reactorHeaderFd), std::move(reactorDataFd)); peerInboxReactorToken_ = peerInboxReactorToken; peerOutboxReactorToken_ = peerOutboxReactorToken; // The connection is usable now. state_ = ESTABLISHED; processWriteOperationsFromLoop(); // Trigger read operations in case a pair of local read() and remote // write() happened before connection is established. Otherwise read() // callback would lose if it's the only read() request. 
processReadOperationsFromLoop(); return; } if (state_ == ESTABLISHED) { // We don't expect to read anything on this socket once the // connection has been established. If we do, assume it's a // zero-byte read indicating EOF. setError(TP_CREATE_ERROR(EOFError)); return; } TP_THROW_ASSERT() << "EPOLLIN event not handled in state " << state_; } void ConnectionImpl::handleEventOutFromLoop() { TP_DCHECK(context_->inLoop()); if (state_ == SEND_FDS) { int reactorHeaderFd; int reactorDataFd; std::tie(reactorHeaderFd, reactorDataFd) = context_->reactorFds(); // Send our reactor token, reactor fds, and inbox fds. auto err = socket_.sendPayloadAndFds( inboxReactorToken_.value(), outboxReactorToken_.value(), reactorHeaderFd, reactorDataFd, inboxHeaderSegment_.getFd(), inboxDataSegment_.getFd()); if (err) { setError(std::move(err)); return; } // Sent our fds. Wait for fds from peer. state_ = RECV_FDS; context_->registerDescriptor(socket_.fd(), EPOLLIN, shared_from_this()); return; } TP_THROW_ASSERT() << "EPOLLOUT event not handled in state " << state_; } void ConnectionImpl::processReadOperationsFromLoop() { TP_DCHECK(context_->inLoop()); // Process all read read operations that we can immediately serve, only // when connection is established. 
if (state_ != ESTABLISHED) { return; } // Serve read operations Consumer inboxConsumer(inboxRb_); while (!readOperations_.empty()) { RingbufferReadOperation& readOperation = readOperations_.front(); if (readOperation.handleRead(inboxConsumer) > 0) { peerReactorTrigger_->run(peerOutboxReactorToken_.value()); } if (readOperation.completed()) { readOperations_.pop_front(); } else { break; } } } void ConnectionImpl::processWriteOperationsFromLoop() { TP_DCHECK(context_->inLoop()); if (state_ != ESTABLISHED) { return; } Producer outboxProducer(outboxRb_); while (!writeOperations_.empty()) { RingbufferWriteOperation& writeOperation = writeOperations_.front(); if (writeOperation.handleWrite(outboxProducer) > 0) { peerReactorTrigger_->run(peerInboxReactorToken_.value()); } if (writeOperation.completed()) { writeOperations_.pop_front(); } else { break; } } } void ConnectionImpl::handleErrorImpl() { for (auto& readOperation : readOperations_) { readOperation.handleError(error_); } readOperations_.clear(); for (auto& writeOperation : writeOperations_) { writeOperation.handleError(error_); } writeOperations_.clear(); if (inboxReactorToken_.has_value()) { context_->removeReaction(inboxReactorToken_.value()); inboxReactorToken_.reset(); } if (outboxReactorToken_.has_value()) { context_->removeReaction(outboxReactorToken_.value()); outboxReactorToken_.reset(); } if (socket_.hasValue()) { if (state_ > INITIALIZING) { context_->unregisterDescriptor(socket_.fd()); } socket_.reset(); } context_->unenroll(*this); } } // namespace shm } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/shm/connection_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace transport { namespace shm { class ContextImpl; class ListenerImpl; class ConnectionImpl final : public ConnectionImplBoilerplate< ContextImpl, ListenerImpl, ConnectionImpl>, public EpollLoop::EventHandler { constexpr static size_t kBufferSize = 2 * 1024 * 1024; constexpr static int kNumRingbufferRoles = 2; using Consumer = RingBufferRole; using Producer = RingBufferRole; enum State { INITIALIZING = 1, SEND_FDS, RECV_FDS, ESTABLISHED, }; public: // Create a connection that is already connected (e.g. from a listener). ConnectionImpl( ConstructorToken token, std::shared_ptr context, std::string id, Socket socket); // Create a connection that connects to the specified address. ConnectionImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::string addr); // Implementation of EventHandler. void handleEventsFromLoop(int events) override; protected: // Implement the entry points called by ConnectionImplBoilerplate. void initImplFromLoop() override; void readImplFromLoop(read_callback_fn fn) override; void readImplFromLoop(AbstractNopHolder& object, read_nop_callback_fn fn) override; void readImplFromLoop(void* ptr, size_t length, read_callback_fn fn) override; void writeImplFromLoop(const void* ptr, size_t length, write_callback_fn fn) override; void writeImplFromLoop(const AbstractNopHolder& object, write_callback_fn fn) override; void handleErrorImpl() override; private: // Handle events of type EPOLLIN on the UNIX domain socket. // // The only data that is expected on that socket is the file descriptors for // the other side's inbox (which is this side's outbox) and its reactor, plus // the reactor tokens to trigger the other side to read or write. void handleEventInFromLoop(); // Handle events of type EPOLLOUT on the UNIX domain socket. 
// // Once the socket is writable we send the file descriptors for this side's // inbox (which the other side's outbox) and our reactor, plus the reactor // tokens to trigger this connection to read or write. void handleEventOutFromLoop(); State state_{INITIALIZING}; Socket socket_; optional sockaddr_; // Inbox. ShmSegment inboxHeaderSegment_; ShmSegment inboxDataSegment_; RingBuffer inboxRb_; optional inboxReactorToken_; // Outbox. ShmSegment outboxHeaderSegment_; ShmSegment outboxDataSegment_; RingBuffer outboxRb_; optional outboxReactorToken_; // Peer trigger/tokens. optional peerReactorTrigger_; optional peerInboxReactorToken_; optional peerOutboxReactorToken_; // Pending read operations. std::deque readOperations_; // Pending write operations. std::deque writeOperations_; // Process pending read operations if in an operational state. // // This may be triggered by the other side of the connection (by pushing this // side's inbox token to the reactor) when it has written some new data to its // outbox (which is this side's inbox). It is also called by this connection // when it moves into an established state or when a new read operation is // queued, in case data was already available before this connection was ready // to consume it. void processReadOperationsFromLoop(); // Process pending write operations if in an operational state. // // This may be triggered by the other side of the connection (by pushing this // side's outbox token to the reactor) when it has read some data from its // inbox (which is this side's outbox). This is important when some of this // side's writes couldn't complete because the outbox was full, and thus they // needed to wait for some of its data to be read. This method is also called // by this connection when it moves into an established state, in case some // writes were queued before the connection was ready to process them, or when // a new write operation is queued. 
void processWriteOperationsFromLoop(); }; } // namespace shm } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/shm/context_impl.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include #include namespace tensorpipe { namespace transport { namespace shm { namespace { // Prepend descriptor with transport name so it's easy to // disambiguate descriptors when debugging. const std::string kDomainDescriptorPrefix{"shm:"}; } // namespace std::shared_ptr ContextImpl::create() { std::ostringstream oss; oss << kDomainDescriptorPrefix; // This transport only works across processes on the same machine, and we // detect that by computing the boot ID. auto bootID = getBootID(); TP_THROW_ASSERT_IF(!bootID.has_value()) << "Unable to read boot_id"; oss << bootID.value(); // This transport bootstraps a connection by opening a UNIX domain socket, for // which it uses an "abstract" address (i.e., just an identifier, which is not // materialized to a filesystem path). In order for the two endpoints to // access each other's address they must be in the same Linux kernel network // namespace (see network_namespaces(7)). auto nsID = getLinuxNamespaceId(LinuxNamespace::kNet); if (!nsID.has_value()) { TP_VLOG(8) << "Unable to read net namespace ID"; return nullptr; } oss << '_' << nsID.value(); // Over that UNIX domain socket, the two endpoints exchange file descriptors // to regions of shared memory. Some restrictions may be in place that prevent // allocating such regions, hence let's allocate one here to see if it works. 
Error error; ShmSegment segment; std::tie(error, segment) = ShmSegment::alloc(1024 * 1024); if (error) { TP_VLOG(8) << "Couldn't allocate shared memory segment: " << error.what(); return nullptr; } // A separate problem is that /dev/shm may be sized too small for all the // memory we need to allocate. However, our memory usage is unbounded, as it // grows as we open more connections, hence we cannot check it in advance. std::string domainDescriptor = oss.str(); TP_VLOG(8) << "The domain descriptor for SHM is " << domainDescriptor; return std::make_shared(std::move(domainDescriptor)); } ContextImpl::ContextImpl(std::string domainDescriptor) : ContextImplBoilerplate( std::move(domainDescriptor)) {} void ContextImpl::handleErrorImpl() { loop_.close(); reactor_.close(); } void ContextImpl::joinImpl() { loop_.join(); reactor_.join(); } bool ContextImpl::inLoop() const { return reactor_.inLoop(); }; void ContextImpl::deferToLoop(std::function fn) { reactor_.deferToLoop(std::move(fn)); }; void ContextImpl::registerDescriptor( int fd, int events, std::shared_ptr h) { loop_.registerDescriptor(fd, events, std::move(h)); } void ContextImpl::unregisterDescriptor(int fd) { loop_.unregisterDescriptor(fd); } ContextImpl::TToken ContextImpl::addReaction(TFunction fn) { return reactor_.add(std::move(fn)); } void ContextImpl::removeReaction(TToken token) { reactor_.remove(token); } std::tuple ContextImpl::reactorFds() { return reactor_.fds(); } } // namespace shm } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/shm/context_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include #include #include #include #include namespace tensorpipe { namespace transport { namespace shm { class ConnectionImpl; class ListenerImpl; class ContextImpl final : public ContextImplBoilerplate { public: static std::shared_ptr create(); explicit ContextImpl(std::string domainDescriptor); // Implement the DeferredExecutor interface. bool inLoop() const override; void deferToLoop(std::function fn) override; void registerDescriptor( int fd, int events, std::shared_ptr h); void unregisterDescriptor(int fd); using TToken = uint32_t; using TFunction = std::function; TToken addReaction(TFunction fn); void removeReaction(TToken token); std::tuple reactorFds(); protected: // Implement the entry points called by ContextImplBoilerplate. void handleErrorImpl() override; void joinImpl() override; private: Reactor reactor_; EpollLoop loop_{this->reactor_}; }; } // namespace shm } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/shm/factory.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include namespace tensorpipe { namespace transport { namespace shm { std::shared_ptr create() { return std::make_shared< ContextBoilerplate>(); } } // namespace shm } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/shm/factory.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include namespace tensorpipe { namespace transport { namespace shm { std::shared_ptr create(); } // namespace shm } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/shm/listener_impl.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace transport { namespace shm { ListenerImpl::ListenerImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::string addr) : ListenerImplBoilerplate( token, std::move(context), std::move(id)), sockaddr_(Sockaddr::createAbstractUnixAddr(addr)) {} void ListenerImpl::initImplFromLoop() { context_->enroll(*this); Error error; TP_DCHECK(!socket_.hasValue()); std::tie(error, socket_) = Socket::createForFamily(AF_UNIX); if (error) { setError(std::move(error)); return; } error = socket_.bind(sockaddr_); if (error) { setError(std::move(error)); return; } error = socket_.block(false); if (error) { setError(std::move(error)); return; } error = socket_.listen(128); if (error) { setError(std::move(error)); return; } struct sockaddr_storage addr; socklen_t addrlen; std::tie(error, addr, addrlen) = socket_.getSockName(); if (error) { setError(std::move(error)); return; } sockaddr_ = Sockaddr(reinterpret_cast(&addr), addrlen); } void ListenerImpl::handleErrorImpl() { if (!fns_.empty()) { context_->unregisterDescriptor(socket_.fd()); } socket_.reset(); for (auto& fn : fns_) { fn(error_, std::shared_ptr()); } fns_.clear(); context_->unenroll(*this); } void ListenerImpl::acceptImplFromLoop(accept_callback_fn fn) { fns_.push_back(std::move(fn)); // Only register if we go from 0 to 
1 pending callbacks. In other cases we // already had a pending callback and thus we were already registered. if (fns_.size() == 1) { // Register with loop for readability events. context_->registerDescriptor(socket_.fd(), EPOLLIN, shared_from_this()); } } std::string ListenerImpl::addrImplFromLoop() const { return sockaddr_.str(); } void ListenerImpl::handleEventsFromLoop(int events) { TP_DCHECK(context_->inLoop()); TP_VLOG(9) << "Listener " << id_ << " is handling an event on its socket (" << EpollLoop::formatEpollEvents(events) << ")"; if (events & EPOLLERR) { int error; socklen_t errorlen = sizeof(error); int rv = getsockopt( socket_.fd(), SOL_SOCKET, SO_ERROR, reinterpret_cast(&error), &errorlen); if (rv == -1) { setError(TP_CREATE_ERROR(SystemError, "getsockopt", rv)); } else { setError(TP_CREATE_ERROR(SystemError, "async error on socket", error)); } return; } if (events & EPOLLHUP) { setError(TP_CREATE_ERROR(EOFError)); return; } TP_ARG_CHECK_EQ(events, EPOLLIN); Error error; Socket socket; std::tie(error, socket) = socket_.accept(); if (error) { setError(std::move(error)); return; } TP_DCHECK(!fns_.empty()) << "when the callback is disarmed the listener's descriptor is supposed " << "to be unregistered"; auto fn = std::move(fns_.front()); fns_.pop_front(); if (fns_.empty()) { context_->unregisterDescriptor(socket_.fd()); } fn(Error::kSuccess, createAndInitConnection(std::move(socket))); } } // namespace shm } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/shm/listener_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include #include #include #include #include #include namespace tensorpipe { namespace transport { namespace shm { class ConnectionImpl; class ContextImpl; class ListenerImpl final : public ListenerImplBoilerplate, public EpollLoop::EventHandler { public: // Create a listener that listens on the specified address. ListenerImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::string addr); // Implementation of EventHandler. void handleEventsFromLoop(int events) override; protected: // Implement the entry points called by ListenerImplBoilerplate. void initImplFromLoop() override; void acceptImplFromLoop(accept_callback_fn fn) override; std::string addrImplFromLoop() const override; void handleErrorImpl() override; private: Socket socket_; Sockaddr sockaddr_; std::deque fns_; }; } // namespace shm } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/shm/reactor.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include namespace tensorpipe { namespace transport { namespace shm { namespace { void writeToken(Reactor::Producer& producer, Reactor::TToken token) { for (;;) { auto rv = producer.write(&token, sizeof(token)); if (rv == -EAGAIN) { // There's contention on the spin-lock, wait for it by retrying. std::this_thread::yield(); continue; } if (rv == -ENODATA) { // The ringbuffer is full. Retrying should typically work, but might lead // to a deadlock if, for example, a reactor thread is trying to write a // token to its own ringbuffer, as then it would be stuck here and never // proceed to consume data from the ringbuffer. This could also happen // across multiple processes. 
This case seems remote enough, and a proper // solution rather complicated, that we're going to take that risk... std::this_thread::yield(); continue; } TP_DCHECK_EQ(rv, sizeof(token)); break; } } } // namespace Reactor::Reactor() { Error error; std::tie(error, headerSegment_, dataSegment_, rb_) = createShmRingBuffer(kSize); TP_THROW_ASSERT_IF(error) << "Couldn't allocate ringbuffer for reactor: " << error.what(); startThread("TP_SHM_reactor"); } void Reactor::close() { if (!closed_.exchange(true)) { stopBusyPolling(); } } void Reactor::join() { close(); if (!joined_.exchange(true)) { joinThread(); } } Reactor::~Reactor() { join(); } Reactor::TToken Reactor::add(TFunction fn) { std::unique_lock lock(mutex_); TToken token; // Either reuse a token or generate a new one. auto it = reusableTokens_.begin(); if (it != reusableTokens_.end()) { token = *it; reusableTokens_.erase(it); } else { // If there are no reusable tokens, the next token is always equal // to the number of tokens in use + 1. token = functions_.size(); } // Ensure there is enough space in the functions vector. if (functions_.size() <= token) { functions_.resize(token + 1); } functions_[token] = std::move(fn); functionCount_++; return token; } void Reactor::remove(TToken token) { std::unique_lock lock(mutex_); functions_[token] = nullptr; reusableTokens_.insert(token); functionCount_--; } std::tuple Reactor::fds() const { return std::make_tuple(headerSegment_.getFd(), dataSegment_.getFd()); } bool Reactor::pollOnce() { Consumer reactorConsumer(rb_); uint32_t token; auto ret = reactorConsumer.read(&token, sizeof(token)); if (ret == -ENODATA) { return false; } TP_THROW_SYSTEM_IF(ret < 0, -ret); TFunction fn; // Make copy of std::function so we don't need // to hold the lock while executing it. 
{ std::unique_lock lock(mutex_); TP_DCHECK_LT(token, functions_.size()); fn = functions_[token]; } if (fn) { fn(); } return true; } bool Reactor::readyToClose() { return functionCount_ == 0; } Reactor::Trigger::Trigger(Fd headerFd, Fd dataFd) { // The header and data segment objects take over ownership // of file descriptors. Release them to avoid double close. Error error; std::tie(error, headerSegment_, dataSegment_, rb_) = loadShmRingBuffer( std::move(headerFd), std::move(dataFd)); TP_THROW_ASSERT_IF(error) << "Couldn't access ringbuffer of remote reactor: " << error.what(); } void Reactor::Trigger::run(TToken token) { Producer producer(rb_); writeToken(producer, token); } } // namespace shm } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/shm/reactor.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace transport { namespace shm { // Reactor loop. // // Companion class to the event loop in `loop.h` that executes // functions on triggers. The triggers are posted to a shared memory // ring buffer, so this can be done by other processes on the same // machine. It uses extra data in the ring buffer header to store a // mutex and condition variable to avoid a busy loop. // class Reactor final : public BusyPollingLoop { // This allows for buffering 1M triggers (at 4 bytes a piece). 
static constexpr auto kSize = 4 * 1024 * 1024; static constexpr int kNumRingbufferRoles = 2; public: using TFunction = std::function; using TToken = uint32_t; using Consumer = RingBufferRole; using Producer = RingBufferRole; Reactor(); // Add function to the reactor. // Returns token that can be used to trigger it. TToken add(TFunction fn); // Removes function associated with token from reactor. void remove(TToken token); // Returns the file descriptors for the underlying ring buffer. std::tuple fds() const; void close(); void join(); ~Reactor(); protected: bool pollOnce() override; bool readyToClose() override; private: ShmSegment headerSegment_; ShmSegment dataSegment_; RingBuffer rb_; std::mutex mutex_; std::atomic closed_{false}; std::atomic joined_{false}; // Tokens are placed in this set if they can be reused. std::set reusableTokens_; // Map reactor tokens to functions. // // The tokens are reused so we don't worry about unbounded growth // and comfortably use a std::vector here. // std::vector functions_; // Count how many functions are registered. std::atomic functionCount_{0}; public: class Trigger { public: Trigger(Fd header, Fd data); void run(TToken token); private: ShmSegment headerSegment_; ShmSegment dataSegment_; RingBuffer rb_; }; }; } // namespace shm } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/shm/sockaddr.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/

#include
#include
#include
#include
#include
#include

namespace tensorpipe {
namespace transport {
namespace shm {

// Build an abstract-namespace UNIX domain socket address from `name`.
// An empty name yields an address suitable for kernel autobind.
Sockaddr Sockaddr::createAbstractUnixAddr(const std::string& name) {
  struct sockaddr_un sun;
  sun.sun_family = AF_UNIX;
  std::memset(&sun.sun_path, 0, sizeof(sun.sun_path));

  // There are three "modes" for binding UNIX domain sockets:
  // - if len(path) == 0: it autobinds to an abstract address
  // - if len(path) > 0 and path[0] == 0: it uses an explicit abstract address
  // - if len(path) > 0 and path[0] != 0: it uses a concrete filesystem path
  if (name == "") {
    // Pass only the family so the kernel autobinds (len(path) == 0).
    return Sockaddr(
        reinterpret_cast(&sun), sizeof(sun.sun_family));
  } else {
    // The leading NUL byte (left in place by the memset above) marks the
    // address as abstract; the name is truncated to fit sun_path if needed.
    constexpr size_t offset = 1;
    const size_t len = std::min(sizeof(sun.sun_path) - offset, name.size());
    std::strncpy(&sun.sun_path[offset], name.data(), len);

    // Note: instead of using sizeof(sun) we compute the addrlen from
    // the string length of the abstract socket name. If we use
    // sizeof(sun), lsof shows all the trailing NUL characters.
    return Sockaddr(
        reinterpret_cast(&sun), sizeof(sun.sun_family) + offset + len);
  }
};

// Copy a generic sockaddr into internal storage, zero-padding the rest so
// comparisons and later reads of untouched fields are well-defined.
Sockaddr::Sockaddr(const struct sockaddr* addr, socklen_t addrlen) {
  TP_ARG_CHECK(addr != nullptr);
  TP_ARG_CHECK_LE(addrlen, sizeof(addr_));
  std::memset(&addr_, 0, sizeof(addr_));
  std::memcpy(&addr_, addr, addrlen);
  addrlen_ = addrlen;
}

// Render the abstract socket name (without the leading NUL byte).
// An autobind-only address (family only, no path) renders as "".
std::string Sockaddr::str() const {
  TP_DCHECK_GE(addrlen_, sizeof(sockaddr_un::sun_family));
  if (addrlen_ == sizeof(sockaddr_un::sun_family)) {
    return "";
  } else {
    const struct sockaddr_un* sun{
        reinterpret_cast(&addr_)};
    TP_DCHECK_EQ(sun->sun_path[0], '\0');
    constexpr size_t offset = 1;
    // The name length is whatever addrlen_ covers beyond family + NUL marker;
    // this mirrors the addrlen computed in createAbstractUnixAddr.
    const size_t len = addrlen_ - sizeof(sun->sun_family) - offset;
    return std::string(&sun->sun_path[offset], len);
  }
}

} // namespace shm
} // namespace transport
} // namespace tensorpipe


================================================
FILE: tensorpipe/transport/shm/sockaddr.h
================================================
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace transport { namespace shm { class Sockaddr final : public tensorpipe::Sockaddr { public: static Sockaddr createAbstractUnixAddr(const std::string& name); inline const struct sockaddr* addr() const override { return reinterpret_cast(&addr_); } inline socklen_t addrlen() const override { return addrlen_; } std::string str() const; explicit Sockaddr(const struct sockaddr* addr, socklen_t addrlen); private: struct sockaddr_storage addr_; socklen_t addrlen_; }; } // namespace shm } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/uv/connection_impl.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace transport { namespace uv { ConnectionImpl::ConnectionImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::unique_ptr handle) : ConnectionImplBoilerplate( token, std::move(context), std::move(id)), handle_(std::move(handle)) {} ConnectionImpl::ConnectionImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::string addr) : ConnectionImplBoilerplate( token, std::move(context), std::move(id)), handle_(context_->createHandle()), sockaddr_(Sockaddr::createInetSockAddr(addr)) {} void ConnectionImpl::initImplFromLoop() { context_->enroll(*this); TP_VLOG(9) << "Connection " << id_ << " is initializing in loop"; if (sockaddr_.has_value()) { TP_THROW_ASSERT_IF(context_->closed()); handle_->initFromLoop(); handle_->connectFromLoop(sockaddr_.value(), [this](int status) { if (status < 0) { setError(TP_CREATE_ERROR(UVError, status)); } }); } handle_->armCloseCallbackFromLoop( [this]() { this->closeCallbackFromLoop(); }); handle_->armAllocCallbackFromLoop( [this](uv_buf_t* buf) { this->allocCallbackFromLoop(buf); }); handle_->armReadCallbackFromLoop([this](ssize_t nread, const uv_buf_t* buf) { this->readCallbackFromLoop(nread, buf); }); } void ConnectionImpl::readImplFromLoop(read_callback_fn fn) { readOperations_.emplace_back(std::move(fn)); // Start reading if this is the first read operation. if (readOperations_.size() == 1) { handle_->readStartFromLoop(); } } void ConnectionImpl::readImplFromLoop( void* ptr, size_t length, read_callback_fn fn) { readOperations_.emplace_back(ptr, length, std::move(fn)); // Start reading if this is the first read operation. 
if (readOperations_.size() == 1) { handle_->readStartFromLoop(); } } void ConnectionImpl::writeImplFromLoop( const void* ptr, size_t length, write_callback_fn fn) { writeOperations_.emplace_back(ptr, length, std::move(fn)); auto& writeOperation = writeOperations_.back(); StreamWriteOperation::Buf* bufsPtr; unsigned int bufsLen; std::tie(bufsPtr, bufsLen) = writeOperation.getBufs(); const std::array uvBufs = { uv_buf_t{bufsPtr[0].base, bufsPtr[0].len}, uv_buf_t{bufsPtr[1].base, bufsPtr[1].len}}; handle_->writeFromLoop(uvBufs.data(), bufsLen, [this](int status) { this->writeCallbackFromLoop(status); }); } void ConnectionImpl::allocCallbackFromLoop(uv_buf_t* buf) { TP_DCHECK(context_->inLoop()); TP_THROW_ASSERT_IF(readOperations_.empty()); TP_VLOG(9) << "Connection " << id_ << " has incoming data for which it needs to provide a buffer"; readOperations_.front().allocFromLoop(&buf->base, &buf->len); } void ConnectionImpl::readCallbackFromLoop( ssize_t nread, const uv_buf_t* /* unused */) { TP_DCHECK(context_->inLoop()); TP_VLOG(9) << "Connection " << id_ << " has completed reading some data (" << (nread >= 0 ? std::to_string(nread) + " bytes" : formatUvError(nread)) << ")"; if (nread < 0) { setError(TP_CREATE_ERROR(UVError, nread)); return; } TP_THROW_ASSERT_IF(readOperations_.empty()); auto& readOperation = readOperations_.front(); readOperation.readFromLoop(nread); if (readOperation.completeFromLoop()) { readOperation.callbackFromLoop(Error::kSuccess); // Remove the completed operation. // If this was the final pending operation, this instance should // no longer receive allocation and read callbacks. 
readOperations_.pop_front(); if (readOperations_.empty()) { handle_->readStopFromLoop(); } } } void ConnectionImpl::writeCallbackFromLoop(int status) { TP_DCHECK(context_->inLoop()); TP_VLOG(9) << "Connection " << id_ << " has completed a write request (" << formatUvError(status) << ")"; if (status < 0) { setError(TP_CREATE_ERROR(UVError, status)); // Do NOT return, because the error handler method will only fire the // callbacks of the read operations, because we can only fire the callbacks // of the write operations after their corresponding UV requests complete // (or else the user may deallocate the buffers while the loop is still // processing them), therefore we must fire the write operation callbacks in // this method, both in case of success and of error. } TP_THROW_ASSERT_IF(writeOperations_.empty()); auto& writeOperation = writeOperations_.front(); writeOperation.callbackFromLoop(error_); writeOperations_.pop_front(); } void ConnectionImpl::closeCallbackFromLoop() { TP_DCHECK(context_->inLoop()); TP_VLOG(9) << "Connection " << id_ << " has finished closing its handle"; TP_DCHECK(writeOperations_.empty()); context_->unenroll(*this); } void ConnectionImpl::handleErrorImpl() { for (auto& readOperation : readOperations_) { readOperation.callbackFromLoop(error_); } readOperations_.clear(); // Do NOT fire the callbacks of the write operations, because we must wait for // their corresponding UV write requests to complete (or else the user may // deallocate the buffers while the loop is still processing them). handle_->closeFromLoop(); // Do NOT unenroll here, as we must keep the UV handle alive until the close // callback fires. } } // namespace uv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/uv/connection_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace transport { namespace uv { class ContextImpl; class ListenerImpl; class ConnectionImpl final : public ConnectionImplBoilerplate< ContextImpl, ListenerImpl, ConnectionImpl> { public: // Create a connection that is already connected (e.g. from a listener). ConnectionImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::unique_ptr handle); // Create a connection that connects to the specified address. ConnectionImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::string addr); protected: // Implement the entry points called by ConnectionImplBoilerplate. void initImplFromLoop() override; void readImplFromLoop(read_callback_fn fn) override; void readImplFromLoop(void* ptr, size_t length, read_callback_fn fn) override; void writeImplFromLoop(const void* ptr, size_t length, write_callback_fn fn) override; void handleErrorImpl() override; private: // Called when libuv is about to read data from connection. void allocCallbackFromLoop(uv_buf_t* buf); // Called when libuv has read data from connection. void readCallbackFromLoop(ssize_t nread, const uv_buf_t* buf); // Called when libuv has written data to connection. void writeCallbackFromLoop(int status); // Called when libuv has closed the handle. void closeCallbackFromLoop(); const std::unique_ptr handle_; optional sockaddr_; std::deque readOperations_; std::deque writeOperations_; }; } // namespace uv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/uv/context_impl.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include namespace tensorpipe { namespace transport { namespace uv { namespace { // Prepend descriptor with transport name so it's easy to // disambiguate descriptors when debugging. const std::string kDomainDescriptorPrefix{"uv:"}; std::string generateDomainDescriptor() { return kDomainDescriptorPrefix + "*"; } } // namespace std::shared_ptr ContextImpl::create() { return std::make_shared(); } ContextImpl::ContextImpl() : ContextImplBoilerplate( generateDomainDescriptor()) {} void ContextImpl::handleErrorImpl() { loop_.close(); } void ContextImpl::joinImpl() { loop_.join(); } bool ContextImpl::inLoop() const { return loop_.inLoop(); }; void ContextImpl::deferToLoop(std::function fn) { loop_.deferToLoop(std::move(fn)); }; std::unique_ptr ContextImpl::createHandle() { return std::make_unique(loop_.ptr(), loop_); }; } // namespace uv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/uv/context_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include #include namespace tensorpipe { namespace transport { namespace uv { class ConnectionImpl; class ListenerImpl; class ContextImpl final : public ContextImplBoilerplate { public: static std::shared_ptr create(); ContextImpl(); // Implement the DeferredExecutor interface. bool inLoop() const override; void deferToLoop(std::function fn) override; std::unique_ptr createHandle(); protected: // Implement the entry points called by ContextImplBoilerplate. 
void handleErrorImpl() override; void joinImpl() override; private: Loop loop_; }; } // namespace uv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/uv/error.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include namespace tensorpipe { namespace transport { namespace uv { std::string UVError::what() const { return formatUvError(error_); } std::string NoAddrFoundError::what() const { return "no address found"; } } // namespace uv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/uv/error.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include namespace tensorpipe { namespace transport { namespace uv { class UVError final : public BaseError { public: explicit UVError(int error) : error_(error) {} std::string what() const override; private: int error_; }; class NoAddrFoundError final : public BaseError { public: NoAddrFoundError() {} std::string what() const override; }; } // namespace uv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/uv/factory.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/

#include
#include
#include
#include
#include

namespace tensorpipe {
namespace transport {
namespace uv {

// Public entry point of the UV transport: wraps ContextImpl in the shared
// ContextBoilerplate, which layers the close/join lifecycle logic on top.
std::shared_ptr create() {
  return std::make_shared<
      ContextBoilerplate>();
}

} // namespace uv
} // namespace transport
} // namespace tensorpipe


================================================
FILE: tensorpipe/transport/uv/factory.h
================================================
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#include
#include

namespace tensorpipe {
namespace transport {
namespace uv {

// Create a transport context backed by libuv TCP sockets.
std::shared_ptr create();

} // namespace uv
} // namespace transport
} // namespace tensorpipe


================================================
FILE: tensorpipe/transport/uv/listener_impl.cc
================================================
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
*/ #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace transport { namespace uv { ListenerImpl::ListenerImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::string addr) : ListenerImplBoilerplate( token, std::move(context), std::move(id)), handle_(context_->createHandle()), sockaddr_(Sockaddr::createInetSockAddr(addr)) {} void ListenerImpl::initImplFromLoop() { context_->enroll(*this); TP_VLOG(9) << "Listener " << id_ << " is initializing in loop"; TP_THROW_ASSERT_IF(context_->closed()); handle_->initFromLoop(); auto rv = handle_->bindFromLoop(sockaddr_); TP_THROW_UV_IF(rv < 0, rv); handle_->armCloseCallbackFromLoop( [this]() { this->closeCallbackFromLoop(); }); handle_->listenFromLoop( [this](int status) { this->connectionCallbackFromLoop(status); }); sockaddr_ = handle_->sockNameFromLoop(); } void ListenerImpl::acceptImplFromLoop(accept_callback_fn fn) { callback_.arm(std::move(fn)); } std::string ListenerImpl::addrImplFromLoop() const { return sockaddr_.str(); } void ListenerImpl::connectionCallbackFromLoop(int status) { TP_DCHECK(context_->inLoop()); TP_VLOG(9) << "Listener " << id_ << " has an incoming connection ready to be accepted (" << formatUvError(status) << ")"; if (status != 0) { setError(TP_CREATE_ERROR(UVError, status)); return; } auto connection = context_->createHandle(); TP_THROW_ASSERT_IF(context_->closed()); connection->initFromLoop(); handle_->acceptFromLoop(*connection); callback_.trigger( Error::kSuccess, createAndInitConnection(std::move(connection))); } void ListenerImpl::closeCallbackFromLoop() { TP_VLOG(9) << "Listener " << id_ << " has finished closing its handle"; context_->unenroll(*this); } void ListenerImpl::handleErrorImpl() { callback_.triggerAll([&]() { return std::make_tuple(std::cref(error_), std::shared_ptr()); }); handle_->closeFromLoop(); // Do NOT unenroll here, as we must keep the UV handle alive until the close // callback fires. 
} } // namespace uv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/uv/listener_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include namespace tensorpipe { namespace transport { namespace uv { class ConnectionImpl; class ContextImpl; class ListenerImpl final : public ListenerImplBoilerplate< ContextImpl, ListenerImpl, ConnectionImpl> { public: // Create a listener that listens on the specified address. ListenerImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::string addr); protected: // Implement the entry points called by ListenerImplBoilerplate. void initImplFromLoop() override; void acceptImplFromLoop(accept_callback_fn fn) override; std::string addrImplFromLoop() const override; void handleErrorImpl() override; private: // Called by libuv if the listening socket can accept a new connection. Status // is 0 in case of success, < 0 otherwise. See `uv_connection_cb` for more // information. void connectionCallbackFromLoop(int status); // Called when libuv has closed the handle. void closeCallbackFromLoop(); const std::unique_ptr handle_; Sockaddr sockaddr_; // Once an accept callback fires, it becomes disarmed and must be rearmed. // Any firings that occur while the callback is disarmed are stashed and // triggered as soon as it's rearmed. With libuv we don't have the ability // to disable the lower-level callback when the user callback is disarmed. // So we'll keep getting notified of new connections even if we don't know // what to do with them and don't want them. Thus we must store them // somewhere. This is what RearmableCallback is for. 
RearmableCallback> callback_; }; } // namespace uv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/uv/loop.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include namespace tensorpipe { namespace transport { namespace uv { Loop::Loop() { int rv; rv = uv_loop_init(&loop_); TP_THROW_UV_IF(rv < 0, rv); rv = uv_async_init(&loop_, &async_, uvAsyncCb); TP_THROW_UV_IF(rv < 0, rv); async_.data = this; startThread("TP_UV_loop"); } void Loop::close() { if (!closed_.exchange(true)) { // It's fine to capture this because the loop won't be destroyed until join // has completed, and join won't complete until this operation is performed. deferToLoop( [this]() { uv_unref(reinterpret_cast(&async_)); }); } } void Loop::join() { close(); if (!joined_.exchange(true)) { joinThread(); } } Loop::~Loop() noexcept { join(); } void Loop::wakeupEventLoopToDeferFunction() { auto rv = uv_async_send(&async_); TP_THROW_UV_IF(rv < 0, rv); } void Loop::eventLoop() { int rv; rv = uv_run(&loop_, UV_RUN_DEFAULT); TP_THROW_ASSERT_IF(rv > 0) << ": uv_run returned with active handles or requests"; } void Loop::cleanUpLoop() { int rv; uv_ref(reinterpret_cast(&async_)); uv_close(reinterpret_cast(&async_), nullptr); rv = uv_run(&loop_, UV_RUN_NOWAIT); TP_THROW_ASSERT_IF(rv > 0) << ": uv_run returned with active handles or requests"; // Release resources associated with loop. 
rv = uv_loop_close(&loop_); TP_THROW_UV_IF(rv < 0, rv); } void Loop::uvAsyncCb(uv_async_t* handle) { auto& loop = *reinterpret_cast(handle->data); loop.runDeferredFunctionsFromEventLoop(); } } // namespace uv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/uv/loop.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace transport { namespace uv { class Loop final : public EventLoopDeferredExecutor { public: Loop(); uv_loop_t* ptr() { return &loop_; } bool closed() { return closed_; } void close(); void join(); ~Loop() noexcept; protected: // Event loop thread entry function. void eventLoop() override; // Clean up after event loop transitioned to on-demand. void cleanUpLoop() override; // Wake up the event loop. void wakeupEventLoopToDeferFunction() override; private: uv_loop_t loop_; uv_async_t async_; std::atomic closed_{false}; std::atomic joined_{false}; // This function is called by the event loop thread whenever // we have to run a number of deferred functions. static void uvAsyncCb(uv_async_t* handle); }; } // namespace uv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/uv/sockaddr.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #include #include #include #include #include namespace tensorpipe { namespace transport { namespace uv { Sockaddr Sockaddr::createInetSockAddr(const std::string& str) { int port = 0; std::string addrStr; std::string portStr; // If the input string is an IPv6 address with port, the address // itself must be wrapped with brackets. if (addrStr.empty()) { auto start = str.find("["); auto stop = str.find("]"); if (start < stop && start != std::string::npos && stop != std::string::npos) { addrStr = str.substr(start + 1, stop - (start + 1)); if (stop + 1 < str.size() && str[stop + 1] == ':') { portStr = str.substr(stop + 2); } } } // If the input string is an IPv4 address with port, we expect // at least a single period and a single colon in the string. if (addrStr.empty()) { auto period = str.find("."); auto colon = str.find(":"); if (period != std::string::npos && colon != std::string::npos) { addrStr = str.substr(0, colon); portStr = str.substr(colon + 1); } } // Fallback to using entire input string as address without port. if (addrStr.empty()) { addrStr = str; } // Parse port number if specified. if (!portStr.empty()) { port = std::stoi(portStr); if (port < 0 || port > std::numeric_limits::max()) { TP_THROW_EINVAL() << str; } } // Try to convert an IPv4 address. { struct sockaddr_in addr; auto rv = uv_ip4_addr(addrStr.c_str(), port, &addr); if (rv == 0) { return Sockaddr(reinterpret_cast(&addr), sizeof(addr)); } } // Try to convert an IPv6 address. { struct sockaddr_in6 addr; auto rv = uv_ip6_addr(addrStr.c_str(), port, &addr); if (rv == 0) { return Sockaddr(reinterpret_cast(&addr), sizeof(addr)); } } // Invalid address. TP_THROW_EINVAL() << str; // Return bogus to silence "return from non-void function" warning. // Note: we don't reach this point per the throw above. 
return Sockaddr(nullptr, 0); } std::string Sockaddr::str() const { std::ostringstream oss; if (addr_.ss_family == AF_INET) { std::array buf; auto in = reinterpret_cast(&addr_); auto rv = uv_ip4_name(in, buf.data(), buf.size()); TP_THROW_UV_IF(rv < 0, rv); oss << buf.data() << ":" << htons(in->sin_port); } else if (addr_.ss_family == AF_INET6) { std::array buf; auto in6 = reinterpret_cast(&addr_); auto rv = uv_ip6_name(in6, buf.data(), buf.size()); TP_THROW_UV_IF(rv < 0, rv); oss << "[" << buf.data(); if (in6->sin6_scope_id > 0) { std::array scopeBuf; size_t size = sizeof(scopeBuf); rv = uv_if_indextoiid(in6->sin6_scope_id, scopeBuf.data(), &size); TP_THROW_UV_IF(rv < 0, rv); oss << "%" << scopeBuf.data(); } oss << "]:" << htons(in6->sin6_port); } else { TP_THROW_EINVAL() << "invalid address family: " << addr_.ss_family; } return oss.str(); } } // namespace uv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/uv/sockaddr.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include namespace tensorpipe { namespace transport { namespace uv { class Sockaddr final : public tensorpipe::Sockaddr { public: static Sockaddr createInetSockAddr(const std::string& str); Sockaddr(const struct sockaddr* addr, socklen_t addrlen) { TP_ARG_CHECK(addr != nullptr); TP_ARG_CHECK_LE(addrlen, sizeof(addr_)); // Ensure the sockaddr_storage is zeroed, because we don't always // write to all fields in the `sockaddr_[in|in6]` structures. 
std::memset(&addr_, 0, sizeof(addr_)); std::memcpy(&addr_, addr, addrlen); addrlen_ = addrlen; } inline const struct sockaddr* addr() const override { return reinterpret_cast(&addr_); } inline struct sockaddr* addr() { return reinterpret_cast(&addr_); } inline socklen_t addrlen() const override { return addrlen_; } std::string str() const; private: struct sockaddr_storage addr_; socklen_t addrlen_; }; } // namespace uv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/uv/utility.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include namespace tensorpipe { namespace transport { namespace uv { std::tuple lookupAddrForIface(std::string iface) { int rv; InterfaceAddresses addresses; int count; std::tie(rv, addresses, count) = getInterfaceAddresses(); if (rv < 0) { return std::make_tuple(TP_CREATE_ERROR(UVError, rv), std::string()); } for (auto i = 0; i < count; i++) { const uv_interface_address_t& interface = addresses[i]; if (iface != interface.name) { continue; } const auto& address = interface.address; const struct sockaddr* sockaddr = reinterpret_cast(&address); switch (sockaddr->sa_family) { case AF_INET: return std::make_tuple( Error::kSuccess, Sockaddr(sockaddr, sizeof(address.address4)).str()); case AF_INET6: return std::make_tuple( Error::kSuccess, Sockaddr(sockaddr, sizeof(address.address6)).str()); } } return std::make_tuple(TP_CREATE_ERROR(NoAddrFoundError), std::string()); } std::tuple lookupAddrForHostname() { // For some operations we need a libuv event loop. We create a fresh one, just // for this purpose, which we'll drive inline from this thread. This way we // avoid misusing the main event loop in the context impl. 
struct InlineLoop { uv_loop_t loop; InlineLoop() { auto rv = uv_loop_init(&loop); TP_THROW_UV_IF(rv < 0, rv); } ~InlineLoop() { auto rv = uv_loop_close(&loop); TP_THROW_UV_IF(rv < 0, rv); } }; InlineLoop loop; struct InlineDeferredExecutor : public DeferredExecutor { std::thread::id threadId = std::this_thread::get_id(); void deferToLoop(TTask fn) override { TP_THROW_ASSERT() << "How could this be called?! This class is supposed to be " << "instantiated as const, and this method isn't const-qualified"; } bool inLoop() const override { return std::this_thread::get_id() == threadId; } }; const InlineDeferredExecutor executor; int rv; std::string hostname; std::tie(rv, hostname) = getHostname(); if (rv < 0) { return std::make_tuple(TP_CREATE_ERROR(UVError, rv), std::string()); } Addrinfo info; std::tie(rv, info) = getAddrinfoFromLoop(&loop.loop, std::move(hostname)); if (rv < 0) { return std::make_tuple(TP_CREATE_ERROR(UVError, rv), std::string()); } Error error; for (struct addrinfo* rp = info.get(); rp != nullptr; rp = rp->ai_next) { TP_DCHECK(rp->ai_family == AF_INET || rp->ai_family == AF_INET6); TP_DCHECK_EQ(rp->ai_socktype, SOCK_STREAM); TP_DCHECK_EQ(rp->ai_protocol, IPPROTO_TCP); Sockaddr addr = Sockaddr(rp->ai_addr, rp->ai_addrlen); TCPHandle handle(&loop.loop, executor); handle.initFromLoop(); rv = handle.bindFromLoop(addr); handle.closeFromLoop(); // The handle will only be closed at the next loop iteration, so run it. { auto rv = uv_run(&loop.loop, UV_RUN_DEFAULT); TP_THROW_ASSERT_IF(rv > 0); } if (rv < 0) { // Record the first binding error we encounter and return that in the end // if no working address is found, in order to help with debugging. 
if (!error) { error = TP_CREATE_ERROR(UVError, rv); } continue; } return std::make_tuple(Error::kSuccess, addr.str()); } if (error) { return std::make_tuple(std::move(error), std::string()); } else { return std::make_tuple(TP_CREATE_ERROR(NoAddrFoundError), std::string()); } } std::tuple lookupAddrLikeNccl( optional familyFilter) { int rv; InterfaceAddresses addresses; int count; std::tie(rv, addresses, count) = getInterfaceAddresses(); if (rv < 0) { return std::make_tuple(TP_CREATE_ERROR(UVError, rv), std::string()); } // Libuv already only returns the interfaces that are up and running, whose // address is not null, and whose family is IPv4 or IPv6. // NCCL prioritizes the interfaces whose name starts with "ib" (for IP over // InfiniBand?), and deprioritizes those that start with "docker" or "lo". optional withIbPrefix; optional withoutPrefix; optional withDockerPrefix; optional withLoPrefix; for (auto i = 0; i < count; i++) { const uv_interface_address_t& interface = addresses[i]; const struct sockaddr* sockaddr = reinterpret_cast(&interface.address); // NCCL also seems to ignore any IPv6 loopback address. 
if (sockaddr->sa_family == AF_INET6 && interface.is_internal) { continue; } if (familyFilter.has_value() && sockaddr->sa_family != familyFilter.value()) { continue; } std::string addr; switch (sockaddr->sa_family) { case AF_INET: addr = Sockaddr(sockaddr, sizeof(struct sockaddr_in)).str(); break; case AF_INET6: addr = Sockaddr(sockaddr, sizeof(struct sockaddr_in6)).str(); break; } std::string name = interface.name; if (name.find("ib") == 0) { if (!withIbPrefix.has_value()) { withIbPrefix = std::move(addr); } } else if (name.find("docker") == 0) { if (!withDockerPrefix.has_value()) { withDockerPrefix = std::move(addr); } } else if (name.find("lo") == 0) { if (!withLoPrefix.has_value()) { withLoPrefix = std::move(addr); } } else { if (!withoutPrefix.has_value()) { withoutPrefix = std::move(addr); } } } if (withIbPrefix.has_value()) { return std::make_tuple(Error::kSuccess, std::move(withIbPrefix).value()); } else if (withoutPrefix.has_value()) { return std::make_tuple(Error::kSuccess, std::move(withoutPrefix).value()); } else if (withDockerPrefix.has_value()) { return std::make_tuple( Error::kSuccess, std::move(withDockerPrefix).value()); } else if (withLoPrefix.has_value()) { return std::make_tuple(Error::kSuccess, std::move(withLoPrefix).value()); } return std::make_tuple(TP_CREATE_ERROR(NoAddrFoundError), std::string()); } } // namespace uv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/uv/utility.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include #include #include #include namespace tensorpipe { namespace transport { namespace uv { std::tuple lookupAddrForIface(std::string iface); std::tuple lookupAddrForHostname(); // Try to replicate the same logic used by NCCL to find a node's own address. // Roughly, it returns the "first" usable address it can find, and prioritizes // the interfaces with an `ib` prefix and de-prioritizes those with a `docker` // or `lo` prefix. It can optionally only return only IPv4 or IPv4 addresses. std::tuple lookupAddrLikeNccl( optional familyFilter = nullopt); } // namespace uv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/uv/uv.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #define TP_THROW_UV(err) TP_THROW(std::runtime_error) #define TP_THROW_UV_IF(cond, err) \ if (unlikely(cond)) \ TP_THROW_UV(err) << TP_STRINGIFY(cond) << ": " << uv_strerror(err) namespace tensorpipe { namespace transport { namespace uv { template class BaseHandle { static void uvCloseCb(uv_handle_t* handle) { T& ref = *reinterpret_cast(handle->data); if (ref.closeCallback_ != nullptr) { ref.closeCallback_(); } } public: using TCloseCallback = std::function; explicit BaseHandle(uv_loop_t* loop, const DeferredExecutor& executor) : loop_(loop), executor_(executor) { handle_.data = this; } // Libuv's handles cannot be copied or moved. 
BaseHandle(const BaseHandle&) = delete; BaseHandle(BaseHandle&&) = delete; BaseHandle& operator=(const BaseHandle&) = delete; BaseHandle& operator=(BaseHandle&&) = delete; virtual ~BaseHandle() = default; U* ptr() { return &handle_; } void armCloseCallbackFromLoop(TCloseCallback fn) { TP_DCHECK(this->executor_.inLoop()); TP_THROW_ASSERT_IF(closeCallback_ != nullptr); closeCallback_ = std::move(fn); } void closeFromLoop() { TP_DCHECK(!uv_is_closing(reinterpret_cast(ptr()))); uv_close(reinterpret_cast(ptr()), uvCloseCb); } protected: // Underlying libuv handle. U handle_; // Underlying libuv event loop. uv_loop_t* const loop_; // This DeferredExecutor is only used to check that all calls are performed // from the right thread. const DeferredExecutor& executor_; TCloseCallback closeCallback_; }; template class BaseRequest { public: BaseRequest() { request_.data = this; } // Libuv's requests cannot be copied or moved. BaseRequest(const BaseRequest&) = delete; BaseRequest(BaseRequest&&) = delete; BaseRequest& operator=(const BaseRequest&) = delete; BaseRequest& operator=(BaseRequest&&) = delete; U* ptr() { return &request_; } private: // Underlying libuv request. 
U request_; }; class WriteRequest final : public BaseRequest { static void uvWriteCb(uv_write_t* req, int status) { std::unique_ptr request( reinterpret_cast(req->data)); request->writeCallback_(status); } public: using TWriteCallback = std::function; explicit WriteRequest(TWriteCallback fn) : writeCallback_(std::move(fn)) {} static int perform( uv_stream_t* handle, const uv_buf_t bufs[], unsigned int nbufs, TWriteCallback fn) { auto request = std::make_unique(std::move(fn)); auto rv = uv_write(request->ptr(), handle, bufs, nbufs, uvWriteCb); request.release(); return rv; } private: TWriteCallback writeCallback_; }; template class StreamHandle : public BaseHandle { static void uvConnectionCb(uv_stream_t* server, int status) { T& ref = *reinterpret_cast(server->data); TP_DCHECK(ref.connectionCallback_ != nullptr); ref.connectionCallback_(status); } static void uvAllocCb( uv_handle_t* handle, size_t /* unused */, uv_buf_t* buf) { T& ref = *reinterpret_cast(handle->data); TP_DCHECK(ref.allocCallback_ != nullptr); ref.allocCallback_(buf); } static void uvReadCb( uv_stream_t* server, ssize_t nread, const uv_buf_t* buf) { T& ref = *reinterpret_cast(server->data); TP_DCHECK(ref.readCallback_ != nullptr); ref.readCallback_(nread, buf); } static constexpr int kBacklog = 128; public: using TConnectionCallback = std::function; using TAcceptCallback = std::function; using TAllocCallback = std::function; using TReadCallback = std::function; using BaseHandle::BaseHandle; // TODO Split this into a armConnectionCallback, a listenStart and a // listenStop method, to propagate the backpressure to the clients. 
void listenFromLoop(TConnectionCallback connectionCallback) { TP_DCHECK(this->executor_.inLoop()); TP_THROW_ASSERT_IF(connectionCallback_ != nullptr); connectionCallback_ = std::move(connectionCallback); auto rv = uv_listen( reinterpret_cast(this->ptr()), kBacklog, uvConnectionCb); TP_THROW_UV_IF(rv < 0, rv); } template void acceptFromLoop(V& other) { TP_DCHECK(this->executor_.inLoop()); auto rv = uv_accept( reinterpret_cast(this->ptr()), reinterpret_cast(other.ptr())); TP_THROW_UV_IF(rv < 0, rv); } void armAllocCallbackFromLoop(TAllocCallback fn) { TP_DCHECK(this->executor_.inLoop()); TP_THROW_ASSERT_IF(allocCallback_ != nullptr); allocCallback_ = std::move(fn); } void armReadCallbackFromLoop(TReadCallback fn) { TP_DCHECK(this->executor_.inLoop()); TP_THROW_ASSERT_IF(readCallback_ != nullptr); readCallback_ = std::move(fn); } void readStartFromLoop() { TP_DCHECK(this->executor_.inLoop()); TP_THROW_ASSERT_IF(allocCallback_ == nullptr); TP_THROW_ASSERT_IF(readCallback_ == nullptr); auto rv = uv_read_start( reinterpret_cast(this->ptr()), uvAllocCb, uvReadCb); TP_THROW_UV_IF(rv < 0, rv); } void readStopFromLoop() { TP_DCHECK(this->executor_.inLoop()); auto rv = uv_read_stop(reinterpret_cast(this->ptr())); TP_THROW_UV_IF(rv < 0, rv); } void writeFromLoop( const uv_buf_t bufs[], unsigned int nbufs, WriteRequest::TWriteCallback fn) { TP_DCHECK(this->executor_.inLoop()); auto rv = WriteRequest::perform( reinterpret_cast(this->ptr()), bufs, nbufs, std::move(fn)); TP_THROW_UV_IF(rv < 0, rv); } protected: TConnectionCallback connectionCallback_; TAllocCallback allocCallback_; TReadCallback readCallback_; }; class ConnectRequest final : public BaseRequest { static void uvConnectCb(uv_connect_t* req, int status) { std::unique_ptr request( reinterpret_cast(req->data)); request->connectCallback_(status); } public: using TConnectCallback = std::function; explicit ConnectRequest(TConnectCallback fn) : connectCallback_(std::move(fn)) {} static int perform( uv_tcp_t* handle, const 
struct sockaddr* addr, TConnectCallback fn) { auto request = std::make_unique(std::move(fn)); auto rv = uv_tcp_connect(request->ptr(), handle, addr, uvConnectCb); request.release(); return rv; } private: TConnectCallback connectCallback_; }; class TCPHandle : public StreamHandle { public: using StreamHandle::StreamHandle; void initFromLoop() { TP_DCHECK(this->executor_.inLoop()); int rv; rv = uv_tcp_init(loop_, this->ptr()); TP_THROW_UV_IF(rv < 0, rv); rv = uv_tcp_nodelay(this->ptr(), 1); TP_THROW_UV_IF(rv < 0, rv); } [[nodiscard]] int bindFromLoop(const Sockaddr& addr) { TP_DCHECK(this->executor_.inLoop()); auto rv = uv_tcp_bind(ptr(), addr.addr(), 0); // We don't throw in case of errors here because sometimes we bind in order // to try if an address works and want to handle errors gracefully. return rv; } Sockaddr sockNameFromLoop() { TP_DCHECK(this->executor_.inLoop()); struct sockaddr_storage ss; struct sockaddr* addr = reinterpret_cast(&ss); int addrlen = sizeof(ss); auto rv = uv_tcp_getsockname(ptr(), addr, &addrlen); TP_THROW_UV_IF(rv < 0, rv); return Sockaddr(addr, addrlen); } void connectFromLoop( const Sockaddr& addr, ConnectRequest::TConnectCallback fn) { TP_DCHECK(this->executor_.inLoop()); auto rv = ConnectRequest::perform(ptr(), addr.addr(), std::move(fn)); TP_THROW_UV_IF(rv < 0, rv); } }; struct AddrinfoDeleter { void operator()(struct addrinfo* ptr) const { uv_freeaddrinfo(ptr); } }; using Addrinfo = std::unique_ptr; inline std::tuple getAddrinfoFromLoop( uv_loop_t* loop, std::string hostname) { struct addrinfo hints; std::memset(&hints, 0, sizeof(hints)); hints.ai_family = AF_UNSPEC; hints.ai_socktype = SOCK_STREAM; hints.ai_protocol = IPPROTO_TCP; uv_getaddrinfo_t request; // Don't use a callback, and thus perform the call synchronously, because the // asynchronous version uses a thread pool, and it's not worth spawning new // threads for a functionality which is used so sparingly. 
auto rv = uv_getaddrinfo( loop, &request, /*getaddrinfo_cb=*/nullptr, hostname.c_str(), /*service=*/nullptr, &hints); if (rv != 0) { return std::make_tuple(rv, Addrinfo()); } return std::make_tuple(0, Addrinfo(request.addrinfo, AddrinfoDeleter())); } struct InterfaceAddressesDeleter { explicit InterfaceAddressesDeleter(int count) : count_(count) {} InterfaceAddressesDeleter() = default; void operator()(uv_interface_address_t* ptr) const { uv_free_interface_addresses(ptr, count_); } private: int count_{-1}; }; using InterfaceAddresses = std::unique_ptr; inline std::tuple getInterfaceAddresses() { uv_interface_address_t* info; int count; auto rv = uv_interface_addresses(&info, &count); if (rv != 0) { return std::make_tuple(rv, InterfaceAddresses(), 0); } return std::make_tuple( 0, InterfaceAddresses(info, InterfaceAddressesDeleter(count)), count); } inline std::tuple getHostname() { std::array hostname; size_t size = hostname.size(); auto rv = uv_os_gethostname(hostname.data(), &size); if (rv != 0) { return std::make_tuple(rv, std::string()); } return std::make_tuple( 0, std::string(hostname.data(), hostname.data() + size)); } inline std::string formatUvError(int status) { if (status == 0) { return "success"; } else { std::ostringstream ss; ss << uv_err_name(status) << ": " << uv_strerror(status); return ss.str(); } } } // namespace uv } // namespace transport } // namespace tensorpipe ================================================ FILE: third_party/README.md ================================================ # third_party This directory includes dependencies as [submodules][submodules]. [submodules]: https://git-scm.com/book/en/v2/Git-Tools-Submodules ## Build dependencies * **libuv** is a multi-platform support library with a focus on asynchronous I/O. ## Test dependencies * **backward-cpp** is a beautiful stack trace pretty printer for C++. * **googletest** is a C++ test framework.