Repository: pytorch/tensorpipe Branch: main Commit: b4b77d1006e7 Files: 292 Total size: 1.2 MB Directory structure: gitextract_wzzfsv6c/ ├── .circleci/ │ ├── Dockerfile.cuda10.1 │ ├── Dockerfile.cuda10.2 │ ├── Dockerfile.cuda11.0 │ ├── Dockerfile.cuda11.1 │ ├── Dockerfile.cuda9.2 │ └── config.yml ├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE.txt ├── README.md ├── cmake/ │ ├── FindPackageHandleStandardArgs.cmake │ ├── FindPackageMessage.cmake │ ├── Finduv.cmake │ ├── MiscCheck.cmake │ ├── Options.cmake │ └── Sanitize.cmake ├── docs/ │ ├── cuda_gotchas.md │ ├── development.md │ ├── linux_support.md │ ├── shm.md │ └── thread_model.md ├── setup.py ├── tensorpipe/ │ ├── .clang-format │ ├── .clang-tidy │ ├── CMakeLists.txt │ ├── benchmark/ │ │ ├── CMakeLists.txt │ │ ├── benchmark_pipe.cc │ │ ├── benchmark_transport.cc │ │ ├── channel_registry.cc │ │ ├── channel_registry.h │ │ ├── measurements.h │ │ ├── options.cc │ │ ├── options.h │ │ ├── registry.h │ │ ├── transport_registry.cc │ │ └── transport_registry.h │ ├── channel/ │ │ ├── basic/ │ │ │ ├── channel_impl.cc │ │ │ ├── channel_impl.h │ │ │ ├── context_impl.cc │ │ │ ├── context_impl.h │ │ │ ├── factory.cc │ │ │ └── factory.h │ │ ├── channel.h │ │ ├── channel_boilerplate.h │ │ ├── channel_impl_boilerplate.h │ │ ├── cma/ │ │ │ ├── channel_impl.cc │ │ │ ├── channel_impl.h │ │ │ ├── context_impl.cc │ │ │ ├── context_impl.h │ │ │ ├── factory.cc │ │ │ └── factory.h │ │ ├── context.h │ │ ├── context_boilerplate.h │ │ ├── context_impl_boilerplate.h │ │ ├── cuda_basic/ │ │ │ ├── channel_impl.cc │ │ │ ├── channel_impl.h │ │ │ ├── constants.h │ │ │ ├── context_impl.cc │ │ │ ├── context_impl.h │ │ │ ├── factory.cc │ │ │ └── factory.h │ │ ├── cuda_gdr/ │ │ │ ├── channel_impl.cc │ │ │ ├── channel_impl.h │ │ │ ├── constants.h │ │ │ ├── context_impl.cc │ │ │ ├── context_impl.h │ │ │ ├── error.h │ │ │ ├── factory.cc │ │ │ └── factory.h │ │ ├── cuda_ipc/ │ │ │ ├── 
channel_impl.cc │ │ │ ├── channel_impl.h │ │ │ ├── constants.h │ │ │ ├── context_impl.cc │ │ │ ├── context_impl.h │ │ │ ├── factory.cc │ │ │ └── factory.h │ │ ├── cuda_xth/ │ │ │ ├── channel_impl.cc │ │ │ ├── channel_impl.h │ │ │ ├── context_impl.cc │ │ │ ├── context_impl.h │ │ │ ├── factory.cc │ │ │ └── factory.h │ │ ├── error.cc │ │ ├── error.h │ │ ├── helpers.cc │ │ ├── helpers.h │ │ ├── mpt/ │ │ │ ├── channel_impl.cc │ │ │ ├── channel_impl.h │ │ │ ├── context_impl.cc │ │ │ ├── context_impl.h │ │ │ ├── factory.cc │ │ │ ├── factory.h │ │ │ └── nop_types.h │ │ └── xth/ │ │ ├── channel_impl.cc │ │ ├── channel_impl.h │ │ ├── context_impl.cc │ │ ├── context_impl.h │ │ ├── factory.cc │ │ └── factory.h │ ├── common/ │ │ ├── address.cc │ │ ├── address.h │ │ ├── allocator.cc │ │ ├── allocator.h │ │ ├── buffer.h │ │ ├── busy_polling_loop.h │ │ ├── callback.h │ │ ├── cpu_buffer.h │ │ ├── cuda.h │ │ ├── cuda_buffer.cc │ │ ├── cuda_buffer.h │ │ ├── cuda_lib.h │ │ ├── cuda_loop.cc │ │ ├── cuda_loop.h │ │ ├── deferred_executor.h │ │ ├── defs.h │ │ ├── device.h │ │ ├── dl.h │ │ ├── epoll_loop.cc │ │ ├── epoll_loop.h │ │ ├── error.cc │ │ ├── error.h │ │ ├── error_macros.h │ │ ├── fd.cc │ │ ├── fd.h │ │ ├── ibv.cc │ │ ├── ibv.h │ │ ├── ibv_lib.h │ │ ├── memory.h │ │ ├── nop.h │ │ ├── nvml_lib.h │ │ ├── optional.h │ │ ├── queue.h │ │ ├── ringbuffer.h │ │ ├── ringbuffer_read_write_ops.h │ │ ├── ringbuffer_role.h │ │ ├── shm_ringbuffer.h │ │ ├── shm_segment.cc │ │ ├── shm_segment.h │ │ ├── socket.cc │ │ ├── socket.h │ │ ├── state_machine.h │ │ ├── stream_read_write_ops.h │ │ ├── strings.h │ │ ├── system.cc │ │ └── system.h │ ├── config.h.in │ ├── config_cuda.h.in │ ├── core/ │ │ ├── context.cc │ │ ├── context.h │ │ ├── context_impl.cc │ │ ├── context_impl.h │ │ ├── error.cc │ │ ├── error.h │ │ ├── listener.cc │ │ ├── listener.h │ │ ├── listener_impl.cc │ │ ├── listener_impl.h │ │ ├── message.h │ │ ├── nop_types.h │ │ ├── pipe.cc │ │ ├── pipe.h │ │ ├── pipe_impl.cc │ │ └── 
pipe_impl.h │ ├── misc/ │ │ ├── CMakeLists.txt │ │ └── dump_state_machine.cc │ ├── python/ │ │ ├── CMakeLists.txt │ │ └── tensorpipe.cc │ ├── tensorpipe.h │ ├── tensorpipe_cuda.h │ ├── test/ │ │ ├── CMakeLists.txt │ │ ├── channel/ │ │ │ ├── basic/ │ │ │ │ └── basic_test.cc │ │ │ ├── channel_test.cc │ │ │ ├── channel_test.h │ │ │ ├── channel_test_cpu.cc │ │ │ ├── channel_test_cpu.h │ │ │ ├── channel_test_cuda.cc │ │ │ ├── channel_test_cuda.h │ │ │ ├── channel_test_cuda_multi_gpu.cc │ │ │ ├── channel_test_cuda_xdtt.cc │ │ │ ├── cma/ │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── cma_test.cc │ │ │ │ ├── docker_tests.sh │ │ │ │ ├── probe.cc │ │ │ │ └── probe_report_checker.py │ │ │ ├── cuda_basic/ │ │ │ │ └── cuda_basic_test.cc │ │ │ ├── cuda_gdr/ │ │ │ │ └── cuda_gdr_test.cc │ │ │ ├── cuda_helpers.h │ │ │ ├── cuda_ipc/ │ │ │ │ └── cuda_ipc_test.cc │ │ │ ├── cuda_xth/ │ │ │ │ └── cuda_xth_test.cc │ │ │ ├── kernel.cu │ │ │ ├── kernel.cuh │ │ │ ├── mpt/ │ │ │ │ └── mpt_test.cc │ │ │ └── xth/ │ │ │ └── xth_test.cc │ │ ├── common/ │ │ │ ├── cuda_test.cc │ │ │ ├── defs_test.cc │ │ │ ├── epoll_loop_test.cc │ │ │ ├── ringbuffer_test.cc │ │ │ ├── shm_ringbuffer_test.cc │ │ │ ├── shm_segment_test.cc │ │ │ └── system_test.cc │ │ ├── core/ │ │ │ ├── context_test.cc │ │ │ ├── listener_test.cc │ │ │ ├── pipe_cuda_test.cc │ │ │ ├── pipe_test.cc │ │ │ └── pipe_test.h │ │ ├── peer_group.h │ │ ├── python/ │ │ │ └── tensorpipe.py │ │ ├── test.cc │ │ ├── test_environment.cc │ │ ├── test_environment.h │ │ └── transport/ │ │ ├── connection_test.cc │ │ ├── context_test.cc │ │ ├── ibv/ │ │ │ ├── connection_test.cc │ │ │ ├── context_test.cc │ │ │ ├── ibv_test.cc │ │ │ ├── ibv_test.h │ │ │ └── sockaddr_test.cc │ │ ├── listener_test.cc │ │ ├── shm/ │ │ │ ├── connection_test.cc │ │ │ ├── listener_test.cc │ │ │ ├── reactor_test.cc │ │ │ ├── shm_test.cc │ │ │ ├── shm_test.h │ │ │ └── sockaddr_test.cc │ │ ├── transport_test.h │ │ └── uv/ │ │ ├── connection_test.cc │ │ ├── context_test.cc │ │ ├── 
loop_test.cc │ │ ├── sockaddr_test.cc │ │ ├── uv_test.cc │ │ └── uv_test.h │ └── transport/ │ ├── connection.h │ ├── connection_boilerplate.h │ ├── connection_impl_boilerplate.h │ ├── context.h │ ├── context_boilerplate.h │ ├── context_impl_boilerplate.h │ ├── error.cc │ ├── error.h │ ├── ibv/ │ │ ├── connection_impl.cc │ │ ├── connection_impl.h │ │ ├── constants.h │ │ ├── context_impl.cc │ │ ├── context_impl.h │ │ ├── error.cc │ │ ├── error.h │ │ ├── factory.cc │ │ ├── factory.h │ │ ├── listener_impl.cc │ │ ├── listener_impl.h │ │ ├── reactor.cc │ │ ├── reactor.h │ │ ├── sockaddr.cc │ │ ├── sockaddr.h │ │ ├── utility.cc │ │ └── utility.h │ ├── listener.h │ ├── listener_boilerplate.h │ ├── listener_impl_boilerplate.h │ ├── shm/ │ │ ├── connection_impl.cc │ │ ├── connection_impl.h │ │ ├── context_impl.cc │ │ ├── context_impl.h │ │ ├── factory.cc │ │ ├── factory.h │ │ ├── listener_impl.cc │ │ ├── listener_impl.h │ │ ├── reactor.cc │ │ ├── reactor.h │ │ ├── sockaddr.cc │ │ └── sockaddr.h │ └── uv/ │ ├── connection_impl.cc │ ├── connection_impl.h │ ├── context_impl.cc │ ├── context_impl.h │ ├── error.cc │ ├── error.h │ ├── factory.cc │ ├── factory.h │ ├── listener_impl.cc │ ├── listener_impl.h │ ├── loop.cc │ ├── loop.h │ ├── sockaddr.cc │ ├── sockaddr.h │ ├── utility.cc │ ├── utility.h │ └── uv.h └── third_party/ └── README.md ================================================ FILE CONTENTS ================================================ ================================================ FILE: .circleci/Dockerfile.cuda10.1 ================================================ FROM nvidia/cuda:10.1-devel-ubuntu18.04 # Install APT packages. RUN apt-get update && \ apt-get install -y build-essential cmake COPY . /tensorpipe WORKDIR /tensorpipe ================================================ FILE: .circleci/Dockerfile.cuda10.2 ================================================ FROM nvidia/cuda:10.2-devel-ubuntu18.04 # Install APT packages. 
RUN apt-get update && \ apt-get install -y build-essential cmake COPY . /tensorpipe WORKDIR /tensorpipe ================================================ FILE: .circleci/Dockerfile.cuda11.0 ================================================ FROM nvidia/cuda:11.0-devel-ubuntu18.04 # Install APT packages. RUN apt-get update && \ apt-get install -y build-essential cmake COPY . /tensorpipe WORKDIR /tensorpipe ================================================ FILE: .circleci/Dockerfile.cuda11.1 ================================================ FROM nvidia/cuda:11.1-devel-ubuntu18.04 # Install APT packages. RUN apt-get update && \ apt-get install -y build-essential cmake COPY . /tensorpipe WORKDIR /tensorpipe ================================================ FILE: .circleci/Dockerfile.cuda9.2 ================================================ FROM nvidia/cuda:9.2-devel-ubuntu18.04 # Install APT packages. RUN apt-get update && \ apt-get install -y build-essential cmake COPY . /tensorpipe WORKDIR /tensorpipe ================================================ FILE: .circleci/config.yml ================================================ version: 2.1 jobs: build: parameters: docker_image: type: string default: "" apt_get: type: string default: "" c_compiler: type: string default: "" cxx_compiler: type: string default: "" cmake_args: type: string default: "" nproc: type: integer default: 20 docker: - image: << parameters.docker_image >> steps: - checkout - run: name: Install apt packages command: | apt-get update apt-get install -y git-core build-essential cmake << parameters.apt_get >> - run: name: Initialize submodules command: | git submodule init git submodule update - run: name: Build command: | mkdir build cd build cmake ../ \ -DCMAKE_C_FLAGS="-Werror -Wno-deprecated-declarations" \ -DCMAKE_CXX_FLAGS="-Werror -Wno-deprecated-declarations" \ -DCMAKE_C_COMPILER=<< parameters.c_compiler >> \ -DCMAKE_CXX_COMPILER=<< parameters.cxx_compiler >> \ -DTP_ENABLE_CMA=OFF \ 
-DTP_ENABLE_CUDA_IPC=OFF \ -DTP_ENABLE_IBV=OFF \ -DTP_BUILD_TESTING=ON \ << parameters.cmake_args >> make -j<> - run: name: Test command: | cd build ./tensorpipe/test/tensorpipe_test - run: name: Install command: | cd build make install build_gpu: parameters: cuda_version: type: string exclude_tests: type: string default: "" machine: resource_class: gpu.nvidia.small.multi image: ubuntu-1604-cuda-10.1:201909-23 docker_layer_caching: true steps: - checkout - run: name: Initialize submodules command: | git submodule init git submodule update - run: name: Build/test command: | docker build -t tensorpipe -f .circleci/Dockerfile.cuda<< parameters.cuda_version >> . docker run --gpus all --pid=host tensorpipe sh -c " mkdir build && cd build && cmake ../ \ -DCMAKE_C_FLAGS=\"-Werror -Wno-deprecated-declarations\" \ -DCMAKE_CXX_FLAGS=\"-Werror -Wno-deprecated-declarations\" \ -DCUDA_NVCC_FLAGS=\"-gencode arch=compute_61,code=sm_61\" \ -DTP_ENABLE_SHM=OFF \ -DTP_ENABLE_CMA=OFF \ -DTP_USE_CUDA=ON \ -DTP_ENABLE_CUDA_IPC=ON \ -DTP_ENABLE_IBV=OFF \ -DTP_BUILD_TESTING=ON && make -j20 && ./tensorpipe/test/tensorpipe_test --gtest_filter='-<< parameters.exclude_tests >>' && make install" bare_metal: parameters: image: type: string default: "" apt_get: type: string default: "" c_compiler: type: string default: "" cxx_compiler: type: string default: "" cmake_args: type: string default: "" nproc: type: integer default: 20 machine: image: << parameters.image >> steps: - checkout - run: name: Install apt packages command: | sudo apt-get update sudo apt-get install -y git-core build-essential cmake libibverbs1 rdma-core linux-modules-extra-$(uname -r) << parameters.apt_get >> - run: name: Initialize submodules command: | git submodule init git submodule update - run: name: Build command: | mkdir build cd build cmake ../ \ -DCMAKE_C_FLAGS="-Werror -Wno-deprecated-declarations" \ -DCMAKE_CXX_FLAGS="-Werror -Wno-deprecated-declarations" \ -DCMAKE_C_COMPILER=<< parameters.c_compiler >> \ 
-DCMAKE_CXX_COMPILER=<< parameters.cxx_compiler >> \ -DTP_ENABLE_CUDA_IPC=OFF \ -DTP_ENABLE_IBV=ON \ -DTP_BUILD_TESTING=ON \ << parameters.cmake_args >> make -j<> - run: name: Configure Soft-RoCE (RXE) InfiniBand interface command: | # Find the name of the first non-loopback IP interface INTERFACE_NAME=$(ip link | grep '^2: ' | sed -re 's/2: ([a-z0-9]+): .*/\1/') sudo rdma link add rxe0 type rxe netdev $INTERFACE_NAME - run: name: Test command: | cd build ./tensorpipe/test/tensorpipe_test - run: name: Test CMA channel autodetection with Docker command: | bash -eo pipefail tensorpipe/test/channel/cma/docker_tests.sh - run: name: Install command: | cd build sudo make install build_osx: macos: xcode: 12.4.0 steps: - checkout - run: name: Install homebrew packages command: | brew install cmake - run: name: Initialize submodules command: | git submodule init git submodule update - run: name: Build command: | mkdir build cd build cmake ../ \ -DCMAKE_C_FLAGS="-Werror -Wno-deprecated-declarations" \ -DCMAKE_CXX_FLAGS="-Werror -Wno-deprecated-declarations" \ -DTP_BUILD_TESTING=ON make -j - run: name: Test command: | cd build ./tensorpipe/test/tensorpipe_test - run: name: Install command: | cd build make install python: parameters: docker_image: type: string default: "" apt_get: type: string default: "" docker: - image: << parameters.docker_image >> steps: - checkout - run: name: Install apt packages command: | apt-get update apt-get install -y git-core build-essential cmake python3-dev python3-venv << parameters.apt_get >> - run: name: Initialize submodules command: | git submodule init git submodule update - run: name: Build command: | python3 -m venv venv source venv/bin/activate TP_ENABLE_CMA=OFF TP_ENABLE_CUDA_IPC=OFF TP_ENABLE_IBV=OFF python3 setup.py install - run: name: Test command: | source venv/bin/activate python3 tensorpipe/test/python/tensorpipe.py format: docker: - image: ubuntu:18.04 steps: - checkout - run: name: Install clang-format command: | apt-get 
update apt-get install -y git-core clang-format-10 - run: name: Verify clang-format command: | git ls-files | grep -E '\.(cc|h)$' | xargs clang-format-10 -i if git diff --quiet; then echo "Formatting OK!" else echo "Formatting not OK!" echo "------------------" git --no-pager diff --color exit 1 fi workflows: build: jobs: - build: name: gcc5 docker_image: ubuntu:18.04 apt_get: "gcc-5 g++-5" c_compiler: gcc-5 cxx_compiler: g++-5 - build: name: gcc7 docker_image: ubuntu:18.04 apt_get: "gcc-7 g++-7" c_compiler: gcc-7 cxx_compiler: g++-7 - build: name: clang6 docker_image: ubuntu:18.04 apt_get: "clang-6.0" c_compiler: clang-6.0 cxx_compiler: clang++-6.0 - build: name: gcc7-asan docker_image: ubuntu:18.04 apt_get: "gcc-7 g++-7" c_compiler: gcc-7 cxx_compiler: g++-7 cmake_args: -DSANITIZE=address - build: name: gcc7-tsan docker_image: ubuntu:18.04 apt_get: "gcc-7 g++-7" c_compiler: gcc-7 cxx_compiler: g++-7 cmake_args: -DSANITIZE=thread - bare_metal: name: bare-metal image: ubuntu-2004:202008-01 apt_get: "gcc-7 g++-7" c_compiler: gcc-7 cxx_compiler: g++-7 - build_gpu: name: GPU (CUDA 9.2) cuda_version: "9.2" # Excluding CudaGdr for lack of InfiniBand hardware, and CudaIpc on # multi GPU for lack of p2p capabilities. exclude_tests: "CudaGdr*:CudaIpc/CudaMultiGPUChannelTestSuite*" - build_gpu: name: GPU (CUDA 10.1) cuda_version: "10.1" # Excluding CudaGdr for lack of InfiniBand hardware, and CudaIpc on # multi GPU for lack of p2p capabilities. exclude_tests: "CudaGdr*:CudaIpc/CudaMultiGPUChannelTestSuite*" - build_gpu: name: GPU (CUDA 10.2) cuda_version: "10.2" # Excluding CudaGdr for lack of InfiniBand hardware, and CudaIpc on # multi GPU for lack of p2p capabilities. exclude_tests: "CudaGdr*:CudaIpc/CudaMultiGPUChannelTestSuite*" - build_gpu: name: GPU (CUDA 11.0) cuda_version: "11.0" # Excluding CudaGdr for lack of InfiniBand hardware, and CudaIpc on # multi GPU for lack of p2p capabilities. 
exclude_tests: "CudaGdr*:CudaIpc/CudaMultiGPUChannelTestSuite*" - build_gpu: name: GPU (CUDA 11.1) cuda_version: "11.1" # Excluding CudaGdr for lack of InfiniBand hardware, and CudaIpc on # multi GPU for lack of p2p capabilities, and CudaBasic/CudaMultiGPUChannelTestSuite.SendAcrossNonDefaultDevices/0 # because it does not work with CUDA 11.1 (cf. https://github.com/pytorch/tensorpipe/issues/368). exclude_tests: "CudaGdr*:CudaIpc/CudaMultiGPUChannelTestSuite*:CudaBasic/CudaMultiGPUChannelTestSuite.SendAcrossNonDefaultDevices/0" - build_osx: name: OSX - python: name: python docker_image: ubuntu:18.04 apt_get: "clang-6.0" - format: name: clang-format ================================================ FILE: .gitignore ================================================ *~ .DS_Store /build/ /cmake-build-debug/ ================================================ FILE: .gitmodules ================================================ [submodule "third_party/pybind11"] path = third_party/pybind11 url = https://github.com/pybind/pybind11.git [submodule "third_party/libuv"] path = third_party/libuv url = https://github.com/libuv/libuv.git branch = v1.x [submodule "third_party/googletest"] path = third_party/googletest url = https://github.com/google/googletest.git [submodule "third_party/libnop"] path = third_party/libnop url = https://github.com/google/libnop.git ================================================ FILE: CMakeLists.txt ================================================ # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. cmake_minimum_required(VERSION 3.18 FATAL_ERROR) project(tensorpipe LANGUAGES C CXX) set(CMAKE_CXX_STANDARD 17) list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") # Expose build options. include(Options) # Define sanitizer option, if specified. 
include(Sanitize) # Misc checks to cope with various compiler modes. include(MiscCheck) add_subdirectory(tensorpipe) install(EXPORT TensorpipeTargets DESTINATION share/cmake/Tensorpipe FILE TensorpipeTargets.cmake) ================================================ FILE: CODE_OF_CONDUCT.md ================================================ # Code of Conduct ## Our Pledge In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to make participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation. ## Our Standards Examples of behavior that contributes to creating a positive environment include: * Using welcoming and inclusive language * Being respectful of differing viewpoints and experiences * Gracefully accepting constructive criticism * Focusing on what is best for the community * Showing empathy towards other community members Examples of unacceptable behavior by participants include: * The use of sexualized language or imagery and unwelcome sexual attention or advances * Trolling, insulting/derogatory comments, and personal or political attacks * Public or private harassment * Publishing others' private information, such as a physical or electronic address, without explicit permission * Other conduct which could reasonably be considered inappropriate in a professional setting ## Our Responsibilities Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 
Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. ## Scope This Code of Conduct applies within all project spaces, and it also applies when an individual is representing the project or its community in public spaces. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. ## Enforcement Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at . All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 
## Attribution This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html [homepage]: https://www.contributor-covenant.org For answers to common questions about this code of conduct, see https://www.contributor-covenant.org/faq ================================================ FILE: CONTRIBUTING.md ================================================ # Contributing to TensorPipe We want to make contributing to this project as easy and transparent as possible. ## Our Development Process This project's source-of-truth is the version in Facebook's internal codebase, which is continuously synced with the GitHub mirror using [ShipIt](https://github.com/facebook/fbshipit). Pull requests on GitHub are copied over using ImportIt (a companion tool for ShipIt). ## Pull Requests We actively welcome your pull requests. 1. Fork the repo and create your branch from `main`. 2. If you've added code that should be tested, add tests. 3. If you've changed APIs, update the documentation. 4. Ensure the test suite passes. 5. Make sure your code lints. 6. If you haven't already, complete the Contributor License Agreement ("CLA"). ## Contributor License Agreement ("CLA") In order to accept your pull request, we need you to submit a CLA. You only need to do this once to work on any of Facebook's open source projects. Complete your CLA here: ## Issues We use GitHub issues to track public bugs. Please ensure your description is clear and has sufficient instructions to be able to reproduce the issue. Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe disclosure of security bugs. In those cases, please go through the process outlined on that page and do not file a public issue. ## Coding Style This source code is formatted using `clang-format`, with project-specific rules recorded in the `.clang-format` file. 
## License By contributing to TensorPipe, you agree that your contributions will be licensed under the LICENSE.txt file in the root directory of this source tree. ================================================ FILE: LICENSE.txt ================================================ BSD License For TensorPipe software Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name Meta nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
================================================ FILE: README.md ================================================ # TensorPipe The TensorPipe project provides a tensor-aware channel to transfer rich objects from one process to another while using the fastest transport for the tensors contained therein (e.g., CUDA device-to-device copy). > :warning: Update (2025-12) tensorpipe is in maintenance mode and no new changes are planned beyond minimal build fixes. Please see https://github.com/meta-pytorch/torchcomms and https://github.com/meta-pytorch/monarch for alternatives. ## Getting started First clone the repository: ```shell $ git clone --recursive https://github.com/pytorch/tensorpipe ``` Then, build as follows (using ninja instead of make): ``` shell $ cd tensorpipe $ mkdir build $ cd build $ cmake ../ -GNinja $ ninja ``` You can find test executables in `build/tensorpipe/test`. ## Interface There are four classes you need to know about: - `tensorpipe::Context`, which keeps track of the global state of the system, such as thread pools, open file descriptors, etc. - `tensorpipe::Listener`, which allows one process to open an entry point for other processes to connect to. - `tensorpipe::Pipe`, the one communication primitive that this entire project is about. You can obtain one either by connecting to the listener of another process or from such a listener when another process connects to it. Once you have a pipe, you can send messages on it, and that's the whole point. - `tensorpipe::Message`, which is the language that pipes read and write in. Pipes are streams of structured messages (not just raw byte buffers), and a message is composed of a "core" payload (memory living on CPU) plus a list of tensors (memory living on any device, like GPUs). Sending a message from one end of the pipe to the other can be achieved using the `write` method, which takes a message (with the data to send) and a callback which will be invoked once the sending has completed. 
This callback will be invoked with an error (if one happened) and with the message. Receiving a message takes two steps: on an incoming message, first the pipe asks you to provide some memory to hold the message in, and then you ask the pipe to read the data into that memory. In order to do this, first you must register a callback that will be notified for incoming messages. This is performed by calling the `readDescriptor` method with said callback. The callback will be invoked with a so-called descriptor, which can be seen as a "message skeleton", i.e., a message with no buffers attached to it (they are set to null pointers). The job of this callback is filling in those buffers, either by allocating the required memory or by obtaining it from somewhere else (from a cache, as a slice of a batch that's being assembled, ...). This descriptor also contains some metadata, given by the sender, which can be used to provide allocation hints or any other information that can help the receiver determine where to store the data. Once the message's buffers are ready, you can tell the pipe to go ahead and fill them in with the incoming data by passing the message to the `read` method, together with a callback which will be called when all the data has been received and stored. As when writing, this callback will be given a (possibly empty) error and the original message. The `readDescriptor` callback is one-shot, which means that after it fires it "expires" and will not be called again. It must be re-armed for a new event to be received. When you pass a message to the pipe, to send it or to receive into it, you must not tamper with the underlying memory until the callback has completed, even if the `write` or `read` call already returned. (The `write` and `read` calls, and all other calls, are non-blocking so that it's easier to schedule asynchronous parallel transfers without having to use threads). 
This means you can not deallocate the memory or alter it in any way, as the pipe may still be reading or modifying it. In other terms, you relinquish control over the memory when you pass a message to the pipe, only to reacquire it once the message is given back to you in the callback. This contract is encoded by the requirement to move the messages into and out of the pipe (using rvalue references). Also, because of this agreement, all callbacks will always be called, even if the pipe is closed or if it errors, in order to give back the memory. The order in which messages are written to a pipe is preserved when these messages are read on the other side. Moreover, for a given pipe endpoint, the callbacks of the performed operations are executed in the same order that these operations were scheduled, even if the operations are performed asynchronously or out-of-band and thus may overlap or occur out of order. What this means is that if two write operations are scheduled one after the other back-to-back, even if the second one completes before the first one, its callback is delayed until the first one also completes and its callback is invoked. The same applies for reads. All the callbacks of all the pipes in a given context are called from the same per-context thread and thus no two callbacks will occur at the same time. However, different contexts will use different threads and their callbacks may thus overlap. All the callbacks are invoked with an error reference. This may be "empty", i.e., indicate that no error has in fact occurred. In this case, the error object evaluates to false. In case of an actual error it will instead evaluate to true. When invoked with an error, the remaining arguments of the callback may be meaningless. For the `read` and `write` callbacks they will still contain the message that these methods will be invoked with, but the `readDescriptor` one will be an empty or invalid message. It should not be used. 
There is no expectation for the `readDescriptor` callback to be armed at all times. Similarly, it is not necessary to call the `read` method immediately after a descriptor has been read. Both these possibilities are by design, in order to allow the user of the pipe to apply some backpressure in case it's receiving messages at a faster rate than it can handle, or for any other reason. This backpressure will be propagated to the lower-level components as far down as possible (e.g., by stopping listening for readability events on the socket file descriptor). ## Transports and channels TensorPipe aims to be "backend-agnostic": it doesn't want to be restricted to a single way of copying data around but wants to be able to choose the fastest medium from a library of backends, based on the circumstances (e.g., are the two processes on the same machine?) and on the available hardware (e.g., are the GPUs connected with NVLink?). TensorPipe strives to have the largest selection of backends, enabling users to implement specific backends for their systems (should the default ones prove limited) and encouraging contributions. The two processes that are establishing a pipe will automatically negotiate during setup to determine which of the backends they have at their disposal can be used and how well they would perform, in order to choose the best one in a way that is completely transparent to the user. Backends come in two flavors: - Transports are the connections used by the pipes to transfer control messages, and the (smallish) core payloads. They are meant to be lightweight and low-latency. The most basic transport is a simple TCP one, which should work in all scenarios. A more optimized one, for example, is based on a ring buffer allocated in shared memory, which two processes on the same machine can use to communicate by performing just a memory copy, without passing through the kernel. 
- Channels are where the heavy lifting takes place, as they take care of
  copying the (larger) tensor data. High bandwidths are a requirement. Examples
  include multiplexing chunks of data across multiple TCP sockets and
  processes, so as to saturate the NIC's bandwidth. Or using a CUDA memcpy call
  to transfer memory from one GPU to another using NVLink.

These different usage patterns promote different design choices when
implementing transports and channels, which means the two are not perfectly
interchangeable. For example, a TCP-based transport is best implemented using a
single connection, whereas a TCP-based channel will benefit from using multiple
connections, chunking and multiplexing the payload over them in order to
saturate the bandwidth even on the most powerful NICs. Moreover, the APIs of
transports and channels put different constraints on them, which demand and
permit different approaches. As a rule of thumb, we require more from the
transports: the only out-of-band information they can use is a simple address,
which is all they can use to bootstrap the connection, and they need to include
some "signaling" capabilities (a write on one side "wakes up" the other side by
causing a read). Channels, on the other hand, have much looser requirements:
they basically just need to implement a `memcpy` and, for anything beyond that,
they can leverage a transport that the pipe gives to them for support.

## License

TensorPipe is BSD licensed, as found in the [LICENSE.txt](LICENSE.txt) file.

================================================ FILE: cmake/FindPackageHandleStandardArgs.cmake ================================================

# Copyright 2000-2020 Kitware, Inc. and Contributors
# All rights reserved.
#
# Distributed under the OSI-approved BSD 3-Clause License. See
# https://cmake.org/licensing for details.
#[=======================================================================[.rst: FindPackageHandleStandardArgs ----------------------------- This module provides a function intended to be used in :ref:`Find Modules` implementing :command:`find_package()` calls. It handles the ``REQUIRED``, ``QUIET`` and version-related arguments of ``find_package``. It also sets the ``_FOUND`` variable. The package is considered found if all variables listed contain valid results, e.g. valid filepaths. .. command:: find_package_handle_standard_args There are two signatures:: find_package_handle_standard_args( (DEFAULT_MSG|) ... ) find_package_handle_standard_args( [FOUND_VAR ] [REQUIRED_VARS ...] [VERSION_VAR ] [HANDLE_COMPONENTS] [CONFIG_MODE] [FAIL_MESSAGE ] ) The ``_FOUND`` variable will be set to ``TRUE`` if all the variables ``...`` are valid and any optional constraints are satisfied, and ``FALSE`` otherwise. A success or failure message may be displayed based on the results and on whether the ``REQUIRED`` and/or ``QUIET`` option was given to the :command:`find_package` call. The options are: ``(DEFAULT_MSG|)`` In the simple signature this specifies the failure message. Use ``DEFAULT_MSG`` to ask for a default message to be computed (recommended). Not valid in the full signature. ``FOUND_VAR `` Obsolete. Specifies either ``_FOUND`` or ``_FOUND`` as the result variable. This exists only for compatibility with older versions of CMake and is now ignored. Result variables of both names are always set for compatibility. ``REQUIRED_VARS ...`` Specify the variables which are required for this package. These may be named in the generated failure message asking the user to set the missing variable values. Therefore these should typically be cache entries such as ``FOO_LIBRARY`` and not output variables like ``FOO_LIBRARIES``. ``VERSION_VAR `` Specify the name of a variable that holds the version of the package that has been found. 
This version will be checked against the (potentially) specified required version given to the :command:`find_package` call, including its ``EXACT`` option. The default messages include information about the required version and the version which has been actually found, both if the version is ok or not. ``HANDLE_COMPONENTS`` Enable handling of package components. In this case, the command will report which components have been found and which are missing, and the ``_FOUND`` variable will be set to ``FALSE`` if any of the required components (i.e. not the ones listed after the ``OPTIONAL_COMPONENTS`` option of :command:`find_package`) are missing. ``CONFIG_MODE`` Specify that the calling find module is a wrapper around a call to ``find_package( NO_MODULE)``. This implies a ``VERSION_VAR`` value of ``_VERSION``. The command will automatically check whether the package configuration file was found. ``FAIL_MESSAGE `` Specify a custom failure message instead of using the default generated message. Not recommended. Example for the simple signature: .. code-block:: cmake find_package_handle_standard_args(LibXml2 DEFAULT_MSG LIBXML2_LIBRARY LIBXML2_INCLUDE_DIR) The ``LibXml2`` package is considered to be found if both ``LIBXML2_LIBRARY`` and ``LIBXML2_INCLUDE_DIR`` are valid. Then also ``LibXml2_FOUND`` is set to ``TRUE``. If it is not found and ``REQUIRED`` was used, it fails with a :command:`message(FATAL_ERROR)`, independent whether ``QUIET`` was used or not. If it is found, success will be reported, including the content of the first ````. On repeated CMake runs, the same message will not be printed again. Example for the full signature: .. code-block:: cmake find_package_handle_standard_args(LibArchive REQUIRED_VARS LibArchive_LIBRARY LibArchive_INCLUDE_DIR VERSION_VAR LibArchive_VERSION) In this case, the ``LibArchive`` package is considered to be found if both ``LibArchive_LIBRARY`` and ``LibArchive_INCLUDE_DIR`` are valid. 
Also the version of ``LibArchive`` will be checked by using the version
contained in ``LibArchive_VERSION``. Since no ``FAIL_MESSAGE`` is given,
the default messages will be printed.

Another example for the full signature:

.. code-block:: cmake

  find_package(Automoc4 QUIET NO_MODULE HINTS /opt/automoc4)
  find_package_handle_standard_args(Automoc4 CONFIG_MODE)

In this case, a ``FindAutmoc4.cmake`` module wraps a call to
``find_package(Automoc4 NO_MODULE)`` and adds an additional search
directory for ``automoc4``. Then the call to
``find_package_handle_standard_args`` produces a proper success/failure
message.
#]=======================================================================]

include(${CMAKE_CURRENT_LIST_DIR}/FindPackageMessage.cmake)

# internal helper macro
# Reports a failure: fatal if the package was REQUIRED, a status message
# otherwise (suppressed under QUIET).
macro(_FPHSA_FAILURE_MESSAGE _msg)
  if (${_NAME}_FIND_REQUIRED)
    message(FATAL_ERROR "${_msg}")
  else ()
    if (NOT ${_NAME}_FIND_QUIETLY)
      message(STATUS "${_msg}")
    endif ()
  endif ()
endmacro()

# internal helper macro to generate the failure message when used in CONFIG_MODE:
macro(_FPHSA_HANDLE_FAILURE_CONFIG_MODE)
  # _CONFIG is set, but FOUND is false, this means that some other of the REQUIRED_VARS was not found:
  if(${_NAME}_CONFIG)
    _FPHSA_FAILURE_MESSAGE("${FPHSA_FAIL_MESSAGE}: missing:${MISSING_VARS} (found ${${_NAME}_CONFIG} ${VERSION_MSG})")
  else()
    # If _CONSIDERED_CONFIGS is set, the config-file has been found, but no suitable version.
    # List them all in the error message:
    if(${_NAME}_CONSIDERED_CONFIGS)
      set(configsText "")
      list(LENGTH ${_NAME}_CONSIDERED_CONFIGS configsCount)
      math(EXPR configsCount "${configsCount} - 1")
      foreach(currentConfigIndex RANGE ${configsCount})
        list(GET ${_NAME}_CONSIDERED_CONFIGS ${currentConfigIndex} filename)
        list(GET ${_NAME}_CONSIDERED_VERSIONS ${currentConfigIndex} version)
        # FIX: this line previously appended the literal text "$(unknown)",
        # leaving the `filename` variable retrieved above unused; interpolate
        # the config file name as the upstream CMake module does.
        string(APPEND configsText "    ${filename} (version ${version})\n")
      endforeach()
      if (${_NAME}_NOT_FOUND_MESSAGE)
        string(APPEND configsText "    Reason given by package: ${${_NAME}_NOT_FOUND_MESSAGE}\n")
      endif()
      _FPHSA_FAILURE_MESSAGE("${FPHSA_FAIL_MESSAGE} ${VERSION_MSG}, checked the following files:\n${configsText}")
    else()
      # Simple case: No Config-file was found at all:
      _FPHSA_FAILURE_MESSAGE("${FPHSA_FAIL_MESSAGE}: found neither ${_NAME}Config.cmake nor ${_NAME_LOWER}-config.cmake ${VERSION_MSG}")
    endif()
  endif()
endmacro()

function(FIND_PACKAGE_HANDLE_STANDARD_ARGS _NAME _FIRST_ARG)

  # Set up the arguments for `cmake_parse_arguments`.
  set(options CONFIG_MODE HANDLE_COMPONENTS)
  set(oneValueArgs FAIL_MESSAGE VERSION_VAR FOUND_VAR)
  set(multiValueArgs REQUIRED_VARS)

  # Check whether we are in 'simple' or 'extended' mode:
  set(_KEYWORDS_FOR_EXTENDED_MODE ${options} ${oneValueArgs} ${multiValueArgs} )
  list(FIND _KEYWORDS_FOR_EXTENDED_MODE "${_FIRST_ARG}" INDEX)

  if(${INDEX} EQUAL -1)
    # Simple signature: first arg is the fail message, rest are required vars.
    set(FPHSA_FAIL_MESSAGE ${_FIRST_ARG})
    set(FPHSA_REQUIRED_VARS ${ARGN})
    set(FPHSA_VERSION_VAR)
  else()
    # Full signature: parse keyword arguments.
    cmake_parse_arguments(FPHSA "${options}" "${oneValueArgs}" "${multiValueArgs}" ${_FIRST_ARG} ${ARGN})

    if(FPHSA_UNPARSED_ARGUMENTS)
      message(FATAL_ERROR "Unknown keywords given to FIND_PACKAGE_HANDLE_STANDARD_ARGS(): \"${FPHSA_UNPARSED_ARGUMENTS}\"")
    endif()

    if(NOT FPHSA_FAIL_MESSAGE)
      set(FPHSA_FAIL_MESSAGE "DEFAULT_MSG")
    endif()

    # In config-mode, we rely on the variable _CONFIG, which is set by find_package()
    # when it successfully found the config-file, including version checking:
    if(FPHSA_CONFIG_MODE)
      list(INSERT FPHSA_REQUIRED_VARS 0 ${_NAME}_CONFIG)
      list(REMOVE_DUPLICATES FPHSA_REQUIRED_VARS)
      set(FPHSA_VERSION_VAR ${_NAME}_VERSION)
    endif()

    if(NOT FPHSA_REQUIRED_VARS)
      message(FATAL_ERROR "No REQUIRED_VARS specified for FIND_PACKAGE_HANDLE_STANDARD_ARGS()")
    endif()
  endif()

  # now that we collected all arguments, process them

  if("x${FPHSA_FAIL_MESSAGE}" STREQUAL "xDEFAULT_MSG")
    set(FPHSA_FAIL_MESSAGE "Could NOT find ${_NAME}")
  endif()

  list(GET FPHSA_REQUIRED_VARS 0 _FIRST_REQUIRED_VAR)

  string(TOUPPER ${_NAME} _NAME_UPPER)
  string(TOLOWER ${_NAME} _NAME_LOWER)

  if(FPHSA_FOUND_VAR)
    # Only <name>_FOUND or <NAME>_FOUND are accepted; both are set anyway.
    if(FPHSA_FOUND_VAR MATCHES "^${_NAME}_FOUND$" OR FPHSA_FOUND_VAR MATCHES "^${_NAME_UPPER}_FOUND$")
      set(_FOUND_VAR ${FPHSA_FOUND_VAR})
    else()
      message(FATAL_ERROR "The argument for FOUND_VAR is \"${FPHSA_FOUND_VAR}\", but only \"${_NAME}_FOUND\" and \"${_NAME_UPPER}_FOUND\" are valid names.")
    endif()
  else()
    set(_FOUND_VAR ${_NAME_UPPER}_FOUND)
  endif()

  # collect all variables which were not found, so they can be printed, so the
  # user knows better what went wrong (#6375)
  set(MISSING_VARS "")
  set(DETAILS "")
  # check if all passed variables are valid
  set(FPHSA_FOUND_${_NAME} TRUE)
  foreach(_CURRENT_VAR ${FPHSA_REQUIRED_VARS})
    if(NOT ${_CURRENT_VAR})
      set(FPHSA_FOUND_${_NAME} FALSE)
      string(APPEND MISSING_VARS " ${_CURRENT_VAR}")
    else()
      string(APPEND DETAILS "[${${_CURRENT_VAR}}]")
    endif()
  endforeach()
  if(FPHSA_FOUND_${_NAME})
    set(${_NAME}_FOUND TRUE)
    set(${_NAME_UPPER}_FOUND TRUE)
  else()
    set(${_NAME}_FOUND FALSE)
    set(${_NAME_UPPER}_FOUND FALSE)
  endif()

  # component handling
  unset(FOUND_COMPONENTS_MSG)
  unset(MISSING_COMPONENTS_MSG)

  if(FPHSA_HANDLE_COMPONENTS)
    foreach(comp ${${_NAME}_FIND_COMPONENTS})
      if(${_NAME}_${comp}_FOUND)
        if(NOT DEFINED FOUND_COMPONENTS_MSG)
          set(FOUND_COMPONENTS_MSG "found components: ")
        endif()
        string(APPEND FOUND_COMPONENTS_MSG " ${comp}")
      else()
        if(NOT DEFINED MISSING_COMPONENTS_MSG)
          set(MISSING_COMPONENTS_MSG "missing components: ")
        endif()
        string(APPEND MISSING_COMPONENTS_MSG " ${comp}")
        # Only required (non-OPTIONAL_COMPONENTS) components affect FOUND.
        if(${_NAME}_FIND_REQUIRED_${comp})
          set(${_NAME}_FOUND FALSE)
          string(APPEND MISSING_VARS " ${comp}")
        endif()
      endif()
    endforeach()
    set(COMPONENT_MSG "${FOUND_COMPONENTS_MSG} ${MISSING_COMPONENTS_MSG}")
    string(APPEND DETAILS "[c${COMPONENT_MSG}]")
  endif()

  # version handling:
  set(VERSION_MSG "")
  set(VERSION_OK TRUE)

  # check with DEFINED here as the requested or found version may be "0"
  if (DEFINED ${_NAME}_FIND_VERSION)
    if(DEFINED ${FPHSA_VERSION_VAR})
      set(_FOUND_VERSION ${${FPHSA_VERSION_VAR}})

      if(${_NAME}_FIND_VERSION_EXACT)       # exact version required
        # count the dots in the version string
        string(REGEX REPLACE "[^.]" "" _VERSION_DOTS "${_FOUND_VERSION}")
        # add one dot because there is one dot more than there are components
        string(LENGTH "${_VERSION_DOTS}." _VERSION_DOTS)
        if (_VERSION_DOTS GREATER ${_NAME}_FIND_VERSION_COUNT)
          # Because of the C++ implementation of find_package() ${_NAME}_FIND_VERSION_COUNT
          # is at most 4 here. Therefore a simple lookup table is used.
          if (${_NAME}_FIND_VERSION_COUNT EQUAL 1)
            set(_VERSION_REGEX "[^.]*")
          elseif (${_NAME}_FIND_VERSION_COUNT EQUAL 2)
            set(_VERSION_REGEX "[^.]*\\.[^.]*")
          elseif (${_NAME}_FIND_VERSION_COUNT EQUAL 3)
            set(_VERSION_REGEX "[^.]*\\.[^.]*\\.[^.]*")
          else ()
            set(_VERSION_REGEX "[^.]*\\.[^.]*\\.[^.]*\\.[^.]*")
          endif ()
          string(REGEX REPLACE "^(${_VERSION_REGEX})\\..*" "\\1" _VERSION_HEAD "${_FOUND_VERSION}")
          unset(_VERSION_REGEX)
          if (NOT ${_NAME}_FIND_VERSION VERSION_EQUAL _VERSION_HEAD)
            set(VERSION_MSG "Found unsuitable version \"${_FOUND_VERSION}\", but required is exact version \"${${_NAME}_FIND_VERSION}\"")
            set(VERSION_OK FALSE)
          else ()
            set(VERSION_MSG "(found suitable exact version \"${_FOUND_VERSION}\")")
          endif ()
          unset(_VERSION_HEAD)
        else ()
          if (NOT ${_NAME}_FIND_VERSION VERSION_EQUAL _FOUND_VERSION)
            set(VERSION_MSG "Found unsuitable version \"${_FOUND_VERSION}\", but required is exact version \"${${_NAME}_FIND_VERSION}\"")
            set(VERSION_OK FALSE)
          else ()
            set(VERSION_MSG "(found suitable exact version \"${_FOUND_VERSION}\")")
          endif ()
        endif ()
        unset(_VERSION_DOTS)
      else()     # minimum version specified:
        if (${_NAME}_FIND_VERSION VERSION_GREATER _FOUND_VERSION)
          set(VERSION_MSG "Found unsuitable version \"${_FOUND_VERSION}\", but required is at least \"${${_NAME}_FIND_VERSION}\"")
          set(VERSION_OK FALSE)
        else ()
          set(VERSION_MSG "(found suitable version \"${_FOUND_VERSION}\", minimum required is \"${${_NAME}_FIND_VERSION}\")")
        endif ()
      endif()
    else()
      # if the package was not found, but a version was given, add that to the output:
      if(${_NAME}_FIND_VERSION_EXACT)
        set(VERSION_MSG "(Required is exact version \"${${_NAME}_FIND_VERSION}\")")
      else()
        set(VERSION_MSG "(Required is at least version \"${${_NAME}_FIND_VERSION}\")")
      endif()
    endif()
  else ()
    # Check with DEFINED as the found version may be 0.
    if(DEFINED ${FPHSA_VERSION_VAR})
      set(VERSION_MSG "(found version \"${${FPHSA_VERSION_VAR}}\")")
    endif()
  endif ()

  if(VERSION_OK)
    string(APPEND DETAILS "[v${${FPHSA_VERSION_VAR}}(${${_NAME}_FIND_VERSION})]")
  else()
    set(${_NAME}_FOUND FALSE)
  endif()

  # print the result:
  if (${_NAME}_FOUND)
    FIND_PACKAGE_MESSAGE(${_NAME} "Found ${_NAME}: ${${_FIRST_REQUIRED_VAR}} ${VERSION_MSG} ${COMPONENT_MSG}" "${DETAILS}")
  else ()
    if(FPHSA_CONFIG_MODE)
      _FPHSA_HANDLE_FAILURE_CONFIG_MODE()
    else()
      if(NOT VERSION_OK)
        _FPHSA_FAILURE_MESSAGE("${FPHSA_FAIL_MESSAGE}: ${VERSION_MSG} (found ${${_FIRST_REQUIRED_VAR}})")
      else()
        _FPHSA_FAILURE_MESSAGE("${FPHSA_FAIL_MESSAGE} (missing:${MISSING_VARS}) ${VERSION_MSG}")
      endif()
    endif()
  endif ()

  # Propagate the result variables to the caller's scope.
  set(${_NAME}_FOUND ${${_NAME}_FOUND} PARENT_SCOPE)
  set(${_NAME_UPPER}_FOUND ${${_NAME}_FOUND} PARENT_SCOPE)
endfunction()

================================================ FILE: cmake/FindPackageMessage.cmake ================================================

# Copyright 2000-2020 Kitware, Inc. and Contributors
# All rights reserved.
#
# Distributed under the OSI-approved BSD 3-Clause License. See
# https://cmake.org/licensing for details.

#.rst:
# FindPackageMessage
# ------------------
#
#
#
# FIND_PACKAGE_MESSAGE( "message for user" "find result details")
#
# This macro is intended to be used in FindXXX.cmake modules files.  It
# will print a message once for each unique find result.  This is useful
# for telling the user where a package was found.  The first argument
# specifies the name (XXX) of the package.  The second argument
# specifies the message to display.  The third argument lists details
# about the find result so that if they change the message will be
# displayed again.  The macro also obeys the QUIET argument to the
# find_package command.
#
# Example:
#
# ::
#
#   if(X11_FOUND)
#     FIND_PACKAGE_MESSAGE(X11 "Found X11: ${X11_X11_LIB}"
#       "[${X11_X11_LIB}][${X11_INCLUDE_DIR}]")
#   else()
#    ...
#   endif()

function(FIND_PACKAGE_MESSAGE pkg msg details)
  # Avoid printing a message repeatedly for the same find result.
  if(NOT ${pkg}_FIND_QUIETLY)
    string(REPLACE "\n" "" details "${details}")
    set(DETAILS_VAR FIND_PACKAGE_MESSAGE_DETAILS_${pkg})
    if(NOT "${details}" STREQUAL "${${DETAILS_VAR}}")
      # The message has not yet been printed.
      message(STATUS "${msg}")
      # Save the find details in the cache to avoid printing the same
      # message again.
      set("${DETAILS_VAR}" "${details}"
          CACHE INTERNAL "Details about finding ${pkg}")
    endif()
  endif()
endfunction()

================================================ FILE: cmake/Finduv.cmake ================================================

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
#
# Finduv
# ------
#
# Imported Targets
# ^^^^^^^^^^^^^^^^
#
# An imported target named ``uv::uv`` is provided if libuv has been found.
#
# Result Variables
# ^^^^^^^^^^^^^^^^
#
# This module defines the following variables:
#
# ``uv_FOUND``
#   True if libuv was found, false otherwise.
# ``uv_LIBRARY_DIRS``
#   The path(s) to uv libraries.
# ``uv_VERSION``
#   The version of libuv found.
#

# First, try to find a system libuv via pkg-config, unless the user explicitly
# forced the bundled submodule with TP_BUILD_LIBUV.
find_package(PkgConfig QUIET)
if((NOT TP_BUILD_LIBUV) AND PkgConfig_FOUND)
  pkg_check_modules(uv QUIET IMPORTED_TARGET GLOBAL libuv)
  if(uv_FOUND)
    add_library(uv::uv ALIAS PkgConfig::uv)
  endif()
endif()

# Fall back to building libuv from the vendored submodule.
if(NOT uv_FOUND)
  set(uv_VERSION "1.51.0")
  set(uv_LIBRARY_DIRS "submodule")
  set(libuv_DIR ${PROJECT_SOURCE_DIR}/third_party/libuv)
  add_subdirectory(${libuv_DIR} ${PROJECT_BINARY_DIR}/third_party/libuv
                   EXCLUDE_FROM_ALL)
  # This hack duplicates the `uv_a` target, so that we can call
  # install(TARGETS ... EXPORT) on it, which is not possible when the target is
  # defined in a subdirectory in CMake 3.5.
  get_target_property(_uv_sources uv_a SOURCES)
  set(_uv_sources_abs)
  foreach(_uv_src ${_uv_sources})
    list(APPEND _uv_sources_abs "${libuv_DIR}/${_uv_src}")
  endforeach()
  add_library(tensorpipe_uv STATIC ${_uv_sources_abs})
  if(BUILD_SHARED_LIBS)
    set_target_properties(tensorpipe_uv PROPERTIES POSITION_INDEPENDENT_CODE 1)
  endif()
  # Mirror uv_a's link libraries, include dirs, definitions and options onto
  # the duplicated target.
  get_target_property(_link_libs uv_a LINK_LIBRARIES)
  target_link_libraries(tensorpipe_uv PRIVATE ${_link_libs})
  get_target_property(_include_dirs uv_a INCLUDE_DIRECTORIES)
  target_include_directories(tensorpipe_uv PRIVATE ${_include_dirs})
  # FIX: this line previously read `target_include_directories(tensorpipe_uv
  # PUBLIC $)` — a bare `$` is not a valid argument; restore the
  # BUILD_INTERFACE generator expression exposing libuv's public headers to
  # consumers of the build tree.
  target_include_directories(tensorpipe_uv PUBLIC
                             $<BUILD_INTERFACE:${libuv_DIR}/include>)
  get_target_property(_compile_definitions uv_a COMPILE_DEFINITIONS)
  target_compile_definitions(tensorpipe_uv PRIVATE ${_compile_definitions})
  get_target_property(_compile_options uv_a COMPILE_OPTIONS)
  target_compile_options(tensorpipe_uv PRIVATE ${_compile_options})
  install(TARGETS tensorpipe_uv EXPORT TensorpipeTargets
          ARCHIVE DESTINATION ${TP_INSTALL_LIBDIR})
  add_library(uv::uv ALIAS tensorpipe_uv)
endif()

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(uv
  REQUIRED_VARS uv_VERSION
  VERSION_VAR uv_VERSION)

================================================ FILE: cmake/MiscCheck.cmake ================================================

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

include(CheckCXXSourceCompiles)
include(CMakePushCheckState)

# We use the [[nodiscard]] attribute, which GCC 5 complains about.
# Silence this warning if GCC 5 is used.
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
  if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 6)
    add_definitions("-Wno-attributes")
  endif()
endif()

================================================ FILE: cmake/Options.cmake ================================================

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
  set(LINUX ON)
else()
  set(LINUX OFF)
endif()

# Declares a cache option `name` for a backend that is only available when
# `condition` (a space-separated boolean expression) holds, defaulting it to
# whether the condition is satisfied on this system.
macro(TP_CONDITIONAL_BACKEND name docstring condition)
  # No clue why this monstrosity is needed. But cmake_dependent_option has it,
  # and the code doesn't seem to work without it.
  # NOTE(review): the REGEX REPLACE appears to turn the space-separated
  # condition string into a semicolon-separated CMake list so that if() can
  # evaluate it as an expression, mirroring what cmake_dependent_option does
  # — confirm against the CMakeDependentOption module.
  string(REGEX REPLACE " +" ";" TP_CONDITIONAL_BACKEND_CONDITION "${condition}")
  if(${TP_CONDITIONAL_BACKEND_CONDITION})
    set(TP_CONDITIONAL_BACKEND_CAN_ENABLE ON)
  else()
    set(TP_CONDITIONAL_BACKEND_CAN_ENABLE OFF)
  endif()
  # Expose the backend as a user-settable cache option, defaulting to the
  # auto-detected availability.
  set(${name} ${TP_CONDITIONAL_BACKEND_CAN_ENABLE} CACHE BOOL ${docstring})
  # Fail loudly if the user forced the backend ON but its prerequisites are
  # not met, rather than silently configuring a broken build.
  if(${name} AND NOT ${TP_CONDITIONAL_BACKEND_CAN_ENABLE})
    message(FATAL_ERROR "${name} was explicitly set, but that can't be honored")
  endif()
endmacro()

# Try to auto-detect the presence of some libraries in order to enable/disable
# the transports/channels that make use of them.

# TODO Add CUDA to this list, in order to fix the TODO below

# TODO: Default to ON if CUDA available.
option(TP_USE_CUDA "Enable support for CUDA tensors" OFF)

# Optional features
option(TP_BUILD_BENCHMARK "Build benchmarks" OFF)
option(TP_BUILD_MISC "Build misc tools" OFF)
option(TP_BUILD_PYTHON "Build python bindings" OFF)
option(TP_BUILD_TESTING "Build tests" OFF)

# Whether to build a static or shared library
if(BUILD_SHARED_LIBS)
  set(TP_STATIC_OR_SHARED SHARED CACHE STRING "")
else()
  set(TP_STATIC_OR_SHARED STATIC CACHE STRING "")
endif()
mark_as_advanced(TP_STATIC_OR_SHARED)

# Force to build libuv from the included submodule
option(TP_BUILD_LIBUV "Build libuv from source" OFF)

# Directories
include(GNUInstallDirs)
set(TP_INSTALL_LIBDIR ${CMAKE_INSTALL_LIBDIR} CACHE STRING
    "Directory in which to install libraries")
mark_as_advanced(TP_INSTALL_LIBDIR)
set(TP_INSTALL_INCLUDEDIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE STRING
    "Directory in which to install public headers")
mark_as_advanced(TP_INSTALL_INCLUDEDIR)

================================================ FILE: cmake/Sanitize.cmake ================================================

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# If SANITIZE is set (e.g. "address" or "thread"), enable the corresponding
# sanitizer for both compilation and linking.
if(SANITIZE)
  add_definitions("-fsanitize=${SANITIZE}")
  add_definitions("-fno-omit-frame-pointer")
  # add_definitions only affects compilation; the linker needs the flag too.
  set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=${SANITIZE}")
endif()

================================================ FILE: docs/cuda_gotchas.md ================================================

# CUDA gotchas

While implementing CUDA channels we hit some undocumented "quirks" which forced
us to adapt our original designs. We collect them here for future reference
(although this list may not be exhaustive). Please add more items whenever we
learn new things "the hard way". We’re mostly interested in unexpected
behaviors that could entail substantial design changes, although smaller
technical pitfalls are welcome too.
## Most functions initialize a context on the current device A lot of CUDA functions cause a CUDA context to be initialized for the "current" device (which is a thread-local variable managed by CUDA). This consumes on-device memory (plus it can cause deadlocks when combined with NCCL). By invoking CUDA functions without first explicitly setting the current device we risk accidentally initializing CUDA contexts on devices on which we weren’t supposed to (especially device 0, since it’s the "default"). In order to avoid this, a device guard should be used for *all* operations. They are very cheap, hence don’t be shy! At times it’s not clear which device should be used in such guard, for example during initialization, however we must only use devices that the user has explicitly provided, hence we may have to lazily delay initialization in those cases. ## Querying the device of a pointer can fail By choice, TensorPipe doesn’t ask users to provide the device index when they pass in a CUDA pointer, for simplicity, since it would be redundant as the device index can be extracted from the pointer. This "extraction" is thus the only CUDA operation for which we can’t possibly set up a device guard. This has proven to be a problem because, due to a bug in CUDA, the extraction would fail if the current device had been *explicitly* set to an invalid (uninitialized) device. (A default "unset" current device would work). This occurred often, because if we used a device guard when the current device was unset, its destructor would explicitly reset the current device to 0. Our investigation seemed to show that an unset current device in the CUDA runtime corresponded to a null current context in the CUDA driver, whereas an invalid current device corresponded to an invalid non-null context. Thus our workaround was to use the driver API directly and first reset its current context to null (in a sense, use a "reverse" device guard, which temporarily "unsets" the current device). 
## Releasing shared resources implicitly synchronizes Some CUDA operations perform an implicit device synchronization: they block the CPU thread until the GPU "catches up", that is, it waits for *all* previously-launched kernels for that device (on any stream) to complete. Such functions also cause later kernels (enqueued by another concurrent thread) to delay their launch on the device until the blocking function returns (we’ve occasionally been calling this a "kernel fence"). This is bad because it would mean that an internal TensorPipe operation can interfere with the user’s scheduling of kernels and thus degrade GPU utilization. The [CUDA programming guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#implicit-synchronization) mentions such a behavior (in section 3.2.6.5.4), however we’ve found out that the list of circumstances where this occurs is incomplete and incorrect. As a rule of thumb, we’ve seen this behavior happen mainly when *releasing* a resource shared among kernels (e.g., device memory, pinned host memory, IPC memory handles), as if CUDA wanted to ensure there were no kernels using this resource anymore before freeing it. A mental model could be to imagine that kernels acquire a shared lock to it, while freeing it needs a unique lock. The only solution to this limitation is to allocate a pool of these resources at the beginning and reuse them. ## Creating IPC events deadlocks Another CUDA bug we hit was that the creation of CUDA events with the interprocess flag would sometimes deadlock. [Here’s a (not so small) repro](https://gist.github.com/lw/f34836416e7674bbdda8b4925c2999f2). We couldn’t pin it down to a specific condition, or to a race with another call. NVIDIA confirmed the bug and supposedly fixed it in version 450 of the CUDA driver. Since we still need to support earlier versions, as a workaround we’re taking great care to create all our IPC events as early as possible (hoping to avoid whatever races) and reuse them. 
## Memory won’t be freed if there’s open IPC handles to it

Imagine that process B has received and opened an IPC handle to some device
memory allocated and owned by process A, and process A frees this memory
without B first closing its handle to it. The CUDA doc described this as
undefined behavior (hence we can’t complain), but in practice what we’ve
observed is that the memory will *not* be freed, that is, it will not be reused
for subsequent allocation requests, thus possibly causing OOMs. In a sense,
it’s as if that memory were "leaked". This is displayed rather confusingly in
`nvidia-smi`’s accounting: the memory appears as occupied in the device
statistics, but no process appears to be responsible for it.

## Cannot open same IPC handle more than once

There’s a limitation in older versions of CUDA where, if process A allocates
some memory, only *one* binding to it can be opened in process B using IPC
handles. Attempting to re-open the same handle a second time will fail. Note
that one cannot get multiple "different" handles for the same memory, as CUDA
always returns the same one. In practice it means that the user could pass some
memory for TensorPipe for which it has already manually created and shared a
handle, thus it’s unsafe for TensorPipe to also get and open a handle. We can
only safely do it for private memory that we’re managing ourselves. Also note
that this limitation was lifted in CUDA 11.1.

## The pointer for an opened IPC handle could be "offset" wrt the source pointer

The CUDA doc on this is clear albeit cryptic: given a pointer, CUDA returns the
IPC handle for its *allocation*. Hence if we allocate some memory at address p0
and ask for the IPC handle of address p1 = p0 + offset, we’ll get the IPC
handle for p0! This means that when we open the handle we need to add back that
offset. Luckily CUDA offers a function to query p0 given p1.
Note that this situation happens a lot in PyTorch due to the caching allocator sometimes returning slices from larger blocks. ## Not all pairs of GPUs can access each other’s memory Device to device (D2D) transfers are supported by CUDA only when peer-to-peer (P2P) capabilities exist between the two GPUs. This is handled transparently by CUDA, which will automatically select the most performant direct link. Concretely, it will use NVLink, but only if there’s a dedicated "cable" connecting those two devices. If the NVLink mesh is not a complete graph (as is often the case, e.g., hybrid-cube meshes (HCM) are very common), for the missing pairs CUDA will use PCIe transfers, but only if the two devices are attached to the same chipset/controller/host bridge. If there are multiple chipsets (which is also common, e.g., the DGX machines have two), then D2D transfers between some pairs of GPUs might just not be possible through CUDA! In principle this is easy enough to detect since CUDA offers a function for it (and `nvidia-smi topo` also displays it), however we can’t use it if the two devices aren’t both "visible" to the process (we’re referring to the `CUDA_VISIBLE_DEVICES` environment variable). For such cases the only option is to use the NVML library, which doesn’t honor that env var, but in turn adds the complexity of matching corresponding devices between CUDA and NVML (which is best done through their UUID). Moreover, additional complexity was required in TensorPipe to handle the case where some but not all pairs of GPUs between two processes supported P2P. ## Registering CUDA memory with IB is slow This is kinda known, but it’s better to repeat it: the registration and deregistration of memory with InfiniBand is considered a "setup" step, and is very slow, and should thus be avoided as much as possible during the "hot" data path, for example using a staging area or by caching these registrations. 
## Registering CUDA memory with IB requires an extra NVIDIA kernel module When we pass a pointer to InfiniBand for registration, InfiniBand needs to understand that this virtual address points to CUDA device memory and not to some CPU memory. For that it needs to be aware of CUDA, and it does so through so-called "peer memory client", which NVIDIA provides (through a separate kernel module) and registers with InfiniBand, and which is queried by InfiniBand before "falling back" to assuming the pointer points to CPU memory. This peer memory client feature is only available in Mellanox’s InfiniBand distribution (called OFED, OpenFabrics Enterprise Distribution), and not in vanilla upstream InfiniBand. On the client side (our side) luckily nothing changes in the API. ## Registering CUDA memory with IB occupies the PCIe window Each PCIe device has a handful of "memory windows" it exposes, through which the host or other devices can access and modify the device’s memory (both to issue commands and to send/retrieve data). These are called BARs (base address registers). In the case of NVIDIA GPUs the BAR that appears to map to the device’s main memory is BAR1. This is often sized much smaller than the memory itself (say, 256MB for a 16GB GPU), with the idea that it will just be used as a staging area. Also note that CUDA already reserves a few dozen MBs of that window. When registering CUDA device memory with InfiniBand, an additional mapping is created in that window (during the `ibv_reg_mr` call) and will thus fail if the window doesn’t have enough remaining space (e.g., if the buffer being registered is larger than the window). This means we can’t straightforwardly register the user-provided buffers. However, with the right combination of GPU and of CPU BIOS, the BAR1 can become as large as the GPU’s main memory itself, in which case this won’t be a problem anymore. 
## Registering CUDA memory with IB doesn’t leak it Contrary to IPC handles, freeing CUDA device memory while it’s still registered with InfiniBand does not appear to interfere with the deallocation, hence the memory will correctly become reusable. ## IB messages have a maximum size Each send/recv operation over InfiniBand can only handle up to a certain amount of data, usually at least 1GB, and will fail for larger amounts. This limit can be queried on the device, and chunking must be used for larger sizes. It appears that, at least on some NICs and with some drivers, there's also a "minimum size" of 32 bytes, with messages failing with odd errors for smaller sizes. It's still unclear whether it's a bug. ## GPUs need to be matched with the right IB NIC On some machine types there may be multiple GPUs and multiple InfiniBand devices and they need to be carefully matched. Using the same IB NIC for all GPUs will introduce a bottleneck while leaving all other NICs unused. Matching them up "randomly" means that the data paths over PCIe of different GPU-NIC pairs might cross each other (thus, again, causing a bottleneck), might traverse the host, or otherwise interfere. These machines are usually set up so that each GPU has one NIC that it’s "naturally" closest to, for example they share the same PCIe switch, thus we need a logic to be able to detect and implement this. ================================================ FILE: docs/development.md ================================================ # Development TensorPipe uses CMake for its build system. 
## Dependencies To build TensorPipe, you need: * C++14 compatible compiler (GCC >= 5.5 or Clang >= 6) ## Clone the repository Example: ``` shell git clone --recursive https://github.com/pytorch/tensorpipe ``` If you have updated an already cloned repository, make sure that the submodules are up to date: ``` shell git submodule sync git submodule update --init ``` It is imperative to check out the submodules before running CMake. Find the list of submodules and a description of what they're used for on [this page][third_party]. [third_party]: https://github.com/pytorch/tensorpipe/tree/main/third_party ## Using CMake Example: ``` shell mkdir build cd build cmake ../ -DCMAKE_BUILD_TYPE=Debug -DSANITIZE=thread make ``` You can specify CMake variables by passing them as arguments to the `cmake` command. Useful CMake variables: * `CMAKE_C_COMPILER` -- Define which C compiler to use. * `CMAKE_CXX_COMPILER` -- Define which C++ compiler to use. * `CMAKE_C_FLAGS` -- Additional flags for the C compiler. * `CMAKE_CXX_FLAGS` -- Additional flags for the C++ compiler. * `CMAKE_BUILD_TYPE` -- For example: `release`, `debug`. Useful TensorPipe specific variables: * `SANITIZE` -- configure the sanitizer to use (if any); for example: `address` or `thread`, to run with `asan` or `tsan`, respectively. ## Ninja To make CMake output something other than the default `Makefile`, see [`cmake-generators(7)`][cmake-generators]. We like to use the [Ninja][ninja] generator because it works well for incremental builds. On the command line, specify `-GNinja` to use it. [cmake-generators]: https://cmake.org/cmake/help/v3.4/manual/cmake-generators.7.html [ninja]: https://en.wikipedia.org/wiki/Ninja_(build_system) ================================================ FILE: docs/linux_support.md ================================================ This document is intended for developers and advanced users. 
It’s the kind of document that risks going out of date very quickly, hence take it with a grain of salt. In order to try to be as performant as possible, TensorPipe sometimes relies on new and advanced kernel features. This is causing issues to users who are building and/or running on old kernels. Hence, whenever we use such features, we should always “guard” them somehow, i.e., detect their availability at compile-time or (preferably) at runtime, and disable the backend or mark it non-viable. It is ok-ish for users with old kernels to not have access to all backends, as long as there’s always at least one backend they can use. ## Compile-time vs runtime, Linux vs glibc Unfortunately, both the kernel version used for building and the one used for running affect whether we can use a feature. This means that the availability of a function or flag during build doesn’t mean it will be supported at runtime (this is especially true for the official builds of PyTorch). On the other hand, it also means that even if the runtime kernel supports a feature, we may not be able to use it because we didn’t have access to a system header when building (e.g., to get a flag). While sometimes we can “polyfill” this information, it’s not always doable. An additional complication is added by the fact that we typically access syscalls through their glibc wrappers. First of all, this means we only get access to a syscall once glibc wraps it, which could happen years later. But it also means we link to a glibc symbol, and thus to a specific version of glibc’s shared object. With the kernel, using an unsupported feature results in a runtime error when first used, which we can catch; but with glibc we get a loader error due to missing symbols at startup, even if the user doesn’t use TensorPipe, even if we could “tolerate” these symbols’ absence. It is thus desirable at times to avoid the glibc wrappers. 
## Common tricks for how to guard/polyfill * Kernel flags are typically defined as preprocessor flags (i.e., `#define FOO`). This is stuff like `O_TMPFILE`, `MAP_SHARED_VALIDATE`, `PR_SET_PTRACER`, ... It’s easy to detect this in the code, with a `#ifdef FOO`, and since these flags are (usually?) constants, it’s also easy to define them ourselves. This “polyfill” allows us to build on an old kernel but still run on a new one. * For a new-ish syscall, we probably don’t want to use the glibc wrapper, for the problems described above, and because it’s hard to detect its availability (the best option is a CMake check whose result we inject as a preprocessor flag). An alternative is to invoke it through the generic `syscall` syscall, using the `SYS_foo` flags. This could bring a few issues on its own (especially for 32bit systems) but for now it hasn’t come to bite us. This way we skip glibc entirely, and simply end up getting ENOSYS if the runtime kernel doesn’t support the syscall. Those `SYS_foo` flags are defined by glibc, but it seems glibc defines them automatically for all the syscalls it “finds” in the kernel, and not just for the syscalls that glibc supports. Unfortunately we cannot “polyfill” the `SYS_foo` flags if we don’t find them, because they have different values on different architectures. ## What do others do? Since [Apr 2017](https://github.com/libuv/libuv/commit/4e6101388015c6d0879308d566f0a4b79edc0c13), libuv only supports Linux 2.6.32 (December 2009) and glibc 2.12 (May 2010). (This doesn’t mean earlier versions are necessarily broken, but that libuv reserves the right to break them). Libuv seems to be somewhat tied to the RedHat/CentOS releases, which are common and have a very long lifespan. It doesn’t make sense for us to support older versions than what libuv does, because if libuv decides to break them there’s nothing we can do. 
PyTorch tries to support the [manylinux2014 platform](https://www.python.org/dev/peps/pep-0599/) (defined by Python for use in PyPI/pip), which allows up to glibc 2.17 (December 2012). However, it’s not clear if we’re there yet, and the previous version is `manylinux2010` which comes with glibc 2.12. Hence a reasonable recommendation seems to be to draw the line at Linux 2.6.32 and glibc 2.12. However, people with older versions than those have already reported issues and asked for fixes, which we can probably consider on a case-by-case basis. ## Kernel features used by TensorPipe ### Linux 2.1.4 (October 1996) * The `getresuid` and `getresgid` syscalls. ### Linux 2.3.16 (September 1999) * The `/proc/sys/kernel/random/boot_id` file. See `random(4)`. No git hash as it predates the use of git by Linux https://github.com/torvalds/linux/blob/1da177e4c3f41524e886b7f1b8a0c1fc7321cac2/drivers/char/random.c#L1270-L1278 ### Linux 2.3.20 (October 1999) * The `PR_GET_DUMPABLE` flag for `prctl`. No git hash as it predates the use of git by Linux https://github.com/torvalds/linux/blob/1da177e4c3f41524e886b7f1b8a0c1fc7321cac2/include/linux/prctl.h#L10 ### Linux 2.6.26 (July 2008) * Version 3 of Linux capabilities. (Initial capability support, including the `capget` syscall, dates back to Linux 2.1.100, from May 1998). See `capget(2)`. https://github.com/torvalds/linux/commit/ca05a99a54db1db5bca72eccb5866d2a86f8517f ### Linux 3.2 (January 2012) * Cross-Memory Attach (i.e., the `process_vm_readv` syscall). See `process_vm_readv(2)`. https://github.com/torvalds/linux/commit/fcf634098c00dd9cd247447368495f0b79be12d1 ### Linux 3.4 (May 2012) * The YAMA security module, and thus the `/proc/sys/kernel/yama/ptrace_scope` file. This includes the `PR_SET_PTRACER` and the `PR_SET_PTRACER_ANY` flags for `prctl`. See `ptrace(2)`. 
https://github.com/torvalds/linux/commit/2d514487faf188938a4ee4fb3464eeecfbdcf8eb https://github.com/torvalds/linux/commit/bf06189e4d14641c0148bea16e9dd24943862215 ### Linux 3.8 (February 2013) * The `/proc/[pid]/ns/[ns]` files. Although that directory, and the `net` file therein, were already present in 3.0, the `pid` and `user` ones only arrived in 3.8 and, more importantly, the ability to identify a namespace by the inode number of those files came in 3.8 (when they stopped being hardlinks and became symlinks). See `proc(5)` and `namespaces(7)` and others. https://github.com/torvalds/linux/commit/6b4e306aa3dc94a0545eb9279475b1ab6209a31f https://github.com/torvalds/linux/commit/13b6f57623bc485e116344fe91fbcb29f149242b https://github.com/torvalds/linux/commit/57e8391d327609cbf12d843259c968b9e5c1838f https://github.com/torvalds/linux/commit/cde1975bc242f3e1072bde623ef378e547b73f91 https://github.com/torvalds/linux/commit/bf056bfa80596a5d14b26b17276a56a0dcb080e5 https://github.com/torvalds/linux/commit/98f842e675f96ffac96e6c50315790912b2812be ### Linux 3.11 (September 2013) * The `O_TMPFILE` flag for `open`. See `open(2)`. https://github.com/torvalds/linux/commit/60545d0d4610b02e55f65d141c95b18ccf855b6e ### Linux 3.17 (October 2014) * The `memfd_create` syscall. See `memfd_create(2)`. https://github.com/torvalds/linux/commit/9183df25fe7b194563db3fec6dc3202a5855839c ### Linux 4.11 (April 2017) * The `/sys/kernel/security/lsm` file in `securityfs` (a list of active Linux Security Modules). https://github.com/torvalds/linux/commit/d69dece5f5b6bc7a5e39d2b6136ddc69469331fe ### TODO * All that sysfs PCIe stuff done by CUDA GDR (e.g., resolving GPUs and NICs to PCIe paths, getting the BAR1 size, ...), plus checking the nv_mem_peer module ## Glibc features required by TensorPipe ### Glibc 2.2.5 (January 2002) * The `capget` function. ### Glibc 2.3.3 (December 2003) * The `dlinfo` function. 
(All of `dlopen`, `dlclose`, `dlsym` and `dlerror` were present since at least glibc 2.0). ### Glibc 2.12 (May 2010) * The `pthread_setname_np` function. ================================================ FILE: docs/shm.md ================================================ # The shm transport This document is an attempt to capture the design principles and inner working of the shm transport (see `tensorpipe/transport/shm`). Its performance makes it an efficient alternative to IP based transports for same-machine communication. At the core of a transport implementation lies a listener, a connection, and a context. Listeners accept connections. Contexts create listeners and can connect to remote listeners. ## Concepts ### Ring buffers Shared memory ring buffers are a core building block for the shm transport. They are implemented with split control and data sections. This means the data section can be fully aligned. The header section stores a read/write transaction flag and the head and tail offsets into the data section. Producers and consumers of the ring buffer use atomic instructions to mutate this header depending on their intent. ### File descriptors The header and data segments of a shared memory ring buffer are created as follows. First, a file is created in `/dev/shm` with the `O_TMPFILE` flag. This means that anything written to the resulting file is lost when the last file descriptor is closed, unless the file is given a name. Because we never give this file a name, the segment is automatically cleaned up when the last process that has its file descriptor terminates. Per above, creating a shared memory ring buffer yields 2 file descriptors, one for the header segment and one for the data segment. These file descriptors are shared over a Unix domain socket. ### The reactor This is a TensorPipe specific component. It uses a shared memory ring buffer to allow other processes to trigger functions. 
If a process wants another process to trigger a function, it registers this function with the reactor, and gets back a 32-bit token. Then, the file descriptors of the reactor's ring buffer, as well as the token, are sent to another process. The other process can now map the reactor ring buffer, and trigger the registered function by writing the token to the ring buffer. See [considerations](#considerations) below on why this was used. ### Unix domain sockets Coordination between process to bootstrap a connection that uses shared memory ring buffers is implemented using Unix domain sockets. The listening side of a connection binds and listens on an abstract socket address. A typical Unix domain socket "address" is a filesystem pathname. An abstract socket address, by contrast, is not visible on any filesystem. They exist in a single abstract socket namespace shared by all processes on the machine. Removing the filesystem dependency means two things: 1. (+) It is not necessary to purge stale Unix domain socket files. 2. (-) These sockets don't have permissions, so any process that has its name can connect. Read more about abstract domain sockets [here][1] and [here][2]. [1]: http://man7.org/linux/man-pages/man7/unix.7.html [2]: https://utcc.utoronto.ca/~cks/space/blog/linux/SocketAbstractNamespace Once processes have established a Unix domain socket, it is used to: 1. Pass the shared memory file descriptors to a peer process. 2. Signal peer termination (through eof on socket closure). 3. ... nothing else. All data moves through the ring buffers. **Note:** abstract socket addresses are a Linux specific feature. ## Bringing it together So, to establish one of these shared memory connections, we first listen on some unique abstract socket address. This address must be known to the process that wishes to connect. For a quick test we can use a pre-shared address. Otherwise, we can generate a UUID and share it with some out of band mechanism. 
The connecting process connects and the listening process accepts. We have now established a Unix domain socket and move on to the next step. Each process creates a new shared memory ring buffer specifically for this connection. We refer to this ring buffer as the _inbox_. We expect each process to be pinned to a specific NUMA node and perform the memory allocation in the same NUMA domain. The file descriptors of the inbox, the file descriptors of the reactor, and a token to trigger readability of the inbox, are shared over the socket. Each process receives file descriptors from their peer and initializes the corresponding ring buffers. The peer's inbox is referred to as the _outbox_. The token to trigger remote readability is referred to as the _outbox trigger_. The connection is now established! Writes are performed by writing directly into the outbox and triggering the outbox trigger. The trigger wakes up the peer's reactor and executes a function that notifies the connection of readability. Subsequently, the connection checks if there was a pending read operation, and processes it if so. When either process destructs the connection, or crashes, the original Unix domain socket is closed, which signals the peer process that it shouldn't expect more writes to its inbox and can destruct the connection as well. ## Considerations A single process may have multiple connections. Therefore, it may have multiple inbox ring buffers. One way to react to incoming writes is to simply check if there are any bytes to read. This requires checking all N inboxes for reads, which can become problematic if N gets large. To better solve this multiplexing problem we initially used an [`eventfd(2)`][eventfd] per inbox. This file descriptor was registered with the existing [`epoll(7)`][epoll] loop and would trigger the readability function when it became readable. To perform a write, the peer process would first write to the outbox and then write to the peer's eventfd. 
[eventfd]: http://man7.org/linux/man-pages/man2/eventfd.2.html [epoll]: http://man7.org/linux/man-pages/man7/epoll.7.html A simple ping/pong performance benchmark using this approach, with both processes pinned to the same NUMA node, showed a lower bound latency of ~12 microseconds. This seemed high for a pair of ring buffer writes, so we explored alternatives, and came up with the reactor approach. Now, the same benchmark runs with a lower bound latency of about ~1.7 microseconds, which is a 7x improvement over the `eventfd(2)`/`epoll(7)` approach. ================================================ FILE: docs/thread_model.md ================================================ # TensorPipe's thread model TensorPipe is spawning multiple threads internally. This is a design requirement as, for example, a single thread wouldn't manage to drive a modern network interface card (NIC) at capacity and saturate its bandwidth, even if it did nothing but write on the socket: multiple threads writing in parallel to multiple sockets are the only way to achieve that. Moreover, the possibility of spawning new threads when needed allows for a simpler architecture in the implementation of TensorPipe's modular approach to backends (transports and channels): if one of these backends needs to perform some heavy operation (a blocking syscall, an event loop, ...) it can launch a dedicated thread for it rather than having to schedule it on the user thread or on a shared thread pool, thus having to "fit" the operation into some framework. This heavy reliance on multi-threading poses of course challenges in coordination and robustness. This document aims to outline the patterns we've ended up adopting to have a structured and principled design around this. ## Callbacks TensorPipe uses callbacks to organize the control flow around asynchronous and deferred execution. 
While this may be an anti-pattern leading to so-called "spaghetti code" or "callback hell", we realized that it was the only approach that would yield the performance we need. Modern alternatives to callbacks (promises/futures, coroutines, ...) would have introduced an unacceptable overhead in some cases. Nearly all operations in TensorPipe are non-blocking and are performed asynchronously, in background, with their results notified through callbacks. This includes the creation of pipes and connections (the objects may still be performing initialization when they are given to the user and, although operations can be performed on them, these will be delayed until setup completes). And it also includes destruction, which means that internal resources may not be immediately freed when a user-facing object is deleted. The only synchronization point that allows the user to wait for such cleanup to finish is the context's `join` method. Some other methods that may occasionally wait are the ones that return a value, for example the ones to retrieve addresses. ## Shared pointers As soon as threads and callbacks enter the mix, race conditions start to pop up. Among the first ones, there's the problem of ownership: ideally we want a `unique_ptr`-style semantics, where each object has a clear owner who controls its lifetime. However, when this owner asks another thread to perform an operation on that object as part of a callback, that callback also (temporarily) needs access to the object. As there may be multiple operations with multiple callbacks at the same time, transferring ownership isn't an option, and sharing it is the only way to go. This however requires synchronization among the various users: if the "real" user had a `unique_ptr` and gave raw pointers to the callbacks, the real user may delete the object without the callbacks noticing or having any way to stop/delay it. This would then cause use-after-free errors. 
There must thus be a sort of "lock" that prevents the object from being deleted while someone is working on it, like a "semaphore" counting the users. It turns out a perfect tool for the job is `shared_ptr`. Acquiring a lock on the object corresponds to obtaining a `shared_ptr` instance, which increases the reference count. The object will only be deleted when its refcount reaches zero, which means all its users (the "real" ones and the callbacks) have stopped using the object. We have however solved a problem by creating an opposite one: a memory leak. Imagine an object (say, a pipe) that is the "real" owner of another one (say, a channel) from which it is expecting a callback, and that callback captures a `shared_ptr` to the first object in its closure. This is a reference cycle. It means that even if the "real" owner of the first object relinquishes its `shared_ptr`, the objects won't be destroyed until the callback fires (if ever). An easy solution to this is to have callbacks only keep a `shared_ptr` when they are running, not while they are waiting. Again, the standard library has the perfect tool for the job: the `weak_ptr`, which will keep the refcount unchanged but can be "locked" to obtain a real `shared_ptr` when needed (curious coincidence that the terminology aligns with ours). So, in short: the real owner of an object keeps a `shared_ptr` to it, it passes `weak_ptr`s to be stored in callbacks, and these are locked back to `shared_ptr`s just before running the callbacks. (If locking fails, the callback isn't run). ## Public objects vs private implementations It turns out that what we said above isn't always true: in some cases we may want a callback to keep the object alive until it has fired. 
This happens because some callbacks are one half of a "contract" regarding data ownership: throughout the API (at higher and lower levels), `read`, `write`, `send` and `recv` methods take some data (source or destination buffers), and by doing so the caller hands over control of the data to the object. The way for the object to yield ownership back to the caller is by invoking the callback. We must thus ensure that these callbacks are always called. However, we must also avoid calling them when we're not ready yet to give up access to the data. For a more concrete example, consider the user trying to destroy a pipe that has a pending write operation, while some other thread is simultaneously performing a memory copy as part of that write operation. If we invoke the write operation's callback before aborting the memory copy we're giving the user the right to deallocate the buffer, which may lead the other thread to segfault. Here is what needs to happen: when a user deletes a pipe, all its pending operations must be interrupted, which in turn also aborts the lower level operations; the pipe's callbacks, however, must not be fired and instead kept alive while waiting for the lower level operations to wrap up, and only then they can be triggered. This shows that a subset of the pipe, containing at least the callbacks, must survive the destruction of the whole pipe. In other words, the lifetime of the inner part must be detacheable from the one of the outer shell. In order to do so, most public objects are just thin wrappers around a single member field, which is just a pointer to an instance of a private "implementation" (abbreviated as impl), which is where everything happens. The impl is a `shared_ptr` so that its life cycle can be detached and extended with respect to the one of the public object. The callbacks that we must wait for in order to regain control of some resource also capture a `shared_ptr`. 
This way we can still get the "signal" from when the public object is deleted (and can start terminating pending operations) but we're also able to keep the impl around while waiting for the shutdown to complete. ## Locking Objects can be accessed and worked on from many threads, from all directions, above (user threads, higher up the stack) and below (low-level backend threads). To avoid race conditions on the internal state of these objects, we must have mutual exclusion between threads, using locks. While it may be possible to have separate fine-grained locks for different parts of some objects, in general it is safer and easier to have one mutex per object, and use it to lock all operations. That's easily said, but it just as easily leads to deadlocks, which in our experience come in two flavors: - When an object (holding its own lock) calls an "upward" callback which (inline/serially) tries to perform an operation on that same object, which tries to acquire the same lock. This is a perfectly legitimate behavior, since all of our callbacks are "one-shot", that is, they "burn out" after they fire and thus must be immediately rearmed. - When an object (holding its own lock) performs an operation on a lower level object, passing a callback to it, and this callback is called immediately (inline/serially) and tries to also acquire the lock of the first object. This typically happens when the lower level object is in an error state and can thus "shortcut" the operation and immediately trigger the callback instead of deferring it to a thread. Mitigations for these problems are possible but none is universal and they all have drawbacks. Examples are: - When calling upward callbacks, extract one from the object onto the stack, put the object in a consistent state, release its lock and then call the callback. This works but there's a racing risk which would cause callbacks to not be called in their intended order. - Have a dedicated thread from which to invoke callbacks. 
Therefore other threads, instead of triggering callbacks, push them to some queue that is consumed by this thread. This resembles the semi-future and executor pattern. We used to have such a pattern in place for calling the pipe callbacks but it was introducing an unacceptable latency overhead. - The backends already typically have a thread they can defer callbacks to, and for the most part they already do. However having such a thread isn't necessarily a requirement for a transport, and such threads may not be running at all times (e.g., once a backend has been joined). - We could replace regular locks with reentrant locks (also called recursive). This is typically considered bad practice, though, and when at some point we tried this we indeed hit problems. The next section presents a more disciplined way of dealing with races. ## Event loops A classic way of dealing with parallel I/O is event loops: repeatedly polling a set of file descriptors for readability/writability (blocking to wait for them to become ready), dealing with them, and repeating. Syscalls to do this are `select`, `epoll`, and more. The `libuv` library used by one of TensorPipe's transports is also based on an event loop. Event loops are typically single-threaded, and they allow to "simulate" parallelism by multiplexing thread if those threads would spend most of their time doing blocking I/O. The simplicity of event loops, their single-threaded safety and their established effectiveness prompted us to make them a foundation of our threading model. If an object already has a thread to which it offloads some operations (this is the case for most transports and some channels, but not the pipe) then we defer all operations to it. And we really mean all of them: all manipulation of the object (scheduling operations, querying information, running callbacks) must be done from within that event loop thread. 
All operations that are attempted on the object, either from another thread or from within the event loop thread (for example, by a callback in user code) are deferred, appended to a queue, and dealt with at a later iteration of the loop. This guarantees that we'll always have a single thread accessing such objects, thus ensuring thread safety without even using any locks. Note that such design isn't a requirement for transports, it's just the pattern that we've adopted for all our current transports. If, on the other hand, an object does not have access to a thread to use as an event loop, we'll "borrow" the caller's thread and temporarily use it as an event loop. We'll similarly have a queue of tasks, and the thread will consume them one by one, until none are left, at which point we'll stop occupying the thread and release it back to the caller. If any new operation is attempted by another thread while one of these temporary event loops is running, that operation is added to the queue and thus deferred to the already-running event loop, with the new thread immediately able to return to what it was doing. ================================================ FILE: setup.py ================================================ #!/usr/bin/env python3 # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
import os import subprocess import sys from pathlib import Path from setuptools import Extension, setup from setuptools.command.build_ext import build_ext class CMakeBuild(build_ext): def run(self): for ext in self.extensions: self.build_extension(ext) def build_extension(self, ext): if not os.path.exists(self.build_temp): os.makedirs(self.build_temp) source_path = Path(__file__).parent.resolve() output_path = Path(self.get_ext_fullpath(ext.name)).parent.resolve() build_type = "Debug" if self.debug else "Release" cmake_cmd = [ "cmake", f"{source_path}", f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={output_path}", f"-DPYTHON_EXECUTABLE={sys.executable}", f"-DCMAKE_BUILD_TYPE={build_type}", "-DCMAKE_C_COMPILER=clang-6.0", "-DCMAKE_CXX_COMPILER=clang++-6.0", "-DCMAKE_POSITION_INDEPENDENT_CODE=true", "-DTP_BUILD_PYTHON=true", ] for opt in os.environ: if opt.startswith("TP_"): cmake_cmd.append(f"-D{opt}={os.environ[opt]}") make_cmd = ["make", "-j", "pytensorpipe"] subprocess.check_call(cmake_cmd, cwd=self.build_temp) subprocess.check_call(make_cmd, cwd=self.build_temp) setup( name="tensorpipe", version="0.0.0", author="Facebook AI Research", ext_modules=[Extension("pytensorpipe", sources=[])], cmdclass={"build_ext": CMakeBuild}, zip_safe=False, ) ================================================ FILE: tensorpipe/.clang-format ================================================ --- AccessModifierOffset: -1 AlignAfterOpenBracket: AlwaysBreak AlignConsecutiveAssignments: false AlignConsecutiveDeclarations: false AlignEscapedNewlinesLeft: true AlignOperands: false AlignTrailingComments: false AllowAllParametersOfDeclarationOnNextLine: false AllowShortBlocksOnASingleLine: false AllowShortCaseLabelsOnASingleLine: false AllowShortFunctionsOnASingleLine: Empty AllowShortIfStatementsOnASingleLine: false AllowShortLoopsOnASingleLine: false AlwaysBreakAfterReturnType: None AlwaysBreakBeforeMultilineStrings: true AlwaysBreakTemplateDeclarations: true BinPackArguments: false BinPackParameters: 
false BraceWrapping: AfterClass: false AfterControlStatement: false AfterEnum: false AfterFunction: false AfterNamespace: false AfterObjCDeclaration: false AfterStruct: false AfterUnion: false BeforeCatch: false BeforeElse: false IndentBraces: false BreakBeforeBinaryOperators: None BreakBeforeBraces: Attach BreakBeforeTernaryOperators: true BreakConstructorInitializersBeforeComma: false BreakAfterJavaFieldAnnotations: false BreakStringLiterals: false ColumnLimit: 80 CommentPragmas: '^ IWYU pragma:' CompactNamespaces: false ConstructorInitializerAllOnOneLineOrOnePerLine: true ConstructorInitializerIndentWidth: 4 ContinuationIndentWidth: 4 Cpp11BracedListStyle: true DerivePointerAlignment: false DisableFormat: false ForEachMacros: [ FOR_EACH_RANGE, FOR_EACH, ] IncludeCategories: - Regex: '^<.*\.h(pp)?>' Priority: 1 - Regex: '^<.*' Priority: 2 - Regex: '.*' Priority: 3 IndentCaseLabels: true IndentWidth: 2 IndentWrappedFunctionNames: false KeepEmptyLinesAtTheStartOfBlocks: false MacroBlockBegin: '' MacroBlockEnd: '' MaxEmptyLinesToKeep: 1 NamespaceIndentation: None ObjCBlockIndentWidth: 2 ObjCSpaceAfterProperty: false ObjCSpaceBeforeProtocolList: false PenaltyBreakBeforeFirstCallParameter: 1 PenaltyBreakComment: 300 PenaltyBreakFirstLessLess: 120 PenaltyBreakString: 1000 PenaltyExcessCharacter: 1000000 PenaltyReturnTypeOnItsOwnLine: 2000000 PointerAlignment: Left ReflowComments: true SortIncludes: true SpaceAfterCStyleCast: false SpaceBeforeAssignmentOperators: true SpaceBeforeParens: ControlStatements SpaceInEmptyParentheses: false SpacesBeforeTrailingComments: 1 SpacesInAngles: false SpacesInContainerLiterals: true SpacesInCStyleCastParentheses: false SpacesInParentheses: false SpacesInSquareBrackets: false Standard: Cpp11 TabWidth: 8 UseTab: Never ... 
================================================ FILE: tensorpipe/.clang-tidy ================================================ --- InheritParentConfig: true Checks: ' readability-identifier-naming, readability-inconsistent-declaration-parameter-name, readability-named-parameter, ' FormatStyle: file CheckOptions: # Names of classes (and structs?) - { key: readability-identifier-naming.ClassCase, value: CamelCase } # Names of enums and enum classes - { key: readability-identifier-naming.EnumCase, value: CamelCase } # Names of members and methods of classes (and structs?) - { key: readability-identifier-naming.MemberCase, value: camelBack } - { key: readability-identifier-naming.PrivateMemberCase, value: camelBack } - { key: readability-identifier-naming.PrivateMemberSuffix, value: '_' } - { key: readability-identifier-naming.ProtectedMemberCase, value: camelBack } - { key: readability-identifier-naming.ProtectedMemberSuffix, value: '_' } - { key: readability-identifier-naming.MethodCase, value: camelBack } # Names of parameters and local variables - { key: readability-identifier-naming.LocalVariableCase, value: camelBack } - { key: readability-identifier-naming.ParameterCase, value: camelBack } # Names of constants - { key: readability-identifier-naming.GlobalConstantCase, value: CamelCase } - { key: readability-identifier-naming.GlobalConstantPrefix, value: 'k' } # FIXME scoped enums are only supported in clang-tidy 12. 
# Names of (non-class) enum members # - { key: readability-identifier-naming.EnumConstantCase, value: UPPER_CASE } # Names of enum class members # - { key: readability-identifier-naming.ScopedEnumConstantCase, value: CamelCase } # - { key: readability-identifier-naming.ScopedEnumConstantPrefix, value: 'k' } # Names of template parameters - { key: readability-identifier-naming.TemplateParameterCase, value: CamelCase } # Names of global functions - { key: readability-identifier-naming.FunctionCase, value: camelBack } # Names of namespaces - { key: readability-identifier-naming.NamespaceCase, value: lower_case } ... ================================================ FILE: tensorpipe/CMakeLists.txt ================================================ # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. # TP_SRCS is the list of source files that we need to build libtensorpipe. set(TP_SRCS) # TP_PUBLIC_HDRS is the list of public header files that we need to install. 
set(TP_PUBLIC_HDRS) # TP_LINK_LIBRARIES is the list of dependent libraries to be linked set(TP_LINK_LIBRARIES) # TP_INCLUDE_DIRS is the list of include paths to be used set(TP_INCLUDE_DIRS)
uv::uv) ### shm tp_conditional_backend( TP_ENABLE_SHM "Enable shared-memory transport" "LINUX") if(TP_ENABLE_SHM) list(APPEND TP_SRCS common/epoll_loop.cc common/shm_segment.cc transport/shm/connection_impl.cc transport/shm/context_impl.cc transport/shm/factory.cc transport/shm/listener_impl.cc transport/shm/reactor.cc transport/shm/sockaddr.cc) list(APPEND TP_PUBLIC_HDRS transport/shm/factory.h) set(TENSORPIPE_HAS_SHM_TRANSPORT 1) endif() ### ibv tp_conditional_backend( TP_ENABLE_IBV "Enable InfiniBand transport" "LINUX") if(TP_ENABLE_IBV) list(APPEND TP_SRCS common/epoll_loop.cc common/ibv.cc transport/ibv/connection_impl.cc transport/ibv/context_impl.cc transport/ibv/error.cc transport/ibv/factory.cc transport/ibv/listener_impl.cc transport/ibv/reactor.cc transport/ibv/sockaddr.cc transport/ibv/utility.cc) list(APPEND TP_PUBLIC_HDRS transport/ibv/error.h transport/ibv/factory.h transport/ibv/utility.h) set(TENSORPIPE_HAS_IBV_TRANSPORT 1) endif() ## MAC OS specific library deps if(APPLE) find_library(CF CoreFoundation) find_library(IOKIT IOKit) list(APPEND TP_LINK_LIBRARIES ${CF} ${IOKIT}) endif() ## Config configure_file(config.h.in config.h) ## Libnop # We should keep libnop headers private as they should not be exposed to downstream users, # but they're currently transitively included by tensorpipe/transport/connection.h (which # is still unclear whether it should be a public or private header). 
list(APPEND TP_INCLUDE_DIRS $) ## Target # Add the tensorpipe library target add_library(tensorpipe ${TP_STATIC_OR_SHARED} ${TP_SRCS}) # Set target properties if(BUILD_SHARED_LIBS) set_target_properties(tensorpipe PROPERTIES POSITION_INDEPENDENT_CODE 1) endif() # Add all the link libraries and include directories to the tensorpipe target and keeping the link PUBLIC target_link_libraries(tensorpipe PRIVATE ${TP_LINK_LIBRARIES}) target_include_directories(tensorpipe PUBLIC ${TP_INCLUDE_DIRS}) ## Install install(TARGETS tensorpipe EXPORT TensorpipeTargets LIBRARY DESTINATION ${TP_INSTALL_LIBDIR} ARCHIVE DESTINATION ${TP_INSTALL_LIBDIR}) foreach(_header_file ${TP_PUBLIC_HDRS}) get_filename_component(_TP_HEADER_SUBDIR "${_header_file}" DIRECTORY) install(FILES ${_header_file} DESTINATION ${TP_INSTALL_INCLUDEDIR}/tensorpipe/${_TP_HEADER_SUBDIR}) endforeach() install(FILES ${CMAKE_CURRENT_BINARY_DIR}/config.h DESTINATION ${TP_INSTALL_INCLUDEDIR}/tensorpipe) ## CUDA if(TP_USE_CUDA) # TP_SRCS is the list of source files that we need to build libtensorpipe. set(TP_CUDA_SRCS) # TP_PUBLIC_HDRS is the list of public header files that we need to install. 
set(TP_CUDA_PUBLIC_HDRS) # TP_LINK_LIBRARIES is list of dependent libraries to be linked set(TP_CUDA_LINK_LIBRARIES) # TP_INCLUDE_DIRS is list of include path to be used set(TP_CUDA_INCLUDE_DIRS) find_package(CUDA REQUIRED) list(APPEND TP_CUDA_LINK_LIBRARIES ${CUDA_LIBRARIES}) list(APPEND TP_CUDA_INCLUDE_DIRS ${CUDA_INCLUDE_DIRS}) list(APPEND TP_CUDA_SRCS common/cuda_buffer.cc) list(APPEND TP_CUDA_PUBLIC_HDRS tensorpipe_cuda.h common/cuda_buffer.h) ### cuda_xth list(APPEND TP_CUDA_SRCS channel/cuda_xth/channel_impl.cc channel/cuda_xth/context_impl.cc channel/cuda_xth/factory.cc) list(APPEND TP_CUDA_PUBLIC_HDRS channel/cuda_xth/factory.h) ### cuda_basic list(APPEND TP_CUDA_SRCS channel/cuda_basic/channel_impl.cc channel/cuda_basic/context_impl.cc channel/cuda_basic/factory.cc common/cuda_loop.cc) list(APPEND TP_CUDA_PUBLIC_HDRS channel/cuda_basic/factory.h) ### cuda_ipc tp_conditional_backend( TP_ENABLE_CUDA_IPC "Enable CUDA inter-process communication channel" "TP_USE_CUDA") if(TP_ENABLE_CUDA_IPC) list(APPEND TP_CUDA_SRCS channel/cuda_ipc/channel_impl.cc channel/cuda_ipc/context_impl.cc channel/cuda_ipc/factory.cc) list(APPEND TP_CUDA_PUBLIC_HDRS channel/cuda_ipc/factory.h) set(TENSORPIPE_HAS_CUDA_IPC_CHANNEL 1) endif() ### cuda_gdr tp_conditional_backend( TP_ENABLE_CUDA_GDR "Enable CUDA GpuDirect (InfiniBand) channel" "LINUX") if(TP_ENABLE_CUDA_GDR) list(APPEND TP_CUDA_SRCS common/ibv.cc channel/cuda_gdr/channel_impl.cc channel/cuda_gdr/context_impl.cc channel/cuda_gdr/factory.cc) list(APPEND TP_CUDA_PUBLIC_HDRS channel/cuda_gdr/error.h channel/cuda_gdr/factory.h) set(TENSORPIPE_HAS_CUDA_GDR_CHANNEL 1) endif() configure_file(config_cuda.h.in config_cuda.h) add_library(tensorpipe_cuda ${TP_STATIC_OR_SHARED} ${TP_CUDA_SRCS}) if(BUILD_SHARED_LIBS) set_target_properties(tensorpipe_cuda PROPERTIES POSITION_INDEPENDENT_CODE 1) endif() target_link_libraries(tensorpipe_cuda PUBLIC tensorpipe) target_link_libraries(tensorpipe_cuda PRIVATE ${TP_CUDA_LINK_LIBRARIES}) 
target_include_directories(tensorpipe_cuda PUBLIC ${TP_CUDA_INCLUDE_DIRS}) install(TARGETS tensorpipe_cuda EXPORT TensorpipeTargets LIBRARY DESTINATION ${TP_INSTALL_LIBDIR} ARCHIVE DESTINATION ${TP_INSTALL_LIBDIR}) foreach(_header_file ${TP_CUDA_PUBLIC_HDRS}) get_filename_component(_TP_HEADER_SUBDIR "${_header_file}" DIRECTORY) install(FILES ${_header_file} DESTINATION ${TP_INSTALL_INCLUDEDIR}/tensorpipe/${_TP_HEADER_SUBDIR}) endforeach() install(FILES ${CMAKE_CURRENT_BINARY_DIR}/config_cuda.h DESTINATION ${TP_INSTALL_INCLUDEDIR}/tensorpipe) endif() ## Python bindings if(TP_BUILD_PYTHON) add_subdirectory(python) endif() ## Benchmarks if (TP_BUILD_BENCHMARK) add_subdirectory(benchmark) endif() ## Misc tools if (TP_BUILD_MISC) add_subdirectory(misc) endif() ## Tests if(TP_BUILD_TESTING) add_subdirectory(test) endif() ================================================ FILE: tensorpipe/benchmark/CMakeLists.txt ================================================ # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. # TODO: Make those separate CMake projects. add_executable(benchmark_transport benchmark_transport.cc options.cc transport_registry.cc) target_link_libraries(benchmark_transport PRIVATE tensorpipe) add_executable(benchmark_pipe benchmark_pipe.cc options.cc transport_registry.cc channel_registry.cc) target_link_libraries(benchmark_pipe PRIVATE tensorpipe tensorpipe_cuda) ================================================ FILE: tensorpipe/benchmark/benchmark_pipe.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include // We might sometimes want to run this benchmark using NCCL instead of // TensorPipe. We don't want to include NCCL as a submodule and deal with the // build issues. So we've prepared the code and left it around, but disabled it. #if USE_NCCL #include #define TP_NCCL_CHECK(op) \ { \ ncclResult_t res = (op); \ TP_THROW_ASSERT_IF(res != ncclSuccess); \ } struct NcclCommDeleter { void operator()(ncclComm_t comm) { TP_NCCL_CHECK(ncclCommDestroy(comm)); } }; using NcclComm = std::unique_ptr, NcclCommDeleter>; static NcclComm createNcclComm(int rank, int worldSize, ncclUniqueId uniqueId) { ncclComm_t comm; TP_NCCL_CHECK(ncclCommInitRank(&comm, worldSize, uniqueId, rank)); return NcclComm(comm, NcclCommDeleter{}); } #endif // USE_NCCL using namespace tensorpipe; using namespace tensorpipe::benchmark; static constexpr int kNumWarmUpRounds = 5; using Payload = std::unique_ptr; using CpuTensor = std::unique_ptr; struct CudaMemoryDeleter { void operator()(void* ptr) { TP_CUDA_CHECK(cudaFree(ptr)); } }; struct CudaStreamDeleter { void operator()(cudaStream_t stream) { TP_CUDA_CHECK(cudaStreamDestroy(stream)); } }; using CudaTensor = std::unique_ptr; using CudaStream = std::unique_ptr, CudaStreamDeleter>; struct Data { size_t numPayloads; size_t payloadSize; std::vector expectedPayload; std::vector expectedPayloadMetadata; std::vector temporaryPayload; size_t numTensors; size_t tensorSize; TensorType tensorType; std::vector expectedCpuTensor; std::vector expectedCudaTensor; std::vector expectedTensorMetadata; std::vector temporaryCpuTensor; std::vector temporaryCudaTensor; CudaStream cudaStream; size_t cudaSyncPeriod; std::string expectedMetadata; #if USE_NCCL NcclComm ncclComm; #endif // USE_NCCL }; struct MultiDeviceMeasurements { // The CPU time to do each ping-pong. Measurements cpu; // The CPU time of N iterations, including a final CUDA stream sync. 
Measurements cuda; }; static void printMeasurements(Measurements& measurements, size_t dataLen) { measurements.sort(); fprintf( stderr, "%-15s %-15s %-12s %-7s %-7s %-7s %-7s\n", "chunk-size", "# ping-pong", "avg (usec)", "p50", "p75", "p90", "p95"); fprintf( stderr, "%-15lu %-15lu %-12.3f %-7.3f %-7.3f %-7.3f %-7.3f\n", dataLen, measurements.size(), measurements.sum().count() / (float)measurements.size() / 1000.0, measurements.percentile(0.50).count() / 1000.0, measurements.percentile(0.75).count() / 1000.0, measurements.percentile(0.90).count() / 1000.0, measurements.percentile(0.95).count() / 1000.0); } static void printMultiDeviceMeasurements( MultiDeviceMeasurements& measurements, size_t dataLen) { printMeasurements(measurements.cpu, dataLen); printMeasurements(measurements.cuda, dataLen); } static std::unique_ptr createEmptyCpuData(size_t size) { return std::make_unique(size); } static std::unique_ptr createFullCpuData(size_t size) { std::unique_ptr data = createEmptyCpuData(size); // Generate fixed data for validation between peers for (size_t i = 0; i < size; i++) { data[i] = (i >> 8) ^ (i & 0xff); } return data; } static CudaTensor createEmptyCudaData(size_t size) { uint8_t* ptr; TP_CUDA_CHECK(cudaMalloc(&ptr, size)); return CudaTensor(ptr); } static CudaTensor createFullCudaData(size_t size) { uint8_t* ptr; TP_CUDA_CHECK(cudaMalloc(&ptr, size)); CpuTensor data = createFullCpuData(size); TP_CUDA_CHECK(cudaMemcpy(ptr, data.get(), size, cudaMemcpyHostToDevice)); return CudaTensor(ptr); } static CudaStream createCudaStream() { cudaStream_t stream; TP_CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); return CudaStream(stream); } static void serverPongPingNonBlock( std::shared_ptr pipe, int& numWarmUps, int& numRoundTrips, std::promise& doneProm, Data& data, Measurements& measurements) { #if USE_NCCL for (int iterIdx = 0; iterIdx < numWarmUps + numRoundTrips; iterIdx++) { // TODO Handle multiple tensors. 
TP_NCCL_CHECK(ncclRecv( data.temporaryCudaTensor[0].get(), data.tensorSize, ncclInt8, 1, data.ncclComm.get(), data.cudaStream.get())); TP_NCCL_CHECK(ncclSend( data.temporaryCudaTensor[0].get(), data.tensorSize, ncclInt8, 1, data.ncclComm.get(), data.cudaStream.get())); } doneProm.set_value(); return; #endif // USE_NCCL pipe->readDescriptor( [pipe, &numWarmUps, &numRoundTrips, &doneProm, &data, &measurements]( const Error& error, Descriptor descriptor) { TP_THROW_ASSERT_IF(error) << error.what(); Allocation allocation; TP_DCHECK_EQ(descriptor.metadata, data.expectedMetadata); if (data.payloadSize > 0) { TP_DCHECK_EQ(descriptor.payloads.size(), data.numPayloads); allocation.payloads.resize(data.numPayloads); for (size_t payloadIdx = 0; payloadIdx < data.numPayloads; payloadIdx++) { TP_DCHECK_EQ( descriptor.payloads[payloadIdx].metadata, data.expectedPayloadMetadata[payloadIdx]); TP_DCHECK_EQ( descriptor.payloads[payloadIdx].length, data.payloadSize); allocation.payloads[payloadIdx].data = data.temporaryPayload[payloadIdx].get(); } } else { TP_DCHECK_EQ(descriptor.payloads.size(), 0); } if (data.tensorSize > 0) { TP_DCHECK_EQ(descriptor.tensors.size(), data.numTensors); allocation.tensors.resize(data.numTensors); for (size_t tensorIdx = 0; tensorIdx < data.numTensors; tensorIdx++) { TP_DCHECK_EQ( descriptor.tensors[tensorIdx].metadata, data.expectedTensorMetadata[tensorIdx]); TP_DCHECK_EQ(descriptor.tensors[tensorIdx].length, data.tensorSize); if (data.tensorType == TensorType::kCpu) { allocation.tensors[tensorIdx].buffer = CpuBuffer{ .ptr = data.temporaryCpuTensor[tensorIdx].get(), }; } else if (data.tensorType == TensorType::kCuda) { allocation.tensors[tensorIdx].buffer = CudaBuffer{ .ptr = data.temporaryCudaTensor[tensorIdx].get(), .stream = data.cudaStream.get(), }; } else { TP_THROW_ASSERT() << "Unknown tensor type"; } } } else { TP_DCHECK_EQ(descriptor.tensors.size(), 0); } pipe->read( allocation, [pipe, &numWarmUps, &numRoundTrips, &doneProm, &data, 
&measurements, descriptor{std::move(descriptor)}, allocation](const Error& error) { TP_THROW_ASSERT_IF(error) << error.what(); Message message; if (data.payloadSize > 0) { TP_DCHECK_EQ(allocation.payloads.size(), data.numPayloads); message.payloads.resize(data.numPayloads); for (size_t payloadIdx = 0; payloadIdx < data.numPayloads; payloadIdx++) { TP_DCHECK_EQ( descriptor.payloads[payloadIdx].length, data.payloadSize); TP_DCHECK_EQ( memcmp( allocation.payloads[payloadIdx].data, data.expectedPayload[payloadIdx].get(), descriptor.payloads[payloadIdx].length), 0); message.payloads[payloadIdx] = { .data = data.expectedPayload[payloadIdx].get(), .length = descriptor.payloads[payloadIdx].length, }; } } else { TP_DCHECK_EQ(allocation.payloads.size(), 0); } if (data.tensorSize > 0) { TP_DCHECK_EQ(allocation.tensors.size(), data.numTensors); message.tensors.resize(data.numTensors); for (size_t tensorIdx = 0; tensorIdx < data.numTensors; tensorIdx++) { TP_DCHECK_EQ( descriptor.tensors[tensorIdx].length, data.tensorSize); if (data.tensorType == TensorType::kCpu) { TP_DCHECK_EQ( memcmp( allocation.tensors[tensorIdx] .buffer.unwrap() .ptr, data.expectedCpuTensor[tensorIdx].get(), descriptor.tensors[tensorIdx].length), 0); } else if (data.tensorType == TensorType::kCuda) { // No (easy) way to do a memcmp with CUDA, I believe... 
} else { TP_THROW_ASSERT() << "Unknown tensor type"; } message.tensors[tensorIdx] = { .buffer = allocation.tensors[tensorIdx].buffer, .length = descriptor.tensors[tensorIdx].length, .targetDevice = descriptor.tensors[tensorIdx].sourceDevice, }; } } else { TP_DCHECK_EQ(allocation.tensors.size(), 0); } pipe->write( std::move(message), [pipe, &numWarmUps, &numRoundTrips, &doneProm, &data, &measurements](const Error& error) { TP_THROW_ASSERT_IF(error) << error.what(); if (numWarmUps > 0) { numWarmUps -= 1; } else { numRoundTrips -= 1; } if (numRoundTrips > 0) { serverPongPingNonBlock( pipe, numWarmUps, numRoundTrips, doneProm, data, measurements); } else { doneProm.set_value(); } }); }); }); } // Start with receiving ping static void runServer(const Options& options) { std::string addr = options.address; int numWarmUps = kNumWarmUpRounds; int numRoundTrips = options.numRoundTrips; Data data; data.numPayloads = options.numPayloads; data.payloadSize = options.payloadSize; for (size_t payloadIdx = 0; payloadIdx < options.numPayloads; payloadIdx++) { data.expectedPayload.push_back(createFullCpuData(options.payloadSize)); data.expectedPayloadMetadata.push_back( std::string(options.metadataSize, 0x42)); data.temporaryPayload.push_back(createEmptyCpuData(options.payloadSize)); } data.numTensors = options.numTensors; data.tensorSize = options.tensorSize; data.tensorType = options.tensorType; for (size_t tensorIdx = 0; tensorIdx < options.numTensors; tensorIdx++) { data.expectedTensorMetadata.push_back( std::string(options.metadataSize, 0x42)); if (options.tensorType == TensorType::kCpu) { data.expectedCpuTensor.push_back(createFullCpuData(options.tensorSize)); data.temporaryCpuTensor.push_back(createEmptyCpuData(options.tensorSize)); } else if (options.tensorType == TensorType::kCuda) { data.expectedCudaTensor.push_back(createFullCudaData(options.tensorSize)); data.temporaryCudaTensor.push_back( createEmptyCudaData(options.tensorSize)); data.cudaStream = createCudaStream(); } 
else { TP_THROW_ASSERT() << "Unknown tensor type"; } } data.cudaSyncPeriod = options.cudaSyncPeriod; data.expectedMetadata = std::string(options.metadataSize, 0x42); Measurements measurements; measurements.reserve(options.numRoundTrips); std::shared_ptr context = std::make_shared(); auto transportContext = TensorpipeTransportRegistry().create(options.transport); validateTransportContext(transportContext); context->registerTransport(0, options.transport, transportContext); auto channelContext = TensorpipeChannelRegistry().create(options.channel); validateChannelContext(channelContext); context->registerChannel(0, options.channel, channelContext); std::promise> pipeProm; std::shared_ptr listener = context->listen({addr}); listener->accept([&](const Error& error, std::shared_ptr pipe) { TP_THROW_ASSERT_IF(error) << error.what(); pipeProm.set_value(std::move(pipe)); }); std::shared_ptr pipe = pipeProm.get_future().get(); #if USE_NCCL std::promise uniqueIdProm; pipe->readDescriptor([&](const Error& error, Descriptor descriptor) { TP_THROW_ASSERT_IF(error) << error.what(); uniqueIdProm.set_value( *reinterpret_cast(descriptor.metadata.c_str())); }); ncclUniqueId uniqueId = uniqueIdProm.get_future().get(); data.ncclComm = createNcclComm(/*rank=*/0, /*worldSize=*/2, uniqueId); #endif std::promise doneProm; serverPongPingNonBlock( std::move(pipe), numWarmUps, numRoundTrips, doneProm, data, measurements); doneProm.get_future().get(); listener.reset(); context->join(); } static void clientPingPongNonBlock( std::shared_ptr pipe, int& numWarmUps, int& numRoundTrips, std::promise& doneProm, Data& data, MultiDeviceMeasurements& measurements) { #if USE_NCCL for (int iterIdx = 0; iterIdx < numWarmUps + numRoundTrips; iterIdx++) { if (iterIdx >= numWarmUps) { measurements.cpu.markStart(); if ((iterIdx - numWarmUps) % data.cudaSyncPeriod == 0) { measurements.cuda.markStart(); } } TP_NCCL_CHECK(ncclSend( data.expectedCudaTensor[0].get(), data.tensorSize, ncclInt8, 0, 
data.ncclComm.get(), data.cudaStream.get())); TP_NCCL_CHECK(ncclRecv( data.temporaryCudaTensor[0].get(), data.tensorSize, ncclInt8, 0, data.ncclComm.get(), data.cudaStream.get())); if (iterIdx >= numWarmUps) { measurements.cpu.markStop(); if ((iterIdx - numWarmUps + 1) % data.cudaSyncPeriod == 0) { TP_CUDA_CHECK(cudaStreamSynchronize(data.cudaStream.get())); measurements.cuda.markStop(data.cudaSyncPeriod); } } } printMultiDeviceMeasurements(measurements, data.payloadSize); doneProm.set_value(); return; #endif // USE_NCCL if (numWarmUps == 0) { measurements.cpu.markStart(); if (numRoundTrips % data.cudaSyncPeriod == 0) { measurements.cuda.markStart(); } } Message message; message.metadata = data.expectedMetadata; if (data.payloadSize > 0) { for (size_t payloadIdx = 0; payloadIdx < data.numPayloads; payloadIdx++) { Message::Payload payload; payload.data = data.expectedPayload[payloadIdx].get(); payload.length = data.payloadSize; message.payloads.push_back(std::move(payload)); } } else { TP_DCHECK_EQ(message.payloads.size(), 0); } if (data.tensorSize > 0) { for (size_t tensorIdx = 0; tensorIdx < data.numTensors; tensorIdx++) { Message::Tensor tensor; tensor.length = data.tensorSize; if (data.tensorType == TensorType::kCpu) { tensor.buffer = CpuBuffer{.ptr = data.expectedCpuTensor[tensorIdx].get()}; tensor.targetDevice = Device(kCpuDeviceType, 0); } else if (data.tensorType == TensorType::kCuda) { tensor.buffer = CudaBuffer{ .ptr = data.expectedCudaTensor[tensorIdx].get(), .stream = data.cudaStream.get(), }; tensor.targetDevice = Device(kCudaDeviceType, 0); } else { TP_THROW_ASSERT() << "Unknown tensor type"; } message.tensors.push_back(std::move(tensor)); } } else { TP_DCHECK_EQ(message.tensors.size(), 0); } pipe->write( std::move(message), [pipe, &numWarmUps, &numRoundTrips, &doneProm, &data, &measurements]( const Error& error) { TP_THROW_ASSERT_IF(error) << error.what(); pipe->readDescriptor([pipe, &numWarmUps, &numRoundTrips, &doneProm, &data, &measurements]( const 
Error& error, Descriptor descriptor) { TP_THROW_ASSERT_IF(error) << error.what(); Allocation allocation; TP_DCHECK_EQ(descriptor.metadata, data.expectedMetadata); if (data.payloadSize > 0) { TP_DCHECK_EQ(descriptor.payloads.size(), data.numPayloads); allocation.payloads.resize(data.numPayloads); for (size_t payloadIdx = 0; payloadIdx < data.numPayloads; payloadIdx++) { TP_DCHECK_EQ( descriptor.payloads[payloadIdx].metadata, data.expectedPayloadMetadata[payloadIdx]); TP_DCHECK_EQ( descriptor.payloads[payloadIdx].length, data.payloadSize); allocation.payloads[payloadIdx].data = data.temporaryPayload[payloadIdx].get(); } } else { TP_DCHECK_EQ(descriptor.payloads.size(), 0); } if (data.tensorSize > 0) { TP_DCHECK_EQ(descriptor.tensors.size(), data.numTensors); allocation.tensors.resize(data.numTensors); for (size_t tensorIdx = 0; tensorIdx < data.numTensors; tensorIdx++) { TP_DCHECK_EQ( descriptor.tensors[tensorIdx].metadata, data.expectedTensorMetadata[tensorIdx]); TP_DCHECK_EQ( descriptor.tensors[tensorIdx].length, data.tensorSize); if (data.tensorType == TensorType::kCpu) { allocation.tensors[tensorIdx].buffer = CpuBuffer{ .ptr = data.temporaryCpuTensor[tensorIdx].get(), }; } else if (data.tensorType == TensorType::kCuda) { allocation.tensors[tensorIdx].buffer = CudaBuffer{ .ptr = data.temporaryCudaTensor[tensorIdx].get(), .stream = data.cudaStream.get(), }; } else { TP_THROW_ASSERT() << "Unknown tensor type"; } } } else { TP_DCHECK_EQ(descriptor.tensors.size(), 0); } pipe->read( allocation, [pipe, &numWarmUps, &numRoundTrips, &doneProm, &data, &measurements, descriptor{std::move(descriptor)}, allocation](const Error& error) { if (numWarmUps == 0) { measurements.cpu.markStop(); if ((numRoundTrips - 1) % data.cudaSyncPeriod == 0) { TP_CUDA_CHECK(cudaStreamSynchronize(data.cudaStream.get())); measurements.cuda.markStop(data.cudaSyncPeriod); } } TP_THROW_ASSERT_IF(error) << error.what(); if (data.payloadSize > 0) { TP_DCHECK_EQ(allocation.payloads.size(), 
data.numPayloads); for (size_t payloadIdx = 0; payloadIdx < data.numPayloads; payloadIdx++) { TP_DCHECK_EQ( memcmp( allocation.payloads[payloadIdx].data, data.expectedPayload[payloadIdx].get(), descriptor.payloads[payloadIdx].length), 0); } } else { TP_DCHECK_EQ(allocation.payloads.size(), 0); } if (data.tensorSize > 0) { TP_DCHECK_EQ(allocation.tensors.size(), data.numTensors); for (size_t tensorIdx = 0; tensorIdx < data.numTensors; tensorIdx++) { if (data.tensorType == TensorType::kCpu) { TP_DCHECK_EQ( memcmp( allocation.tensors[tensorIdx] .buffer.unwrap() .ptr, data.expectedCpuTensor[tensorIdx].get(), descriptor.tensors[tensorIdx].length), 0); } else if (data.tensorType == TensorType::kCuda) { // No (easy) way to do a memcmp with CUDA, I // believe... } else { TP_THROW_ASSERT() << "Unknown tensor type"; } } } else { TP_DCHECK_EQ(allocation.tensors.size(), 0); } if (numWarmUps > 0) { numWarmUps -= 1; } else { numRoundTrips -= 1; } if (numRoundTrips > 0) { clientPingPongNonBlock( pipe, numWarmUps, numRoundTrips, doneProm, data, measurements); } else { printMultiDeviceMeasurements(measurements, data.payloadSize); doneProm.set_value(); } }); }); }); } // Start with sending ping static void runClient(const Options& options) { std::string addr = options.address; int numWarmUps = kNumWarmUpRounds; int numRoundTrips = options.numRoundTrips; Data data; data.numPayloads = options.numPayloads; data.payloadSize = options.payloadSize; for (size_t payloadIdx = 0; payloadIdx < options.numPayloads; payloadIdx++) { data.expectedPayload.push_back(createFullCpuData(options.payloadSize)); data.expectedPayloadMetadata.push_back( std::string(options.metadataSize, 0x42)); data.temporaryPayload.push_back(createEmptyCpuData(options.payloadSize)); } data.numTensors = options.numTensors; data.tensorSize = options.tensorSize; data.tensorType = options.tensorType; for (size_t tensorIdx = 0; tensorIdx < options.numTensors; tensorIdx++) { data.expectedTensorMetadata.push_back( 
std::string(options.metadataSize, 0x42)); if (data.tensorType == TensorType::kCpu) { data.expectedCpuTensor.push_back(createFullCpuData(options.tensorSize)); data.temporaryCpuTensor.push_back(createEmptyCpuData(options.tensorSize)); } else if (data.tensorType == TensorType::kCuda) { data.expectedCudaTensor.push_back(createFullCudaData(options.tensorSize)); data.temporaryCudaTensor.push_back( createEmptyCudaData(options.tensorSize)); data.cudaStream = createCudaStream(); } else { TP_THROW_ASSERT() << "Unknown tensor type"; } } data.cudaSyncPeriod = options.cudaSyncPeriod; data.expectedMetadata = std::string(options.metadataSize, 0x42); MultiDeviceMeasurements measurements; measurements.cpu.reserve(options.numRoundTrips); measurements.cuda.reserve(options.numRoundTrips / data.cudaSyncPeriod); std::shared_ptr context = std::make_shared(); auto transportContext = TensorpipeTransportRegistry().create(options.transport); validateTransportContext(transportContext); context->registerTransport(0, options.transport, transportContext); auto channelContext = TensorpipeChannelRegistry().create(options.channel); validateChannelContext(channelContext); context->registerChannel(0, options.channel, channelContext); std::shared_ptr pipe = context->connect(addr); #if USE_NCCL ncclUniqueId uniqueId; TP_NCCL_CHECK(ncclGetUniqueId(&uniqueId)); Message message; message.metadata = std::string( reinterpret_cast(&uniqueId), reinterpret_cast(&uniqueId) + sizeof(ncclUniqueId)); std::promise promise; pipe->write(std::move(message), [&](const Error& error) { TP_THROW_ASSERT_IF(error) << error.what(); promise.set_value(); }); promise.get_future().get(); data.ncclComm = createNcclComm(/*rank=*/1, /*worldSize=*/2, uniqueId); #endif // USE_NCCL std::promise doneProm; clientPingPongNonBlock( std::move(pipe), numWarmUps, numRoundTrips, doneProm, data, measurements); doneProm.get_future().get(); context->join(); } int main(int argc, char** argv) { struct Options x = parseOptions(argc, argv); std::cout 
<< "mode = " << x.mode << "\n"; std::cout << "transport = " << x.transport << "\n"; std::cout << "channel = " << x.channel << "\n"; std::cout << "address = " << x.address << "\n"; std::cout << "num_round_trips = " << x.numRoundTrips << "\n"; std::cout << "num_payloads = " << x.numPayloads << "\n"; std::cout << "payload_size = " << x.payloadSize << "\n"; std::cout << "num_tensors = " << x.numTensors << "\n"; std::cout << "tensor_size = " << x.tensorSize << "\n"; std::cout << "tensor_type = " << (x.tensorType == TensorType::kCpu ? "cpu" : "cuda") << "\n"; std::cout << "metadata_size = " << x.metadataSize << "\n"; if (x.mode == "listen") { runServer(x); } else if (x.mode == "connect") { runClient(x); } else { // Should never be here TP_THROW_ASSERT() << "unknown mode: " << x.mode; } return 0; } ================================================ FILE: tensorpipe/benchmark/benchmark_transport.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #include #include #include #include #include using namespace tensorpipe; using namespace tensorpipe::benchmark; using namespace tensorpipe::transport; struct Data { std::unique_ptr expected; std::unique_ptr temporary; size_t size; }; static void printMeasurements(Measurements& measurements, size_t dataLen) { measurements.sort(); fprintf( stderr, "%-15s %-15s %-12s %-7s %-7s %-7s %-7s\n", "chunk-size", "# ping-pong", "avg (usec)", "p50", "p75", "p90", "p95"); fprintf( stderr, "%-15lu %-15lu %-12.3f %-7.3f %-7.3f %-7.3f %-7.3f\n", dataLen, measurements.size(), measurements.sum().count() / (float)measurements.size() / 1000.0, measurements.percentile(0.50).count() / 1000.0, measurements.percentile(0.75).count() / 1000.0, measurements.percentile(0.90).count() / 1000.0, measurements.percentile(0.95).count() / 1000.0); } static std::unique_ptr createData(const int size) { auto data = std::make_unique(size); // Generate fixed data for validation between peers for (int i = 0; i < size; i++) { data[i] = (i >> 8) ^ (i & 0xff); } return data; } static void serverPongPingNonBlock( std::shared_ptr conn, int& numRoundTrips, std::promise& doneProm, Data& data, Measurements& measurements) { conn->read( data.temporary.get(), data.size, [conn, &numRoundTrips, &doneProm, &data, &measurements]( const Error& error, const void* ptr, size_t len) { TP_THROW_ASSERT_IF(error) << error.what(); TP_DCHECK_EQ(len, data.size); TP_DCHECK_EQ(memcmp(ptr, data.expected.get(), len), 0); conn->write( data.temporary.get(), data.size, [conn, &numRoundTrips, &doneProm, &data, &measurements]( const Error& error) { TP_THROW_ASSERT_IF(error) << error.what(); if (--numRoundTrips > 0) { serverPongPingNonBlock( conn, numRoundTrips, doneProm, data, measurements); } else { doneProm.set_value(); } }); }); } // Start with receiving ping static void runServer(const Options& options) { std::string addr = options.address; int numRoundTrips = options.numRoundTrips; Data data = { 
createData(options.payloadSize), std::make_unique(options.payloadSize), options.payloadSize}; Measurements measurements; measurements.reserve(options.numRoundTrips); std::shared_ptr context; context = TensorpipeTransportRegistry().create(options.transport); validateTransportContext(context); std::promise> connProm; std::shared_ptr listener = context->listen(addr); listener->accept([&](const Error& error, std::shared_ptr conn) { TP_THROW_ASSERT_IF(error) << error.what(); connProm.set_value(std::move(conn)); }); std::shared_ptr conn = connProm.get_future().get(); std::promise doneProm; serverPongPingNonBlock( std::move(conn), numRoundTrips, doneProm, data, measurements); doneProm.get_future().get(); context->join(); } static void clientPingPongNonBlock( std::shared_ptr conn, int& numRoundTrips, std::promise& doneProm, Data& data, Measurements& measurements) { measurements.markStart(); conn->write( data.expected.get(), data.size, [conn, &numRoundTrips, &doneProm, &data, &measurements]( const Error& error) { TP_THROW_ASSERT_IF(error) << error.what(); conn->read( data.temporary.get(), data.size, [conn, &numRoundTrips, &doneProm, &data, &measurements]( const Error& error, const void* ptr, size_t len) { measurements.markStop(); TP_THROW_ASSERT_IF(error) << error.what(); TP_DCHECK_EQ(len, data.size); TP_DCHECK_EQ(memcmp(ptr, data.expected.get(), len), 0); if (--numRoundTrips > 0) { clientPingPongNonBlock( conn, numRoundTrips, doneProm, data, measurements); } else { printMeasurements(measurements, data.size); doneProm.set_value(); } }); }); } // Start with sending ping static void runClient(const Options& options) { std::string addr = options.address; int numRoundTrips = options.numRoundTrips; Data data = { createData(options.payloadSize), std::make_unique(options.payloadSize), options.payloadSize}; Measurements measurements; measurements.reserve(options.numRoundTrips); std::shared_ptr context; context = TensorpipeTransportRegistry().create(options.transport); 
validateTransportContext(context); std::shared_ptr conn = context->connect(addr); std::promise doneProm; clientPingPongNonBlock( std::move(conn), numRoundTrips, doneProm, data, measurements); doneProm.get_future().get(); context->join(); } int main(int argc, char** argv) { struct Options x = parseOptions(argc, argv); std::cout << "mode = " << x.mode << "\n"; std::cout << "transport = " << x.transport << "\n"; std::cout << "address = " << x.address << "\n"; std::cout << "num_round_trips = " << x.numRoundTrips << "\n"; std::cout << "payload_size = " << x.payloadSize << "\n"; if (x.mode == "listen") { runServer(x); } else if (x.mode == "connect") { runClient(x); } else { // Should never be here TP_THROW_ASSERT() << "unknown mode: " << x.mode; } return 0; } ================================================ FILE: tensorpipe/benchmark/channel_registry.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include TP_DEFINE_SHARED_REGISTRY( TensorpipeChannelRegistry, tensorpipe::channel::Context); // BASIC std::shared_ptr makeBasicChannel() { return tensorpipe::channel::basic::create(); } TP_REGISTER_CREATOR(TensorpipeChannelRegistry, basic, makeBasicChannel); // CMA #if TENSORPIPE_HAS_CMA_CHANNEL std::shared_ptr makeCmaChannel() { return tensorpipe::channel::cma::create(); } TP_REGISTER_CREATOR(TensorpipeChannelRegistry, cma, makeCmaChannel); #endif // TENSORPIPE_HAS_CMA_CHANNEL // MPT std::shared_ptr makeMptChannel() { throw std::runtime_error("mtp channel requires arguments"); } TP_REGISTER_CREATOR(TensorpipeChannelRegistry, mpt, makeMptChannel); // XTH std::shared_ptr makeXthChannel() { return tensorpipe::channel::xth::create(); } TP_REGISTER_CREATOR(TensorpipeChannelRegistry, xth, makeXthChannel); // CUDA XTH std::shared_ptr makeCudaXthChannel() { return tensorpipe::channel::cuda_xth::create(); } TP_REGISTER_CREATOR(TensorpipeChannelRegistry, cuda_xth, makeCudaXthChannel); // CUDA BASIC std::shared_ptr makeCudaBasicChannel() { return tensorpipe::channel::cuda_basic::create( tensorpipe::channel::basic::create()); } TP_REGISTER_CREATOR( TensorpipeChannelRegistry, cuda_basic, makeCudaBasicChannel); // CUDA IPC #if TENSORPIPE_HAS_CUDA_IPC_CHANNEL std::shared_ptr makeCudaIpcChannel() { return tensorpipe::channel::cuda_ipc::create(); } TP_REGISTER_CREATOR(TensorpipeChannelRegistry, cuda_ipc, makeCudaIpcChannel); #endif // TENSORPIPE_HAS_CUDA_IPC_CHANNEL // CUDA GDR #if TENSORPIPE_HAS_CUDA_GDR_CHANNEL std::shared_ptr makeCudaGdrChannel() { return tensorpipe::channel::cuda_gdr::create(); } TP_REGISTER_CREATOR(TensorpipeChannelRegistry, cuda_gdr, makeCudaGdrChannel); #endif // TENSORPIPE_HAS_CUDA_GDR_CHANNEL void validateChannelContext( std::shared_ptr context) { if (!context) { auto keys = TensorpipeChannelRegistry().keys(); std::cout << "The channel you passed in is not supported. 
The following channels are valid: "; for (const auto& key : keys) { std::cout << key << ", "; } std::cout << "\n"; exit(EXIT_FAILURE); } } ================================================ FILE: tensorpipe/benchmark/channel_registry.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include TP_DECLARE_SHARED_REGISTRY( TensorpipeChannelRegistry, tensorpipe::channel::Context); void validateChannelContext( std::shared_ptr context); ================================================ FILE: tensorpipe/benchmark/measurements.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include #include namespace tensorpipe { namespace benchmark { class Measurements { using clock = std::chrono::high_resolution_clock; using nanoseconds = std::chrono::nanoseconds; public: void markStart() { start_ = clock::now(); } void markStop(size_t count = 1) { samples_.push_back((clock::now() - start_) / count); } void sort() { std::sort(samples_.begin(), samples_.end()); } void reserve(size_t capacity) { samples_.reserve(capacity); } size_t size() const { return samples_.size(); } nanoseconds sum() const { nanoseconds sum{0}; for (const auto& sample : samples_) { sum += sample; } return sum; } nanoseconds percentile(float f) const { return samples_[static_cast(f * samples_.size())]; } private: clock::time_point start_; std::vector samples_; }; } // namespace benchmark } // namespace tensorpipe ================================================ FILE: tensorpipe/benchmark/options.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #include namespace tensorpipe { namespace benchmark { static void usage(int status, const char* argv0) { if (status != EXIT_SUCCESS) { fprintf(stderr, "`%s --help' for more information.\n", argv0); exit(status); } fprintf(stderr, "Usage: %s [OPTIONS]\n", argv0); #define X(x) fputs(x "\n", stderr); X(""); X("--mode=MODE Running mode [listen|connect]"); X("--transport=TRANSPORT Transport backend [shm|uv]"); X("--channel=CHANNEL Channel backend [basic]"); X("--address=ADDRESS Address to listen or connect to"); X("--num-round-trips=NUM Number of write/read pairs to perform"); X("--num-payloads=NUM [optional] Number of payloads of each write/read pair"); X("--payload-size=SIZE [optional] Size of payload of each write/read pair"); X("--num-tensors=NUM [optional] Number of tensors of each write/read pair"); X("--tensor-size=SIZE [optional] Size of tensor of each write/read pair"); X("--tensor-type=TYPE [optional] Type of tensor (cpu or cuda)"); X("--metadata-size=SIZE [optional] Size of metadata of each write/read pair"); X("--cuda-sync-period=NUM [optiona] Number of round-trips between two stream syncs"); exit(status); } static void validateOptions(Options options, const char* argv0) { int status = EXIT_SUCCESS; if (options.mode.empty()) { fprintf(stderr, "Missing argument: --mode must be set\n"); status = EXIT_FAILURE; } if (options.transport.empty()) { fprintf(stderr, "Missing argument: --transport must be set\n"); status = EXIT_FAILURE; } if (options.address.empty()) { fprintf(stderr, "Missing argument: --address must be set\n"); status = EXIT_FAILURE; } if (options.numRoundTrips <= 0) { fprintf(stderr, "Missing argument: --num-round-trips must be set\n"); status = EXIT_FAILURE; } if (status != EXIT_SUCCESS) { usage(status, argv0); } } struct Options parseOptions(int argc, char** argv) { struct Options options; int opt; int flag = -1; enum Flags : int { MODE, TRANSPORT, CHANNEL, ADDRESS, NUM_ROUND_TRIPS, NUM_PAYLOADS, PAYLOAD_SIZE, 
NUM_TENSORS, TENSOR_SIZE, TENSOR_TYPE, METADATA_SIZE, CUDA_SYNC_PERIOD, HELP, }; static struct option longOptions[] = { {"mode", required_argument, &flag, MODE}, {"transport", required_argument, &flag, TRANSPORT}, {"channel", required_argument, &flag, CHANNEL}, {"address", required_argument, &flag, ADDRESS}, {"num-round-trips", required_argument, &flag, NUM_ROUND_TRIPS}, {"num-payloads", required_argument, &flag, NUM_PAYLOADS}, {"payload-size", required_argument, &flag, PAYLOAD_SIZE}, {"num-tensors", required_argument, &flag, NUM_TENSORS}, {"tensor-size", required_argument, &flag, TENSOR_SIZE}, {"tensor-type", required_argument, &flag, TENSOR_TYPE}, {"metadata-size", required_argument, &flag, METADATA_SIZE}, {"cuda-sync-period", required_argument, &flag, CUDA_SYNC_PERIOD}, {"help", no_argument, &flag, HELP}, {nullptr, 0, nullptr, 0}}; while (1) { opt = getopt_long(argc, argv, "", longOptions, nullptr); if (opt == -1) { break; } if (opt != 0) { usage(EXIT_FAILURE, argv[0]); break; } switch (flag) { case MODE: options.mode = std::string(optarg); if (options.mode != "listen" && options.mode != "connect") { fprintf(stderr, "Error:\n"); fprintf(stderr, " --mode must be [listen|connect]\n"); exit(EXIT_FAILURE); } break; case TRANSPORT: options.transport = std::string(optarg); break; case CHANNEL: options.channel = std::string(optarg); break; case ADDRESS: options.address = std::string(optarg); break; case NUM_ROUND_TRIPS: options.numRoundTrips = std::strtol(optarg, nullptr, 10); break; case NUM_PAYLOADS: options.numPayloads = std::strtoull(optarg, nullptr, 10); break; case PAYLOAD_SIZE: options.payloadSize = std::strtoull(optarg, nullptr, 10); break; case NUM_TENSORS: options.numTensors = std::strtoull(optarg, nullptr, 10); break; case TENSOR_SIZE: options.tensorSize = std::strtoull(optarg, nullptr, 10); break; case TENSOR_TYPE: if (std::string(optarg) == "cpu") { options.tensorType = TensorType::kCpu; } else if (std::string(optarg) == "cuda") { options.tensorType = 
TensorType::kCuda; } else { fprintf(stderr, "Error:\n"); fprintf(stderr, " --tensor-type must be [cpu|cuda]\n"); exit(EXIT_FAILURE); } break; case METADATA_SIZE: options.metadataSize = std::strtoull(optarg, nullptr, 10); break; case CUDA_SYNC_PERIOD: options.cudaSyncPeriod = std::strtoull(optarg, nullptr, 10); break; case HELP: usage(EXIT_SUCCESS, argv[0]); break; default: usage(EXIT_FAILURE, argv[0]); break; } } validateOptions(options, argv[0]); return options; } } // namespace benchmark } // namespace tensorpipe ================================================ FILE: tensorpipe/benchmark/options.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include namespace tensorpipe { namespace benchmark { enum class TensorType { kCpu, kCuda, }; struct Options { std::string mode; // server or client std::string transport; // shm or uv std::string channel; // basic std::string address; // address for listen or connect int numRoundTrips{0}; // number of write/read pairs size_t numPayloads{0}; size_t payloadSize{0}; size_t numTensors{0}; size_t tensorSize{0}; TensorType tensorType{TensorType::kCpu}; size_t metadataSize{0}; size_t cudaSyncPeriod{1}; }; struct Options parseOptions(int argc, char** argv); } // namespace benchmark } // namespace tensorpipe ================================================ FILE: tensorpipe/benchmark/registry.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ // NB: This Registry works poorly when you have other namespaces. 
/** * Simple registry implementation that uses static variables to * register object creators during program initialization time. This registry * implementation is largely borrowed from the PyTorch registry utility in file * pytorch/c10/util/Registry.h. */ #pragma once #include #include #include #include #include #include #include #include #include namespace tensorpipe { /** * @brief A template class that allows one to register classes by keys. * * The keys are usually a std::string specifying the name, but can be anything * that can be used in a std::map. * * You should most likely not use the Registry class explicitly, but use the * helper macros below to declare specific registries as well as registering * objects. */ template class Registry { public: typedef std::function Creator; Registry() : registry_() {} // Adds a key and its associated creator to the desired registry. If the key // already exists in the registry, we simply replace the old creator // with the new args for the key. void registerCreator(std::string key, Creator creator) { registry_[key] = creator; } // Allows you to register and key/Creator pair and provide a help_messge for // the key as well. void registerCreator( std::string key, Creator creator, const std::string& helpMsg) { registerCreator(key, creator); helpMessage_[key] = helpMsg; } // Returns whether a particular key exists in the given registry. inline bool has(std::string key) { return (registry_.count(key) != 0); } // Given the key, create() invokes the creator with the provided args and // returns the object that the creator function constructs. ObjectPtrType create(std::string key, Args... args) { if (registry_.count(key) == 0) { // Returns nullptr if the key is not registered. return nullptr; } return registry_[key](args...); } // Returns the registered keys as a std::vector. 
std::vector keys() const { std::vector keys; for (const auto& it : registry_) { keys.push_back(it.first); } return keys; } // Returns the help_message for the key if one is provided. inline const std::unordered_map& helpMessage() const { return helpMessage_; } const char* helpMessage(std::string key) const { auto it = helpMessage_.find(key); if (it == helpMessage_.end()) { return nullptr; } return it->second.c_str(); } private: std::unordered_map registry_; std::unordered_map helpMessage_; }; // Registerer is a class template that simplifies Register-ing keys for a given // registry. template class Registerer { public: explicit Registerer( std::string key, Registry& registry, typename Registry::Creator creator, const std::string& helpMsg = "") { registry.registerCreator(key, creator, helpMsg); } }; // The following macros should be used to create/add to registries. Avoid // invoking the Registry class template functions directly. #define TP_CONCATENATE_IMPL(s1, s2) s1##s2 #define TP_CONCATENATE(s1, s2) TP_CONCATENATE_IMPL(s1, s2) #define TP_ANONYMOUS_VARIABLE(str) TP_CONCATENATE(str, __LINE__) // Using the construct on first use idiom to avoid static order initialization // issue. Refer to this link for reference: // https://isocpp.org/wiki/faq/ctors#static-init-order-on-first-use #define TP_DEFINE_TYPED_REGISTRY(RegistryName, ObjectType, PtrType, ...) \ tensorpipe::Registry, ##__VA_ARGS__>& RegistryName() { \ static tensorpipe::Registry, ##__VA_ARGS__>* \ registry = \ new tensorpipe::Registry, ##__VA_ARGS__>(); \ return *registry; \ } #define TP_DECLARE_TYPED_REGISTRY(RegistryName, ObjectType, PtrType, ...) \ tensorpipe::Registry, ##__VA_ARGS__>& RegistryName(); \ typedef tensorpipe::Registerer, ##__VA_ARGS__> \ Registerer##RegistryName #define TP_DEFINE_SHARED_REGISTRY(RegistryName, ObjectType, ...) \ TP_DEFINE_TYPED_REGISTRY( \ RegistryName, ObjectType, std::shared_ptr, ##__VA_ARGS__) #define TP_DECLARE_SHARED_REGISTRY(RegistryName, ObjectType, ...) 
\ TP_DECLARE_TYPED_REGISTRY( \ RegistryName, ObjectType, std::shared_ptr, ##__VA_ARGS__) #define TP_REGISTER_TYPED_CREATOR(RegistryName, key, ...) \ static Registerer##RegistryName TP_ANONYMOUS_VARIABLE(g_##RegistryName)( \ key, RegistryName(), ##__VA_ARGS__); #define TP_REGISTER_CREATOR(RegistryName, key, ...) \ TP_REGISTER_TYPED_CREATOR(RegistryName, #key, __VA_ARGS__) } // namespace tensorpipe ================================================ FILE: tensorpipe/benchmark/transport_registry.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include TP_DEFINE_SHARED_REGISTRY( TensorpipeTransportRegistry, tensorpipe::transport::Context); // IBV #if TENSORPIPE_HAS_IBV_TRANSPORT std::shared_ptr makeIbvContext() { return tensorpipe::transport::ibv::create(); } TP_REGISTER_CREATOR(TensorpipeTransportRegistry, ibv, makeIbvContext); #endif // TENSORPIPE_HAS_IBV_TRANSPORT // SHM #if TENSORPIPE_HAS_SHM_TRANSPORT std::shared_ptr makeShmContext() { return tensorpipe::transport::shm::create(); } TP_REGISTER_CREATOR(TensorpipeTransportRegistry, shm, makeShmContext); #endif // TENSORPIPE_HAS_SHM_TRANSPORT // UV std::shared_ptr makeUvContext() { return tensorpipe::transport::uv::create(); } TP_REGISTER_CREATOR(TensorpipeTransportRegistry, uv, makeUvContext); void validateTransportContext( std::shared_ptr context) { if (!context) { auto keys = TensorpipeTransportRegistry().keys(); std::cout << "The transport you passed in is not supported. The following transports are valid: "; for (const auto& key : keys) { std::cout << key << ", "; } std::cout << "\n"; exit(EXIT_FAILURE); } } ================================================ FILE: tensorpipe/benchmark/transport_registry.h ================================================ /* * Copyright (c) Meta Platforms, Inc. 
and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include TP_DECLARE_SHARED_REGISTRY( TensorpipeTransportRegistry, tensorpipe::transport::Context); void validateTransportContext( std::shared_ptr context); ================================================ FILE: tensorpipe/channel/basic/channel_impl.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { namespace basic { ChannelImpl::ChannelImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::shared_ptr connection) : ChannelImplBoilerplate( token, std::move(context), std::move(id)), connection_(std::move(connection)) {} void ChannelImpl::initImplFromLoop() { context_->enroll(*this); } void ChannelImpl::sendImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TSendCallback callback) { SendOpIter opIter = sendOps_.emplaceBack(sequenceNumber); SendOperation& op = *opIter; op.ptr = buffer.unwrap().ptr; op.length = length; op.callback = std::move(callback); sendOps_.advanceOperation(opIter); } void ChannelImpl::advanceSendOperation( SendOpIter opIter, SendOperation::State prevOpState) { TP_DCHECK(context_->inLoop()); SendOperation& op = *opIter; sendOps_.attemptTransition( opIter, /*from=*/SendOperation::UNINITIALIZED, /*to=*/SendOperation::FINISHED, /*cond=*/error_ || op.length == 0, /*actions=*/{&ChannelImpl::callSendCallback}); // Needs to go after previous op to ensure predictable and consistent ordering // of write calls on the connection. 
sendOps_.attemptTransition( opIter, /*from=*/SendOperation::UNINITIALIZED, /*to=*/SendOperation::WRITING, /*cond=*/!error_ && prevOpState >= SendOperation::WRITING, /*actions=*/{&ChannelImpl::write}); sendOps_.attemptTransition( opIter, /*from=*/SendOperation::WRITING, /*to=*/SendOperation::FINISHED, /*cond=*/op.doneWriting, /*actions=*/{&ChannelImpl::callSendCallback}); } void ChannelImpl::write(SendOpIter opIter) { SendOperation& op = *opIter; TP_VLOG(6) << "Channel " << id_ << " is writing payload (#" << op.sequenceNumber << ")"; connection_->write( op.ptr, op.length, callbackWrapper_([opIter](ChannelImpl& impl) { TP_VLOG(6) << "Channel " << impl.id_ << " done writing payload (#" << opIter->sequenceNumber << ")"; opIter->doneWriting = true; impl.sendOps_.advanceOperation(opIter); })); } void ChannelImpl::callSendCallback(SendOpIter opIter) { SendOperation& op = *opIter; op.callback(error_); // Reset callback to release the resources it was holding. op.callback = nullptr; } void ChannelImpl::recvImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TRecvCallback callback) { RecvOpIter opIter = recvOps_.emplaceBack(sequenceNumber); RecvOperation& op = *opIter; op.ptr = buffer.unwrap().ptr; op.length = length; op.callback = std::move(callback); recvOps_.advanceOperation(opIter); } void ChannelImpl::advanceRecvOperation( RecvOpIter opIter, RecvOperation::State prevOpState) { TP_DCHECK(context_->inLoop()); RecvOperation& op = *opIter; recvOps_.attemptTransition( opIter, /*from=*/RecvOperation::UNINITIALIZED, /*to=*/RecvOperation::FINISHED, /*cond=*/error_ || op.length == 0, /*actions=*/{&ChannelImpl::callRecvCallback}); // Needs to go after previous op to ensure predictable and consistent ordering // of read calls on the connection. 
recvOps_.attemptTransition( opIter, /*from=*/RecvOperation::UNINITIALIZED, /*to=*/RecvOperation::READING, /*cond=*/!error_ && prevOpState >= RecvOperation::READING, /*actions=*/{&ChannelImpl::read}); recvOps_.attemptTransition( opIter, /*from=*/RecvOperation::READING, /*to=*/RecvOperation::FINISHED, /*cond=*/op.doneReading, /*actions=*/{&ChannelImpl::callRecvCallback}); } void ChannelImpl::read(RecvOpIter opIter) { RecvOperation& op = *opIter; TP_VLOG(6) << "Channel " << id_ << " is reading payload (#" << op.sequenceNumber << ")"; connection_->read( op.ptr, op.length, callbackWrapper_([opIter]( ChannelImpl& impl, const void* /* unused */, size_t /* unused */) { TP_VLOG(6) << "Channel " << impl.id_ << " done reading payload (#" << opIter->sequenceNumber << ")"; opIter->doneReading = true; impl.recvOps_.advanceOperation(opIter); })); } void ChannelImpl::callRecvCallback(RecvOpIter opIter) { RecvOperation& op = *opIter; op.callback(error_); // Reset callback to release the resources it was holding. op.callback = nullptr; } void ChannelImpl::handleErrorImpl() { sendOps_.advanceAllOperations(); recvOps_.advanceAllOperations(); // Close the connection so that all current operations will be aborted. This // will cause their callbacks to be invoked, and only then we'll invoke ours. connection_->close(); context_->unenroll(*this); } } // namespace basic } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/basic/channel_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include #include #include #include namespace tensorpipe { namespace channel { namespace basic { class ContextImpl; struct SendOperation { enum State { UNINITIALIZED, WRITING, FINISHED }; // Fields used by the state machine uint64_t sequenceNumber{0}; State state{UNINITIALIZED}; // Progress flags bool doneWriting{false}; // Arguments at creation const void* ptr; size_t length; TSendCallback callback; }; // State capturing a single recv operation. struct RecvOperation { enum State { UNINITIALIZED, READING, FINISHED }; // Fields used by the state machine uint64_t sequenceNumber{0}; State state{UNINITIALIZED}; // Progress flags bool doneReading{false}; // Arguments at creation void* ptr; size_t length; TRecvCallback callback; }; class ChannelImpl final : public ChannelImplBoilerplate { public: ChannelImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::shared_ptr connection); protected: // Implement the entry points called by ChannelImplBoilerplate. void initImplFromLoop() override; void sendImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TSendCallback callback) override; void recvImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TRecvCallback callback) override; void handleErrorImpl() override; private: const std::shared_ptr connection_; OpsStateMachine sendOps_{ *this, &ChannelImpl::advanceSendOperation}; using SendOpIter = decltype(sendOps_)::Iter; OpsStateMachine recvOps_{ *this, &ChannelImpl::advanceRecvOperation}; using RecvOpIter = decltype(recvOps_)::Iter; // State machines for send and recv ops. void advanceSendOperation( SendOpIter opIter, SendOperation::State prevOpState); void advanceRecvOperation( RecvOpIter opIter, RecvOperation::State prevOpState); // Actions (i.e., methods that begin a state transition). 
// For send operations: void write(SendOpIter opIter); void callSendCallback(SendOpIter opIter); // For recv operations: void read(RecvOpIter opIter); void callRecvCallback(RecvOpIter opIter); }; } // namespace basic } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/basic/context_impl.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include namespace tensorpipe { namespace channel { namespace basic { std::shared_ptr ContextImpl::create() { std::unordered_map deviceDescriptors = { {Device{kCpuDeviceType, 0}, "any"}}; return std::make_shared(std::move(deviceDescriptors)); } ContextImpl::ContextImpl( std::unordered_map deviceDescriptors) : ContextImplBoilerplate( std::move(deviceDescriptors)) {} std::shared_ptr ContextImpl::createChannel( std::vector> connections, Endpoint /* unused */) { TP_DCHECK_EQ(numConnectionsNeeded(), connections.size()); return createChannelInternal(std::move(connections[0])); } void ContextImpl::handleErrorImpl() {} void ContextImpl::joinImpl() {} bool ContextImpl::inLoop() const { return loop_.inLoop(); }; void ContextImpl::deferToLoop(std::function fn) { loop_.deferToLoop(std::move(fn)); }; } // namespace basic } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/basic/context_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include #include #include namespace tensorpipe { namespace channel { namespace basic { class ChannelImpl; class ContextImpl final : public ContextImplBoilerplate { public: static std::shared_ptr create(); explicit ContextImpl( std::unordered_map deviceDescriptors); std::shared_ptr createChannel( std::vector> connections, Endpoint endpoint); // Implement the DeferredExecutor interface. bool inLoop() const override; void deferToLoop(std::function fn) override; protected: // Implement the entry points called by ContextImplBoilerplate. void handleErrorImpl() override; void joinImpl() override; private: OnDemandDeferredExecutor loop_; }; } // namespace basic } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/basic/factory.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include namespace tensorpipe { namespace channel { namespace basic { std::shared_ptr create() { return std::make_shared>(); } } // namespace basic } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/basic/factory.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include namespace tensorpipe { namespace channel { namespace basic { std::shared_ptr create(); } // namespace basic } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/channel.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include // Channels are an out of band mechanism to transfer data between // processes. Examples include a direct address space to address space // memory copy on the same machine, or a GPU-to-GPU memory copy. // // Construction of a channel happens as follows. // // 1) During initialization of a pipe, the connecting peer sends its // list of channel contexts and their device descriptors. The // device descriptor is used to determine whether or not a // channel can be used by a pair of peers. // 2) The listening side of the pipe compares the list it received // its own list to determine the list of channels that should be used // for the peers. // 3) For every channel that should be constructed, the listening // side registers a slot with its low level listener. These slots // uniquely identify inbound connections on this listener (by // sending a word-sized indentifier immediately after connecting) // and can be used to construct new connections. These slots are // sent to the connecting side of the pipe, which then attempts // to establish a new connection for every token. // 4) At this time, we have a new control connection for every // channel that is about to be constructed. Both sides of the // pipe can now create the channel instance using the newly // created connection. Further initialization that needs to // happen is defered to the channel implementation. 
We assume the // channel is usable from the moment it is constructed. // namespace tensorpipe { namespace channel { using TSendCallback = std::function; using TRecvCallback = std::function; // Abstract base class for channel classes. class Channel { public: // Send memory region to peer. virtual void send(Buffer buffer, size_t length, TSendCallback callback) = 0; // Receive memory region from peer. virtual void recv(Buffer buffer, size_t length, TRecvCallback callback) = 0; // Tell the channel what its identifier is. // // This is only supposed to be called from the high-level pipe. It will only // used for logging and debugging purposes. virtual void setId(std::string id) = 0; // Put the channel in a terminal state, aborting pending operations and // rejecting future ones, and release its resources. This may be carried out // asynchronously, in background. virtual void close() = 0; virtual ~Channel() = default; }; } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/channel_boilerplate.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { template class ChannelBoilerplate : public Channel { public: template ChannelBoilerplate( typename ChannelImplBoilerplate::ConstructorToken token, std::shared_ptr context, std::string id, Args&&... args); explicit ChannelBoilerplate(std::shared_ptr channel); ChannelBoilerplate(const ChannelBoilerplate&) = delete; ChannelBoilerplate(ChannelBoilerplate&&) = delete; ChannelBoilerplate& operator=(const ChannelBoilerplate&) = delete; ChannelBoilerplate& operator=(ChannelBoilerplate&&) = delete; // Perform a send operation. 
void send(Buffer buffer, size_t length, TSendCallback callback) override; // Queue a recv operation. void recv(Buffer buffer, size_t length, TRecvCallback callback) override; // Tell the connection what its identifier is. void setId(std::string id) override; // Shut down the connection and its resources. void close() override; ~ChannelBoilerplate() override; protected: // Using a shared_ptr allows us to detach the lifetime of the implementation // from the public object's one and perform the destruction asynchronously. const std::shared_ptr impl_; }; template template ChannelBoilerplate::ChannelBoilerplate( typename ChannelImplBoilerplate::ConstructorToken token, std::shared_ptr context, std::string id, Args&&... args) : impl_(std::make_shared( token, std::move(context), std::move(id), std::forward(args)...)) { static_assert( std::is_base_of, TChan>::value, ""); impl_->init(); } template ChannelBoilerplate::ChannelBoilerplate( std::shared_ptr channel) : impl_(std::move(channel)) { static_assert( std::is_base_of, TChan>::value, ""); } template void ChannelBoilerplate::send( Buffer buffer, size_t length, TSendCallback callback) { if (unlikely(!impl_)) { // FIXME In C++-17 perhaps a global static inline variable would be better? static Error error = TP_CREATE_ERROR(ContextNotViableError); callback(error); return; } impl_->send(buffer, length, std::move(callback)); } template void ChannelBoilerplate::recv( Buffer buffer, size_t length, TRecvCallback callback) { if (unlikely(!impl_)) { // FIXME In C++-17 perhaps a global static inline variable would be better? 
static Error error = TP_CREATE_ERROR(ContextNotViableError); callback(error); return; } impl_->recv(buffer, length, std::move(callback)); } template void ChannelBoilerplate::setId(std::string id) { if (unlikely(!impl_)) { return; } impl_->setId(std::move(id)); } template void ChannelBoilerplate::close() { if (unlikely(!impl_)) { return; } impl_->close(); } template ChannelBoilerplate::~ChannelBoilerplate() { close(); } } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/channel_impl_boilerplate.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { template class ContextImplBoilerplate; template class ChannelImplBoilerplate : public std::enable_shared_from_this { public: class ConstructorToken { public: ConstructorToken(const ConstructorToken&) = default; private: explicit ConstructorToken() {} friend ContextImplBoilerplate; }; ChannelImplBoilerplate( ConstructorToken token, std::shared_ptr context, std::string id); ChannelImplBoilerplate(const ChannelImplBoilerplate&) = delete; ChannelImplBoilerplate(ChannelImplBoilerplate&&) = delete; ChannelImplBoilerplate& operator=(const ChannelImplBoilerplate&) = delete; ChannelImplBoilerplate& operator=(ChannelImplBoilerplate&&) = delete; // Initialize member fields that need `shared_from_this`. void init(); // Perform a send operation. void send(Buffer buffer, size_t length, TSendCallback callback); // Queue a recv operation. void recv(Buffer buffer, size_t length, TRecvCallback callback); // Tell the connection what its identifier is. void setId(std::string id); // Shut down the connection and its resources. 
void close(); virtual ~ChannelImplBoilerplate() = default; protected: virtual void initImplFromLoop() = 0; virtual void sendImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TSendCallback callback) = 0; virtual void recvImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TRecvCallback callback) = 0; virtual void handleErrorImpl() = 0; virtual void setIdImpl() {} void setError(Error error); const std::shared_ptr context_; Error error_{Error::kSuccess}; // An identifier for the connection, composed of the identifier for the // context or listener, combined with an increasing sequence number. It will // only be used for logging and debugging purposes. std::string id_; CallbackWrapper callbackWrapper_{*this, *this->context_}; private: // Initialize member fields that need `shared_from_this`. void initFromLoop(); // Perform a send operation. void sendFromLoop(Buffer buffer, size_t length, TSendCallback callback); // Queue a recv operation. void recvFromLoop(Buffer buffer, size_t length, TRecvCallback callback); void setIdFromLoop(std::string id); // Shut down the connection and its resources. void closeFromLoop(); // Deal with an error. void handleError(); // A sequence number for the calls to send and recv. uint64_t nextTensorBeingSent_{0}; uint64_t nextTensorBeingReceived_{0}; // For some odd reason it seems we need to use a qualified name here... template friend class tensorpipe::CallbackWrapper; // Contexts do sometimes need to call directly into closeFromLoop, in order to // make sure that some of their operations can happen "atomically" on the // connection, without possibly other operations occurring in between (e.g., // an error). 
friend ContextImplBoilerplate; }; template ChannelImplBoilerplate::ChannelImplBoilerplate( ConstructorToken /* unused */, std::shared_ptr context, std::string id) : context_(std::move(context)), id_(std::move(id)) {} template void ChannelImplBoilerplate::init() { context_->deferToLoop( [impl{this->shared_from_this()}]() { impl->initFromLoop(); }); } template void ChannelImplBoilerplate::initFromLoop() { if (context_->closed()) { // Set the error without calling setError because we do not want to invoke // the subclass's handleErrorImpl as it would find itself in a weird state // (since initFromLoop wouldn't have been called). error_ = TP_CREATE_ERROR(ChannelClosedError); TP_VLOG(4) << "Channel " << id_ << " is closing (without initing)"; return; } initImplFromLoop(); } template void ChannelImplBoilerplate::send( Buffer buffer, size_t length, TSendCallback callback) { context_->deferToLoop([impl{this->shared_from_this()}, buffer, length, callback{std::move(callback)}]() mutable { impl->sendFromLoop(buffer, length, std::move(callback)); }); } template void ChannelImplBoilerplate::sendFromLoop( Buffer buffer, size_t length, TSendCallback callback) { TP_DCHECK(context_->inLoop()); const uint64_t sequenceNumber = nextTensorBeingSent_++; TP_VLOG(4) << "Channel " << id_ << " received a send request (#" << sequenceNumber << ")"; callback = [this, sequenceNumber, callback{std::move(callback)}]( const Error& error) { // There is no requirement for the channel to invoke callbacks in order. 
TP_VLOG(4) << "Channel " << id_ << " is calling a send callback (#" << sequenceNumber << ")"; callback(error); TP_VLOG(4) << "Channel " << id_ << " done calling a send callback (#" << sequenceNumber << ")"; }; if (error_) { callback(error_); return; } sendImplFromLoop(sequenceNumber, buffer, length, std::move(callback)); } template void ChannelImplBoilerplate::recv( Buffer buffer, size_t length, TRecvCallback callback) { context_->deferToLoop([impl{this->shared_from_this()}, buffer, length, callback{std::move(callback)}]() mutable { impl->recvFromLoop(buffer, length, std::move(callback)); }); } template void ChannelImplBoilerplate::recvFromLoop( Buffer buffer, size_t length, TRecvCallback callback) { TP_DCHECK(context_->inLoop()); const uint64_t sequenceNumber = nextTensorBeingReceived_++; TP_VLOG(4) << "Channel " << id_ << " received a recv request (#" << sequenceNumber << ")"; callback = [this, sequenceNumber, callback{std::move(callback)}]( const Error& error) { // There is no requirement for the channel to invoke callbacks in order. 
TP_VLOG(4) << "Channel " << id_ << " is calling a recv callback (#" << sequenceNumber << ")"; callback(error); TP_VLOG(4) << "Channel " << id_ << " done calling a recv callback (#" << sequenceNumber << ")"; }; if (error_) { callback(error_); return; } recvImplFromLoop(sequenceNumber, buffer, length, std::move(callback)); } template void ChannelImplBoilerplate::setId(std::string id) { context_->deferToLoop( [impl{this->shared_from_this()}, id{std::move(id)}]() mutable { impl->setIdFromLoop(std::move(id)); }); } template void ChannelImplBoilerplate::setIdFromLoop(std::string id) { TP_DCHECK(context_->inLoop()); TP_VLOG(4) << "Channel " << id_ << " was renamed to " << id; id_ = std::move(id); setIdImpl(); } template void ChannelImplBoilerplate::close() { context_->deferToLoop( [impl{this->shared_from_this()}]() { impl->closeFromLoop(); }); } template void ChannelImplBoilerplate::closeFromLoop() { TP_DCHECK(context_->inLoop()); TP_VLOG(4) << "Channel " << id_ << " is closing"; setError(TP_CREATE_ERROR(ChannelClosedError)); } template void ChannelImplBoilerplate::setError(Error error) { // Don't overwrite an error that's already set. if (error_ || !error) { return; } error_ = std::move(error); handleError(); } template void ChannelImplBoilerplate::handleError() { TP_DCHECK(context_->inLoop()); TP_VLOG(5) << "Channel " << id_ << " is handling error " << error_.what(); handleErrorImpl(); } } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cma/channel_impl.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { namespace cma { namespace { struct Descriptor { uint32_t pid; uint64_t ptr; NOP_STRUCTURE(Descriptor, pid, ptr); }; } // namespace ChannelImpl::ChannelImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::shared_ptr descriptorConnection, std::shared_ptr completionConnection) : ChannelImplBoilerplate( token, std::move(context), std::move(id)), descriptorConnection_(std::move(descriptorConnection)), completionConnection_(std::move(completionConnection)) {} void ChannelImpl::initImplFromLoop() { context_->enroll(*this); } void ChannelImpl::sendImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TSendCallback callback) { SendOpIter opIter = sendOps_.emplaceBack(sequenceNumber); SendOperation& op = *opIter; op.callback = std::move(callback); op.ptr = buffer.unwrap().ptr; op.length = length; sendOps_.advanceOperation(opIter); } void ChannelImpl::advanceSendOperation( SendOpIter opIter, SendOperation::State prevOpState) { TP_DCHECK(context_->inLoop()); SendOperation& op = *opIter; sendOps_.attemptTransition( opIter, /*from=*/SendOperation::UNINITIALIZED, /*to=*/SendOperation::FINISHED, /*cond=*/error_ || op.length == 0, /*actions=*/{&ChannelImpl::callSendCallback}); // Needs to go after previous op to ensure predictable and consistent ordering // of write calls on the descriptor control connection and read calls on the // completion control connection. 
sendOps_.attemptTransition( opIter, /*from=*/SendOperation::UNINITIALIZED, /*to=*/SendOperation::READING_COMPLETION, /*cond=*/!error_ && prevOpState >= SendOperation::READING_COMPLETION, /*actions=*/ {&ChannelImpl::writeDescriptor, &ChannelImpl::readCompletion}); sendOps_.attemptTransition( opIter, /*from=*/SendOperation::READING_COMPLETION, /*to=*/SendOperation::FINISHED, /*cond=*/op.doneReadingCompletion, /*actions=*/{&ChannelImpl::callSendCallback}); } void ChannelImpl::writeDescriptor(SendOpIter opIter) { SendOperation& op = *opIter; auto nopHolder = std::make_shared>(); Descriptor& nopDescriptor = nopHolder->getObject(); // TODO: Store the PID upon channel/context instantiation. nopDescriptor.pid = ::getpid(); nopDescriptor.ptr = reinterpret_cast(op.ptr); TP_VLOG(6) << "Channel " << id_ << " is writing descriptor (#" << op.sequenceNumber << ")"; descriptorConnection_->write( *nopHolder, callbackWrapper_([sequenceNumber{op.sequenceNumber}, nopHolder](ChannelImpl& impl) { TP_VLOG(6) << "Channel " << impl.id_ << " done writing descriptor (#" << sequenceNumber << ")"; })); } void ChannelImpl::readCompletion(SendOpIter opIter) { SendOperation& op = *opIter; TP_VLOG(6) << "Channel " << id_ << " is reading completion (#" << op.sequenceNumber << ")"; completionConnection_->read( nullptr, 0, callbackWrapper_([opIter]( ChannelImpl& impl, const void* /* unused */, size_t /* unused */) { TP_VLOG(6) << "Channel " << impl.id_ << " done reading completion (#" << opIter->sequenceNumber << ")"; opIter->doneReadingCompletion = true; impl.sendOps_.advanceOperation(opIter); })); } void ChannelImpl::callSendCallback(SendOpIter opIter) { SendOperation& op = *opIter; op.callback(error_); // Reset callback to release the resources it was holding. 
op.callback = nullptr; } void ChannelImpl::recvImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TRecvCallback callback) { RecvOpIter opIter = recvOps_.emplaceBack(sequenceNumber); RecvOperation& op = *opIter; op.ptr = buffer.unwrap().ptr; op.length = length; op.callback = std::move(callback); recvOps_.advanceOperation(opIter); } void ChannelImpl::advanceRecvOperation( RecvOpIter opIter, RecvOperation::State prevOpState) { TP_DCHECK(context_->inLoop()); RecvOperation& op = *opIter; recvOps_.attemptTransition( opIter, /*from=*/RecvOperation::UNINITIALIZED, /*to=*/RecvOperation::FINISHED, /*cond=*/error_ || op.length == 0, /*actions=*/{&ChannelImpl::callRecvCallback}); // Needs to go after previous op to ensure predictable and consistent ordering // of read calls on the descriptor control connection. recvOps_.attemptTransition( opIter, /*from=*/RecvOperation::UNINITIALIZED, /*to=*/RecvOperation::READING_DESCRIPTOR, /*cond=*/!error_ && prevOpState >= RecvOperation::READING_DESCRIPTOR, /*actions=*/{&ChannelImpl::readDescriptor}); recvOps_.attemptTransition( opIter, /*from=*/RecvOperation::READING_DESCRIPTOR, /*to=*/RecvOperation::FINISHED, /*cond=*/error_ && op.doneReadingDescriptor, /*actions=*/{&ChannelImpl::callRecvCallback}); recvOps_.attemptTransition( opIter, /*from=*/RecvOperation::READING_DESCRIPTOR, /*to=*/RecvOperation::COPYING, /*cond=*/!error_ && op.doneReadingDescriptor, /*actions=*/{&ChannelImpl::copy}); recvOps_.attemptTransition( opIter, /*from=*/RecvOperation::COPYING, /*to=*/RecvOperation::FINISHED, /*cond=*/error_ && op.doneCopying, /*actions=*/{&ChannelImpl::callRecvCallback}); // Needs to go after previous op to ensure predictable and consistent ordering // of write calls on the completion control connection. 
recvOps_.attemptTransition( opIter, /*from=*/RecvOperation::COPYING, /*to=*/RecvOperation::FINISHED, /*cond=*/!error_ && op.doneCopying && prevOpState >= RecvOperation::FINISHED, /*actions=*/ {&ChannelImpl::callRecvCallback, &ChannelImpl::writeCompletion}); } void ChannelImpl::readDescriptor(RecvOpIter opIter) { RecvOperation& op = *opIter; TP_VLOG(6) << "Channel " << id_ << " is reading descriptor (#" << op.sequenceNumber << ")"; auto nopHolderIn = std::make_shared>(); descriptorConnection_->read( *nopHolderIn, callbackWrapper_([opIter, nopHolderIn](ChannelImpl& impl) { TP_VLOG(6) << "Channel " << impl.id_ << " done reading descriptor (#" << opIter->sequenceNumber << ")"; opIter->doneReadingDescriptor = true; if (!impl.error_) { Descriptor& nopDescriptor = nopHolderIn->getObject(); opIter->remotePid = nopDescriptor.pid; opIter->remotePtr = reinterpret_cast(nopDescriptor.ptr); } impl.recvOps_.advanceOperation(opIter); })); } void ChannelImpl::copy(RecvOpIter opIter) { RecvOperation& op = *opIter; TP_VLOG(6) << "Channel " << id_ << " is copying payload (#" << op.sequenceNumber << ")"; context_->requestCopy( op.remotePid, op.remotePtr, op.ptr, op.length, callbackWrapper_([opIter](ChannelImpl& impl) { TP_VLOG(6) << "Channel " << impl.id_ << " done copying payload (#" << opIter->sequenceNumber << ")"; opIter->doneCopying = true; impl.recvOps_.advanceOperation(opIter); })); } void ChannelImpl::callRecvCallback(RecvOpIter opIter) { RecvOperation& op = *opIter; op.callback(error_); // Reset callback to release the resources it was holding. 
op.callback = nullptr; } void ChannelImpl::writeCompletion(RecvOpIter opIter) { RecvOperation& op = *opIter; TP_VLOG(6) << "Channel " << id_ << " is writing completion (#" << op.sequenceNumber << ")"; completionConnection_->write( nullptr, 0, callbackWrapper_([sequenceNumber{op.sequenceNumber}](ChannelImpl& impl) { TP_VLOG(6) << "Channel " << impl.id_ << " done writing completion (#" << sequenceNumber << ")"; })); } void ChannelImpl::handleErrorImpl() { sendOps_.advanceAllOperations(); recvOps_.advanceAllOperations(); descriptorConnection_->close(); completionConnection_->close(); context_->unenroll(*this); } } // namespace cma } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cma/channel_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include #include #include #include namespace tensorpipe { namespace channel { namespace cma { class ContextImpl; struct SendOperation { enum State { UNINITIALIZED, READING_COMPLETION, FINISHED }; // Fields used by the state machine uint64_t sequenceNumber{0}; State state{UNINITIALIZED}; // Progress flags bool doneReadingCompletion{false}; // Arguments at creation void* ptr; size_t length; TSendCallback callback; }; struct RecvOperation { enum State { UNINITIALIZED, READING_DESCRIPTOR, COPYING, FINISHED }; // Fields used by the state machine uint64_t sequenceNumber{0}; State state{UNINITIALIZED}; // Progress flags bool doneReadingDescriptor{false}; bool doneCopying{false}; // Arguments at creation void* ptr; size_t length; TRecvCallback callback; // Other data pid_t remotePid; void* remotePtr; }; class ChannelImpl final : public ChannelImplBoilerplate { public: ChannelImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::shared_ptr descriptorConnection, std::shared_ptr completionConnection); protected: // Implement the entry points called by ChannelImplBoilerplate. void initImplFromLoop() override; void sendImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TSendCallback callback) override; void recvImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TRecvCallback callback) override; void handleErrorImpl() override; private: const std::shared_ptr descriptorConnection_; const std::shared_ptr completionConnection_; OpsStateMachine sendOps_{ *this, &ChannelImpl::advanceSendOperation}; using SendOpIter = decltype(sendOps_)::Iter; OpsStateMachine recvOps_{ *this, &ChannelImpl::advanceRecvOperation}; using RecvOpIter = decltype(recvOps_)::Iter; // State machines for send and recv ops. 
void advanceSendOperation( SendOpIter opIter, SendOperation::State prevOpState); void advanceRecvOperation( RecvOpIter opIter, RecvOperation::State prevOpState); // Actions (i.e., methods that begin a state transition). // For send operations: void writeDescriptor(SendOpIter opIter); void readCompletion(SendOpIter opIter); void callSendCallback(SendOpIter opIter); // For recv operations: void readDescriptor(RecvOpIter opIter); void copy(RecvOpIter opIter); void callRecvCallback(RecvOpIter opIter); void writeCompletion(RecvOpIter opIter); }; } // namespace cma } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cma/context_impl.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { namespace cma { namespace { // Prepend descriptor with transport name so it's easy to // disambiguate descriptors when debugging. 
const std::string kDomainDescriptorPrefix{"cma:"}; Error callProcessVmReadv( void* localPtr, void* remotePtr, size_t length, pid_t pid) { #ifdef SYS_process_vm_readv struct iovec localIov { .iov_base = localPtr, .iov_len = length }; struct iovec remoteIov { .iov_base = remotePtr, .iov_len = length }; ssize_t nread = static_cast(::syscall( SYS_process_vm_readv, pid, &localIov, /*liovcnt=*/static_cast(1), &remoteIov, /*riovcnt=*/static_cast(1), /*flags=*/static_cast(0))); if (nread < 0) { return TP_CREATE_ERROR(SystemError, "process_vm_readv", errno); } else if (nread != length) { return TP_CREATE_ERROR(ShortReadError, length, nread); } return Error::kSuccess; #else return TP_CREATE_ERROR(SystemError, "process_vm_readv", ENOSYS); #endif } class BadReadError final : public BaseError { public: BadReadError(uint64_t expected, uint64_t actual) : expected_(expected), actual_(actual) {} std::string what() const override { std::ostringstream oss; oss << "Expected to read " << expected_ << ", got " << actual_; return oss.str(); } private: const uint64_t expected_; const uint64_t actual_; }; // Old versions of Docker use a default seccomp-bpf rule that blocks some // ptrace-related syscalls. To find this out, we attempt such a call against // ourselves, which is always allowed (it shortcuts all checks, including LSMs), // hence a failure can only come from a "filter" on the syscall. // Or, in fact, it could also happen if the kernel doesn't support the syscall. 
Error attemptProcessVmReadvSyscallOnSelf() { uint64_t someSourceValue = 0x0123456789abcdef; uint64_t someTargetValue = 0; Error error = callProcessVmReadv( &someTargetValue, &someSourceValue, sizeof(uint64_t), ::getpid()); if (error) { return error; } if (someTargetValue != someSourceValue) { return TP_CREATE_ERROR(BadReadError, someSourceValue, someTargetValue); } return Error::kSuccess; } // According to read(2): // > On Linux, read() (and similar system calls) will transfer at most // > 0x7ffff000 (2,147,479,552) bytes, returning the number of bytes actually // > transferred. (This is true on both 32-bit and 64-bit systems.) constexpr size_t kMaxBytesReadableAtOnce = 0x7ffff000; Error performCopy( void* localPtr, void* remotePtr, size_t length, pid_t remotePid) { for (size_t offset = 0; offset < length; offset += kMaxBytesReadableAtOnce) { Error error = callProcessVmReadv( reinterpret_cast(localPtr) + offset, reinterpret_cast(remotePtr) + offset, std::min(length - offset, kMaxBytesReadableAtOnce), remotePid); if (error) { return error; } } return Error::kSuccess; } } // namespace std::shared_ptr ContextImpl::create() { int rv; std::ostringstream oss; oss << kDomainDescriptorPrefix; // This transport only works across processes on the same machine, and we // detect that by computing the boot ID. optional bootID = getBootID(); TP_THROW_ASSERT_IF(!bootID.has_value()) << "Unable to read boot_id"; oss << bootID.value(); // An endpoint can see the other through its PID if the latter is in a child // PID namespace of the former. Since the channel is bidirectional this must // be symmetric and thus the PID namespaces must be the same. optional pidNsID = getLinuxNamespaceId(LinuxNamespace::kPid); if (!pidNsID.has_value()) { TP_VLOG(5) << "Unable to read pid namespace ID"; return nullptr; } oss << '_' << pidNsID.value(); // The ability to call process_vm_readv on a target is controlled by the // PTRACE_MODE_ATTACH_REALCREDS check (see process_vm_readv(2)). 
We'll go // through its checklist, step by step (which is found in ptrace(2)). We will // ignore the CAP_SYS_PTRACE conditions (i.e., we'll assume we don't have that // capability) because they are hard to check, and typically not needed. // We'll skip the check on whether the endpoints are two threads of the same // process (in which case ptrace is always allowed) because it's hard to fit // it in the descriptor and because we have some other more specialized // channels for that case. // The next step involves comparing user and group IDs. If the processes are // in user namespaces the kernel first maps these IDs back to the top-level // ("initial") ones and compares those. We can't do such mapping, thus we // compare the IDs as integers as we see them and thus for this to work // properly we require that the two endpoints are in the same user namespace. // This does not in fact constitute an extra restriction since the later // commoncap/capability LSM check will need to enforce this too. optional userNsID = getLinuxNamespaceId(LinuxNamespace::kUser); if (!userNsID.has_value()) { TP_VLOG(5) << "Unable to read user namespace ID"; return nullptr; } oss << '_' << userNsID.value(); // It is required that our *real* user ID matches the real, effective and // saved-set user IDs of the target. And the same must hold for group IDs. // As the channel is bidirectional, the reverse must also hold, which means // our real, effective and saved-set IDs must all be equal and must match the // other endpoint's ones. 
uid_t realUserId, effectiveUserId, savedSetUserId; gid_t realGroupId, effectiveGroupId, savedSetGroupId; rv = ::getresuid(&realUserId, &effectiveUserId, &savedSetUserId); TP_THROW_SYSTEM_IF(rv < 0, errno); rv = ::getresgid(&realGroupId, &effectiveGroupId, &savedSetGroupId); TP_THROW_SYSTEM_IF(rv < 0, errno); if (realUserId != effectiveUserId || realUserId != savedSetUserId || realGroupId != effectiveGroupId || realGroupId != savedSetGroupId) { TP_VLOG(5) << "User IDs or group IDs aren't all equal. User IDs are " << realUserId << " (real), " << effectiveUserId << " (effective) and " << savedSetUserId << " (saved-set). Group IDs are " << realGroupId << " (real), " << effectiveGroupId << " (effective) and " << savedSetGroupId << " (saved-set)."; return nullptr; } oss << '_' << realUserId << '_' << realGroupId; // The target must be dumpable. Which, due to symmetry, means we must be // dumpable too. rv = ::prctl(PR_GET_DUMPABLE, 0, 0, 0, 0); TP_THROW_SYSTEM_IF(rv < 0, errno); // SUID_DUMP_USER has a value of 1. if (rv != 1) { TP_VLOG(5) << "Process isn't dumpable"; return nullptr; } // Next the Linux Security Modules (LSMs) kick in. Since users could register // third-party LSMs we'll need to draw a line in what we support. We have two // options with unsupported LSMs: play it safe and assume the LSM will reject // the check, or "trust" the user and make them responsible to deal with the // LSMs they added. We're leaning for the latter, as often some LSMs like // AppArmor or SELinux are enabled without actually restricting anything. For // now we'll support the LSMs that are found by default on common distros, // but we can include support for more of them if that becomes necessary. optional> lsms = getLinuxSecurityModules(); bool yamaOptional = false; if (!lsms.has_value()) { // This could happen if /sys/kernel/security/lsm cannot be opened. 
Although // that file looks like it resides on sysfs, it's actually on the securityfs // VFS, which is sometimes not bind-mounted inside containers. In such cases // rather than failing hard we'll check a couple of reasonable LSMs. TP_VLOG(5) << "Couldn't detect the active Linux Security Modules"; lsms.emplace(); *lsms = {"capability", "yama"}; // We don't know whether YAMA is really there, hence we'll remember to // tolerate any failures later on. yamaOptional = true; } else { TP_VLOG(5) << "Detected these Linux Security Modules: " << joinStrs(*lsms); } // FIXME Can we assume that the two endpoints will see the same list of LSMs, // or should we incorporate that into the domain descriptor? for (const std::string& lsm : lsms.value()) { if (lsm == "capability") { // We already checked that the endpoints are in the same user namespace. // We must check they have the same permitted capabilities in it. optional caps = getPermittedCapabilitiesID(); TP_THROW_ASSERT_IF(!caps.has_value()) << "Unable to obtain permitted capabilities"; oss << '_' << caps.value(); } else if (lsm == "yama") { optional yamaScope = getYamaPtraceScope(); if (!yamaScope.has_value()) { TP_THROW_ASSERT_IF(!yamaOptional) << "Unable to retrieve YAMA ptrace scope"; continue; } switch (yamaScope.value()) { case YamaPtraceScope::kClassicPtracePermissions: TP_VLOG(5) << "YAMA ptrace scope set to classic ptrace permissions"; break; case YamaPtraceScope::kRestrictedPtrace: TP_VLOG(5) << "YAMA ptrace scope set to restricted ptrace"; // FIXME It's not really great to change a global property of the // process, especially a security-related one. 
An "excuse" for doing // so is that UCT does the same: // https://github.com/openucx/ucx/blob/4d9976b6b8f8faae609c078c72aad8e5b842c43f/src/uct/sm/scopy/cma/cma_md.c#L61 #ifndef PR_SET_PTRACER // https://github.com/torvalds/linux/blob/master/include/uapi/linux/prctl.h #define PR_SET_PTRACER 0x59616d61 #endif #ifndef PR_SET_PTRACER_ANY // https://github.com/torvalds/linux/blob/master/include/uapi/linux/prctl.h #define PR_SET_PTRACER_ANY ((unsigned long)-1) #endif rv = ::prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY, 0, 0, 0); TP_THROW_SYSTEM_IF(rv < 0, errno); break; case YamaPtraceScope::kAdminOnlyAttach: TP_VLOG(5) << "YAMA ptrace scope set to admin-only attach"; return nullptr; case YamaPtraceScope::kNoAttach: TP_VLOG(5) << "YAMA ptrace scope set to no attach"; return nullptr; default: TP_THROW_ASSERT() << "Unknown YAMA ptrace scope"; } } } // In addition to the ptrace check, in some cases (I'm looking at you Docker) // the process_vm_readv syscall is outright blocked by seccomp-bpf. Or just // unsupported by the kernel. 
Error error = attemptProcessVmReadvSyscallOnSelf(); if (error) { TP_VLOG(5) << "The process_vm_readv syscall appears to be unavailable or blocked: " << error.what(); return nullptr; } std::string domainDescriptor = oss.str(); TP_VLOG(5) << "The domain descriptor for CMA is " << domainDescriptor; std::unordered_map deviceDescriptors = { {Device{kCpuDeviceType, 0}, std::move(domainDescriptor)}}; return std::make_shared(std::move(deviceDescriptors)); } ContextImpl::ContextImpl( std::unordered_map deviceDescriptors) : ContextImplBoilerplate( std::move(deviceDescriptors)) { thread_ = std::thread(&ContextImpl::handleCopyRequests, this); } std::shared_ptr ContextImpl::createChannel( std::vector> connections, Endpoint /* unused */) { TP_DCHECK_EQ(numConnectionsNeeded(), connections.size()); return createChannelInternal( std::move(connections[0]), std::move(connections[1])); } size_t ContextImpl::numConnectionsNeeded() const { return 2; } void ContextImpl::handleErrorImpl() { requests_.push(nullopt); } void ContextImpl::joinImpl() { thread_.join(); // TP_DCHECK(requests_.empty()); } bool ContextImpl::inLoop() const { return loop_.inLoop(); }; void ContextImpl::deferToLoop(std::function fn) { loop_.deferToLoop(std::move(fn)); }; void ContextImpl::requestCopy( pid_t remotePid, void* remotePtr, void* localPtr, size_t length, std::function fn) { uint64_t requestId = nextRequestId_++; TP_VLOG(4) << "Channel context " << id_ << " received a copy request (#" << requestId << ")"; fn = [this, requestId, fn{std::move(fn)}](const Error& error) { TP_VLOG(4) << "Channel context " << id_ << " is calling a copy request callback (#" << requestId << ")"; fn(error); TP_VLOG(4) << "Channel context " << id_ << " done calling a copy request callback (#" << requestId << ")"; }; requests_.push( CopyRequest{remotePid, remotePtr, localPtr, length, std::move(fn)}); } void ContextImpl::handleCopyRequests() { setThreadName("TP_CMA_loop"); while (true) { auto maybeRequest = requests_.pop(); if 
(!maybeRequest.has_value()) { break; } CopyRequest request = std::move(maybeRequest).value(); request.callback(performCopy( request.localPtr, request.remotePtr, request.length, request.remotePid)); } } } // namespace cma } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cma/context_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { namespace cma { class ChannelImpl; class ContextImpl final : public ContextImplBoilerplate { public: static std::shared_ptr create(); explicit ContextImpl( std::unordered_map deviceDescriptors); std::shared_ptr createChannel( std::vector> connections, Endpoint endpoint); size_t numConnectionsNeeded() const override; // Implement the DeferredExecutor interface. bool inLoop() const override; void deferToLoop(std::function fn) override; using copy_request_callback_fn = std::function; void requestCopy( pid_t remotePid, void* remotePtr, void* localPtr, size_t length, copy_request_callback_fn fn); protected: // Implement the entry points called by ContextImplBoilerplate. void handleErrorImpl() override; void joinImpl() override; private: OnDemandDeferredExecutor loop_; struct CopyRequest { pid_t remotePid; void* remotePtr; void* localPtr; size_t length; copy_request_callback_fn callback; }; std::thread thread_; Queue> requests_{std::numeric_limits::max()}; // This is atomic because it may be accessed from outside the loop. 
std::atomic nextRequestId_{0}; void handleCopyRequests(); }; } // namespace cma } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cma/factory.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include namespace tensorpipe { namespace channel { namespace cma { std::shared_ptr create() { return std::make_shared>(); } } // namespace cma } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cma/factory.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include namespace tensorpipe { namespace channel { namespace cma { std::shared_ptr create(); } // namespace cma } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/context.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include namespace tensorpipe { namespace channel { enum class Endpoint : bool { kConnect, kListen }; class Channel; // Abstract base class for channel context classes. // // Instances of these classes are expected to be registered with a // context. All registered instances are assumed to be eligible // channels for all pairs. 
// class Context { public: // Return whether the context is able to operate correctly. // // Some channel types may be unable to perform as intended under some // circumstances (e.g., specialized hardware unavailable, lack of // permissions). They can report it through this method in order for // the core context to avoid registering them in the first place. // virtual bool isViable() const = 0; // Return the number of control connections needed to create an instance of // this channel. // // Most channels require only one, but some require more (cuda_basic), and // some might require none. // virtual size_t numConnectionsNeeded() const = 0; // Return a map from supported devices to strings describing the device from // the channel's perspective. // // Two processes with a channel context of the same type can leverage this // channel to make two devices communicate if one side's device descriptor is // "accepted" by the other one, using the canCommunicateWithRemote method // below. That method must be symmetric, and unless overridden defaults to // string comparison. // virtual const std::unordered_map& deviceDescriptors() const = 0; // Compare local and remote device descriptors for compatibility. // // Determine whether a channel can be opened between a local device and // a remote one that has the given device descriptor. This function // needs to be symmetric: if we called this method on the remote // context with the local descriptor we should get the same answer. // Unless overridden it defaults to string comparison. // virtual bool canCommunicateWithRemote( const std::string& localDeviceDescriptor, const std::string& remoteDeviceDescriptor) const = 0; // Return newly created channel using the specified connections. // // It is up to the channel to either use these connections for further // initialization, or use them directly. Either way, the returned // channel should be immediately usable. 
If the channel isn't fully // initialized yet, take care to queue these operations to execute // as soon as initialization has completed. // virtual std::shared_ptr createChannel( std::vector>, Endpoint) = 0; // Tell the context what its identifier is. // // This is only supposed to be called from the high-level context. It will // only used for logging and debugging purposes. virtual void setId(std::string id) = 0; // Put the channel context in a terminal state, in turn closing all of its // channels, and release its resources. This may be done asynchronously, in // background. virtual void close() = 0; // Wait for all resources to be released and all background activity to stop. virtual void join() = 0; virtual ~Context() = default; private: std::string name_; }; } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/context_boilerplate.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { template class ContextBoilerplate : public Context { public: template explicit ContextBoilerplate(Args&&... 
args); ContextBoilerplate(const ContextBoilerplate&) = delete; ContextBoilerplate(ContextBoilerplate&&) = delete; ContextBoilerplate& operator=(const ContextBoilerplate&) = delete; ContextBoilerplate& operator=(ContextBoilerplate&&) = delete; std::shared_ptr createChannel( std::vector> connections, Endpoint endpoint) override; size_t numConnectionsNeeded() const override; bool isViable() const override; const std::unordered_map& deviceDescriptors() const override; bool canCommunicateWithRemote( const std::string& localDeviceDescriptor, const std::string& remoteDeviceDescriptor) const override; void setId(std::string id) override; void close() override; void join() override; ~ContextBoilerplate() override; protected: // The implementation is managed by a shared_ptr because each child object // will also hold a shared_ptr to it. However, its lifetime is tied to the one // of this public object since when the latter is destroyed the implementation // is closed and joined. const std::shared_ptr impl_; }; template template ContextBoilerplate::ContextBoilerplate(Args&&... args) : impl_(TCtx::create(std::forward(args)...)) { static_assert( std::is_base_of, TChan>::value, ""); if (unlikely(!impl_)) { return; } impl_->init(); } template std::shared_ptr ContextBoilerplate::createChannel( std::vector> connections, Endpoint endpoint) { if (unlikely(!impl_)) { return std::make_shared>(nullptr); } return impl_->createChannel(std::move(connections), endpoint); } template size_t ContextBoilerplate::numConnectionsNeeded() const { if (unlikely(!impl_)) { return 0; } return impl_->numConnectionsNeeded(); } template bool ContextBoilerplate::isViable() const { return impl_ != nullptr; } template const std::unordered_map& ContextBoilerplate:: deviceDescriptors() const { if (unlikely(!impl_)) { // FIXME In C++-17 perhaps a global static inline variable would be better? 
static std::unordered_map empty = {}; return empty; } return impl_->deviceDescriptors(); } template bool ContextBoilerplate::canCommunicateWithRemote( const std::string& localDeviceDescriptor, const std::string& remoteDeviceDescriptor) const { if (unlikely(!impl_)) { return false; } return impl_->canCommunicateWithRemote( localDeviceDescriptor, remoteDeviceDescriptor); } template void ContextBoilerplate::setId(std::string id) { if (unlikely(!impl_)) { return; } impl_->setId(std::move(id)); } template void ContextBoilerplate::close() { if (unlikely(!impl_)) { return; } impl_->close(); } template void ContextBoilerplate::join() { if (unlikely(!impl_)) { return; } impl_->join(); } template ContextBoilerplate::~ContextBoilerplate() { join(); } } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/context_impl_boilerplate.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { template class ContextImplBoilerplate : public virtual DeferredExecutor, public std::enable_shared_from_this { public: explicit ContextImplBoilerplate( std::unordered_map deviceDescriptors); ContextImplBoilerplate(const ContextImplBoilerplate&) = delete; ContextImplBoilerplate(ContextImplBoilerplate&&) = delete; ContextImplBoilerplate& operator=(const ContextImplBoilerplate&) = delete; ContextImplBoilerplate& operator=(ContextImplBoilerplate&&) = delete; void init(); virtual size_t numConnectionsNeeded() const; const std::unordered_map& deviceDescriptors() const; virtual bool canCommunicateWithRemote( const std::string& localDeviceDescriptor, const std::string& remoteDeviceDescriptor) const; // Enrolling dependent objects (channels) causes them to be kept alive for as // long as the context exists. These objects should enroll themselves as soon // as they're created (in their initImplFromLoop method) and unenroll // themselves after they've completed handling an error (either right in the // handleErrorImpl method or in a subsequent callback). The context, on the // other hand, should avoid terminating (i.e., complete joining) until all // objects have unenrolled themselves. void enroll(TChan& channel); void unenroll(TChan& channel); // Return whether the context is in a closed state. To avoid race conditions, // this must be called from within the loop. bool closed(); void setId(std::string id); void close(); void join(); virtual ~ContextImplBoilerplate() = default; protected: virtual void initImplFromLoop() {} virtual void handleErrorImpl() = 0; virtual void joinImpl() = 0; virtual void setIdImpl() {} void setError(Error error); template std::shared_ptr createChannelInternal(Args&&... 
args); Error error_{Error::kSuccess}; // An identifier for the context, composed of the identifier for the context, // combined with the channel's name. It will only be used for logging and // debugging purposes. std::string id_{"N/A"}; CallbackWrapper callbackWrapper_{*this, *this}; private: void initFromLoop(); void closeFromLoop(); void handleError(); std::atomic joined_{false}; const std::unordered_map deviceDescriptors_; // Sequence numbers for the channels created by this context, used to create // their identifiers based off this context's identifier. They will only be // used for logging and debugging. std::atomic channelCounter_{0}; // Store shared_ptrs to dependent objects that have enrolled themselves to // keep them alive. We use a map, indexed by raw pointers, rather than a set // of shared_ptrs so that we can erase objects without them having to create // a fresh shared_ptr just for that. std::unordered_map> channels_; // For some odd reason it seems we need to use a qualified name here... template friend class tensorpipe::CallbackWrapper; }; template ContextImplBoilerplate::ContextImplBoilerplate( std::unordered_map deviceDescriptors) : deviceDescriptors_(std::move(deviceDescriptors)) {} template template std::shared_ptr ContextImplBoilerplate:: createChannelInternal(Args&&... 
args) { std::string channelId = id_ + ".c" + std::to_string(channelCounter_++); TP_VLOG(4) << "Channel context " << id_ << " is opening channel " << channelId; return std::make_shared>( typename ChannelImplBoilerplate::ConstructorToken(), this->shared_from_this(), std::move(channelId), std::forward(args)...); } template void ContextImplBoilerplate::init() { deferToLoop([this]() { initFromLoop(); }); } template void ContextImplBoilerplate::initFromLoop() { TP_DCHECK(inLoop()); TP_DCHECK(!error_); initImplFromLoop(); } template size_t ContextImplBoilerplate::numConnectionsNeeded() const { return 1; } template const std::unordered_map& ContextImplBoilerplate< TCtx, TChan>::deviceDescriptors() const { return deviceDescriptors_; } template bool ContextImplBoilerplate::canCommunicateWithRemote( const std::string& localDeviceDescriptor, const std::string& remoteDeviceDescriptor) const { return localDeviceDescriptor == remoteDeviceDescriptor; } template void ContextImplBoilerplate::enroll(TChan& channel) { TP_DCHECK(inLoop()); bool wasInserted; std::tie(std::ignore, wasInserted) = channels_.emplace(&channel, channel.shared_from_this()); TP_DCHECK(wasInserted); } template void ContextImplBoilerplate::unenroll(TChan& channel) { TP_DCHECK(inLoop()); auto numRemoved = channels_.erase(&channel); TP_DCHECK_EQ(numRemoved, 1); } template bool ContextImplBoilerplate::closed() { TP_DCHECK(inLoop()); return error_; }; template void ContextImplBoilerplate::setId(std::string id) { TP_VLOG(4) << "Channel context " << id_ << " was renamed to " << id; id_ = std::move(id); setIdImpl(); } template void ContextImplBoilerplate::close() { deferToLoop([this]() { closeFromLoop(); }); } template void ContextImplBoilerplate::closeFromLoop() { TP_DCHECK(inLoop()); TP_VLOG(4) << "Channel context " << id_ << " is closing"; setError(TP_CREATE_ERROR(ContextClosedError)); TP_VLOG(4) << "Channel context " << id_ << " done closing"; } template void ContextImplBoilerplate::setError(Error error) { // Don't 
overwrite an error that's already set. if (error_ || !error) { return; } error_ = std::move(error); handleError(); } template void ContextImplBoilerplate::handleError() { TP_DCHECK(inLoop()); TP_VLOG(5) << "Channel context " << id_ << " is handling error " << error_.what(); // Make a copy as they could unenroll themselves inline. auto channelsCopy = channels_; // We call closeFromLoop, rather than just close, because we need these // objects to transition _immediately_ to error, "atomically". If we just // deferred closing to later, this could come after some already-enqueued // operations that could try to access the context, which would be closed, // and this could fail. for (auto& iter : channelsCopy) { iter.second->closeFromLoop(); } handleErrorImpl(); } template void ContextImplBoilerplate::join() { close(); if (!joined_.exchange(true)) { TP_VLOG(4) << "Channel context " << id_ << " is joining"; // As closing is deferred to the loop, we must wait for closeImpl to be // actually called before we call joinImpl, to avoid race conditions. For // this, we defer another task to the loop, which we know will run after the // closing, and then we wait for that task to be run. std::promise hasClosed; deferToLoop([&]() { hasClosed.set_value(); }); hasClosed.get_future().wait(); joinImpl(); TP_VLOG(4) << "Channel context " << id_ << " done joining"; // FIXME This may actually not be true, as channels could for example be // kept alive by the underlying transport, and thus outlive their context. // TP_DCHECK(channels_.empty()); } } } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cuda_basic/channel_impl.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { namespace cuda_basic { namespace { size_t ceilOfRatio(size_t n, size_t d) { return (n + d - 1) / d; } } // namespace ChannelImpl::ChannelImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::shared_ptr connection, std::shared_ptr cpuChannel, CudaLoop& cudaLoop) : ChannelImplBoilerplate( token, std::move(context), std::move(id)), connection_(std::move(connection)), cpuChannel_(std::move(cpuChannel)), cudaLoop_(cudaLoop) {} void ChannelImpl::initImplFromLoop() { context_->enroll(*this); } void ChannelImpl::cudaCopy( void* dst, const void* src, size_t length, int deviceIdx, cudaStream_t stream, std::function callback) { { CudaDeviceGuard guard(deviceIdx); TP_CUDA_CHECK(cudaMemcpyAsync(dst, src, length, cudaMemcpyDefault, stream)); } cudaLoop_.addCallback(deviceIdx, stream, std::move(callback)); } void ChannelImpl::sendImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TSendCallback callback) { if (length == 0) { callback(error_); return; } const Device device = buffer.device(); const size_t chunkLength = kSlotSize; const size_t numChunks = ceilOfRatio(length, chunkLength); for (size_t offset = 0; offset < length; offset += chunkLength) { ChunkSendOpIter opIter = chunkSendOps_.emplaceBack(nextChunkBeingSent_++); ChunkSendOperation& op = *opIter; op.bufferSequenceNumber = sequenceNumber; op.chunkId = offset / chunkLength; op.numChunks = numChunks; op.length = std::min(length - offset, chunkLength); // Operations are processed in order, so we can afford to trigger the // callback once the last operation is done. 
    // Tail of ChannelImpl::sendImplFromLoop (its head lies above this chunk):
    // the user callback is attached only to the last chunk of the buffer,
    // since operations are processed in order.
    if (op.chunkId == numChunks - 1) {
      op.callback = std::move(callback);
    }
    // NOTE(review): the template arguments of static_cast<...> and
    // buffer.unwrap<...>() appear to have been stripped from this dump
    // (presumably CpuBuffer/CudaBuffer and a byte-pointer cast) — confirm
    // against the upstream sources.
    if (device.type == kCpuDeviceType) {
      op.isCpuBuffer = true;
      op.devicePtr = static_cast(buffer.unwrap().ptr) + offset;
    } else if (device.type == kCudaDeviceType) {
      op.isCpuBuffer = false;
      op.devicePtr = static_cast(buffer.unwrap().ptr) + offset;
      op.stream = buffer.unwrap().stream;
      op.deviceIdx = device.index;
    } else {
      TP_THROW_ASSERT() << "Unexpected device type: " << device.type;
    }
    chunkSendOps_.advanceOperation(opIter);
  }
}

// Drives the send-side state machine for one chunk operation. Each
// attemptTransition below fires at most once per invocation; conditions
// consult both this op's progress flags and the previous op's state
// (prevOpState) to preserve cross-operation ordering guarantees.
void ChannelImpl::advanceChunkSendOperation(
    ChunkSendOpIter opIter,
    ChunkSendOperation::State prevOpState) {
  TP_DCHECK(context_->inLoop());
  ChunkSendOperation& op = *opIter;

  // Needs to go after previous op invoked its callback because the last chunk
  // in a series (that corresponds to one operation) must invoke its callback
  // only when all chunks in the series are done.
  chunkSendOps_.attemptTransition(
      opIter,
      /*from=*/ChunkSendOperation::UNINITIALIZED,
      /*to=*/ChunkSendOperation::FINISHED,
      /*cond=*/error_ && prevOpState >= ChunkSendOperation::INVOKED_CALLBACK,
      /*actions=*/{&ChannelImpl::callSendCallback});

  // Needs to go after previous op to ensure predictable and consistent
  // ordering of send calls on CPU channel.
  // This transition shortcuts the allocation of/copy to staging memory when
  // the buffer is already on CPU.
  chunkSendOps_.attemptTransition(
      opIter,
      /*from=*/ChunkSendOperation::UNINITIALIZED,
      /*to=*/ChunkSendOperation::SENDING_CPU_BUFFER,
      /*cond=*/!error_ && op.isCpuBuffer &&
          prevOpState >= ChunkSendOperation::SENDING_CPU_BUFFER,
      /*actions=*/
      {&ChannelImpl::writeReadyToSend, &ChannelImpl::sendCpuBuffer});

  // Needs to go after previous op to ensure later operations are not holding
  // staging buffers while earlier ones are still blocked waiting for them,
  // because the staging buffer will only be returned to the allocator once the
  // operation is destroyed, but this won't happen until earlier operations
  // have completed, and if they are blocked waiting for buffers we may
  // deadlock.
  chunkSendOps_.attemptTransition(
      opIter,
      /*from=*/ChunkSendOperation::UNINITIALIZED,
      /*to=*/ChunkSendOperation::ALLOCATING_CPU_BUFFER,
      /*cond=*/!error_ && !op.isCpuBuffer &&
          prevOpState >= ChunkSendOperation::ALLOCATING_CPU_BUFFER,
      /*actions=*/{&ChannelImpl::allocateSendCpuBuffer});

  // See above for why this needs to go after previous op.
  chunkSendOps_.attemptTransition(
      opIter,
      /*from=*/ChunkSendOperation::ALLOCATING_CPU_BUFFER,
      /*to=*/ChunkSendOperation::FINISHED,
      /*cond=*/error_ && op.doneAllocatingCpuStagingBuffer &&
          prevOpState >= ChunkSendOperation::INVOKED_CALLBACK,
      /*actions=*/
      {&ChannelImpl::callSendCallback, &ChannelImpl::returnSendCpuBuffer});

  // Needs to go after previous op to ensure predictable and consistent
  // ordering of write calls on the control connection.
  chunkSendOps_.attemptTransition(
      opIter,
      /*from=*/ChunkSendOperation::ALLOCATING_CPU_BUFFER,
      /*to=*/ChunkSendOperation::COPYING_FROM_GPU_TO_CPU,
      /*cond=*/!error_ && op.doneAllocatingCpuStagingBuffer &&
          prevOpState >= ChunkSendOperation::COPYING_FROM_GPU_TO_CPU,
      /*actions=*/
      {&ChannelImpl::writeReadyToSend, &ChannelImpl::copyFromGpuToCpu});

  // See above for why this needs to go after previous op.
  chunkSendOps_.attemptTransition(
      opIter,
      /*from=*/ChunkSendOperation::COPYING_FROM_GPU_TO_CPU,
      /*to=*/ChunkSendOperation::FINISHED,
      /*cond=*/error_ && op.doneCopyingFromGpuToCpu &&
          prevOpState >= ChunkSendOperation::INVOKED_CALLBACK,
      /*actions=*/
      {&ChannelImpl::callSendCallback, &ChannelImpl::returnSendCpuBuffer});

  // See above for why this needs to go after previous op.
  chunkSendOps_.attemptTransition(
      opIter,
      /*from=*/ChunkSendOperation::COPYING_FROM_GPU_TO_CPU,
      /*to=*/ChunkSendOperation::INVOKED_CALLBACK,
      /*cond=*/!error_ && op.doneCopyingFromGpuToCpu &&
          prevOpState >= ChunkSendOperation::INVOKED_CALLBACK,
      /*actions=*/{&ChannelImpl::callSendCallback});

  chunkSendOps_.attemptTransition(
      opIter,
      /*from=*/ChunkSendOperation::INVOKED_CALLBACK,
      /*to=*/ChunkSendOperation::FINISHED,
      /*cond=*/error_,
      /*actions=*/{&ChannelImpl::returnSendCpuBuffer});

  // Needs to go after previous op to ensure predictable and consistent
  // ordering of send calls on CPU channel.
  chunkSendOps_.attemptTransition(
      opIter,
      /*from=*/ChunkSendOperation::INVOKED_CALLBACK,
      /*to=*/ChunkSendOperation::SENDING_CPU_BUFFER,
      /*cond=*/!error_ &&
          prevOpState >= ChunkSendOperation::SENDING_CPU_BUFFER,
      /*actions=*/{&ChannelImpl::sendCpuBuffer});

  // A CPU source buffer has no staging memory to return: finishing means
  // invoking the callback.
  chunkSendOps_.attemptTransition(
      opIter,
      /*from=*/ChunkSendOperation::SENDING_CPU_BUFFER,
      /*to=*/ChunkSendOperation::FINISHED,
      /*cond=*/op.doneSendingCpuBuffer && op.isCpuBuffer,
      /*actions=*/{&ChannelImpl::callSendCallback});

  // A GPU source buffer had its callback invoked earlier (after the D2H
  // copy); finishing only returns the staging buffer.
  chunkSendOps_.attemptTransition(
      opIter,
      /*from=*/ChunkSendOperation::SENDING_CPU_BUFFER,
      /*to=*/ChunkSendOperation::FINISHED,
      /*cond=*/op.doneSendingCpuBuffer && !op.isCpuBuffer,
      /*actions=*/{&ChannelImpl::returnSendCpuBuffer});
}

// Asks the context's pinned-host send allocator for a staging slot for this
// chunk (GPU-to-CPU path only).
void ChannelImpl::allocateSendCpuBuffer(ChunkSendOpIter opIter) {
  ChunkSendOperation& op = *opIter;
  TP_VLOG(5) << "Channel " << id_
             << " is allocating temporary memory for chunk #" << op.chunkId
             << " of " << op.numChunks << " for buffer #"
             << op.bufferSequenceNumber;
  Allocator& cudaHostAllocator =
      context_->getCudaHostSendAllocator(op.deviceIdx);
  cudaHostAllocator.alloc(
      op.length,
      callbackWrapper_(
          // NOTE(review): the template argument of std::shared_ptr appears
          // stripped in this dump (the allocator hands back a byte buffer) —
          // confirm against the upstream sources.
          [opIter](ChannelImpl& impl, std::shared_ptr tmpBuffer) {
            TP_VLOG(5) << "Channel " << impl.id_
                       << " is done allocating temporary memory for chunk #"
                       << opIter->chunkId << " of " << opIter->numChunks
                       << " for buffer #" << opIter->bufferSequenceNumber;
            opIter->doneAllocatingCpuStagingBuffer = true;
            // On error the staging buffer is dropped immediately so it goes
            // straight back to the allocator.
            if (!impl.error_) {
              opIter->tmpBuffer = std::move(tmpBuffer);
            }
            impl.chunkSendOps_.advanceOperation(opIter);
          }));
}

// Writes a zero-byte control message on the connection telling the receiver
// that this chunk is about to be sent on the CPU channel.
void ChannelImpl::writeReadyToSend(ChunkSendOpIter opIter) {
  ChunkSendOperation& op = *opIter;
  TP_VLOG(6) << "Channel " << id_
             << " is sending ready-to-send notification for chunk #"
             << op.chunkId << " of " << op.numChunks << " for buffer #"
             << op.bufferSequenceNumber;
  connection_->write(
      nullptr,
      0,
      // The op may be destroyed before this write completes, hence the
      // logging fields are captured by value instead of holding opIter.
      callbackWrapper_([bufferSequenceNumber{op.bufferSequenceNumber},
                        chunkId{op.chunkId},
                        numChunks{op.numChunks}](ChannelImpl& impl) {
        TP_VLOG(6) << "Channel " << impl.id_
                   << " is done sending ready-to-send notification for chunk #"
                   << chunkId << " of " << numChunks << " for buffer #"
                   << bufferSequenceNumber;
      }));
}

// Issues the device-to-host copy of this chunk into its pinned staging
// buffer, on the operation's stream/device.
void ChannelImpl::copyFromGpuToCpu(ChunkSendOpIter opIter) {
  ChunkSendOperation& op = *opIter;
  TP_VLOG(5) << "Channel " << id_ << " is copying chunk #" << op.chunkId
             << " of " << op.numChunks << " for buffer #"
             << op.bufferSequenceNumber << " from CUDA device to CPU";
  cudaCopy(
      op.tmpBuffer.get(),
      op.devicePtr,
      op.length,
      op.deviceIdx,
      op.stream,
      callbackWrapper_([opIter](ChannelImpl& impl) {
        TP_VLOG(5) << "Channel " << impl.id_ << " is done copying chunk #"
                   << opIter->chunkId << " of " << opIter->numChunks
                   << " for buffer #" << opIter->bufferSequenceNumber
                   << " from CUDA device to CPU";
        opIter->doneCopyingFromGpuToCpu = true;
        impl.chunkSendOps_.advanceOperation(opIter);
      }));
}

// Hands the chunk (either the user's CPU buffer or the staging copy) to the
// wrapped CPU channel.
void ChannelImpl::sendCpuBuffer(ChunkSendOpIter opIter) {
  ChunkSendOperation& op = *opIter;
  TP_VLOG(6) << "Channel " << id_ << " is sending chunk #" << op.chunkId
             << " of " << op.numChunks << " for buffer #"
             << op.bufferSequenceNumber << " through CPU channel";
  cpuChannel_->send(
      CpuBuffer{.ptr = op.isCpuBuffer ? op.devicePtr : op.tmpBuffer.get()},
      op.length,
      callbackWrapper_([opIter](ChannelImpl& impl) {
        TP_VLOG(6) << "Channel " << impl.id_ << " is done sending chunk #"
                   << opIter->chunkId << " of " << opIter->numChunks
                   << " for buffer #" << opIter->bufferSequenceNumber
                   << " through CPU channel";
        opIter->doneSendingCpuBuffer = true;
        impl.chunkSendOps_.advanceOperation(opIter);
      }));
}

// Invokes (at most once) the user callback that was attached to the last
// chunk of the buffer.
void ChannelImpl::callSendCallback(ChunkSendOpIter opIter) {
  ChunkSendOperation& op = *opIter;
  if (op.callback) {
    op.callback(error_);
    // Reset callback to release the resources it was holding.
    op.callback = nullptr;
  }
}

// Releases this op's staging buffer back to the allocator.
void ChannelImpl::returnSendCpuBuffer(ChunkSendOpIter opIter) {
  ChunkSendOperation& op = *opIter;
  // The pointer's deleter will return the buffer to the allocator.
  op.tmpBuffer = nullptr;
}

// Splits an incoming buffer into kSlotSize chunks and enqueues one recv-side
// state-machine operation per chunk.
void ChannelImpl::recvImplFromLoop(
    uint64_t sequenceNumber,
    Buffer buffer,
    size_t length,
    TRecvCallback callback) {
  // Zero-length transfers complete immediately; no chunks are created.
  if (length == 0) {
    callback(error_);
    return;
  }
  const Device device = buffer.device();
  const size_t chunkLength = kSlotSize;
  const size_t numChunks = ceilOfRatio(length, chunkLength);
  for (size_t offset = 0; offset < length; offset += chunkLength) {
    ChunkRecvOpIter opIter =
        chunkRecvOps_.emplaceBack(nextChunkBeingReceived_++);
    ChunkRecvOperation& op = *opIter;
    op.bufferSequenceNumber = sequenceNumber;
    op.chunkId = offset / chunkLength;
    op.numChunks = numChunks;
    // The final chunk may be shorter than a full slot.
    op.length = std::min(length - offset, chunkLength);
    // Operations are processed in order, so we can afford to trigger the
    // callback once the last operation is done.
    // Only the last chunk of the buffer carries the user callback.
    if (op.chunkId == numChunks - 1) {
      op.callback = std::move(callback);
    }
    // NOTE(review): the template arguments of static_cast<...> and
    // buffer.unwrap<...>() appear to have been stripped from this dump —
    // confirm against the upstream sources.
    if (device.type == kCpuDeviceType) {
      op.isCpuBuffer = true;
      op.devicePtr = static_cast(buffer.unwrap().ptr) + offset;
    } else if (device.type == kCudaDeviceType) {
      op.isCpuBuffer = false;
      op.devicePtr = static_cast(buffer.unwrap().ptr) + offset;
      op.stream = buffer.unwrap().stream;
      op.deviceIdx = device.index;
    } else {
      TP_THROW_ASSERT() << "Unexpected device type: " << device.type;
    }
    chunkRecvOps_.advanceOperation(opIter);
  }
}

// Drives the recv-side state machine for one chunk operation; mirrors
// advanceChunkSendOperation, with prevOpState enforcing cross-operation
// ordering.
void ChannelImpl::advanceChunkRecvOperation(
    ChunkRecvOpIter opIter,
    ChunkRecvOperation::State prevOpState) {
  TP_DCHECK(context_->inLoop());
  ChunkRecvOperation& op = *opIter;

  // Needs to go after previous op invoked its callback because the last chunk
  // in a series (that corresponds to one operation) must invoke its callback
  // only when all chunks in the series are done.
  chunkRecvOps_.attemptTransition(
      opIter,
      /*from=*/ChunkRecvOperation::UNINITIALIZED,
      /*to=*/ChunkRecvOperation::FINISHED,
      /*cond=*/error_ &&
          prevOpState >=
              ChunkRecvOperation::COPYING_FROM_CPU_TO_GPU_AND_INVOKED_CALLBACK,
      /*actions=*/{&ChannelImpl::callRecvCallback});

  // Needs to go after previous op to ensure predictable and consistent
  // ordering of read calls on control connection.
  chunkRecvOps_.attemptTransition(
      opIter,
      /*from=*/ChunkRecvOperation::UNINITIALIZED,
      /*to=*/ChunkRecvOperation::READING_READY_TO_SEND,
      /*cond=*/!error_ &&
          prevOpState >= ChunkRecvOperation::READING_READY_TO_SEND,
      /*actions=*/{&ChannelImpl::readReadyToSend});

  // See above for why this needs to go after previous op.
  chunkRecvOps_.attemptTransition(
      opIter,
      /*from=*/ChunkRecvOperation::READING_READY_TO_SEND,
      /*to=*/ChunkRecvOperation::FINISHED,
      /*cond=*/error_ && op.doneReadingReadyToSend &&
          prevOpState >=
              ChunkRecvOperation::COPYING_FROM_CPU_TO_GPU_AND_INVOKED_CALLBACK,
      /*actions=*/{&ChannelImpl::callRecvCallback});

  // Needs to go after previous op to ensure predictable and consistent
  // ordering of recv calls on CPU channel.
  // This operation shortcuts allocating staging memory when receiving
  // directly on CPU.
  chunkRecvOps_.attemptTransition(
      opIter,
      /*from=*/ChunkRecvOperation::READING_READY_TO_SEND,
      /*to=*/ChunkRecvOperation::RECEIVING_CPU_BUFFER,
      /*cond=*/!error_ && op.doneReadingReadyToSend && op.isCpuBuffer &&
          prevOpState >= ChunkRecvOperation::RECEIVING_CPU_BUFFER,
      /*actions=*/{&ChannelImpl::receiveCpuBuffer});

  // Needs to go after previous op to ensure later operations are not holding
  // staging buffers while earlier ones are still blocked waiting for them,
  // because the staging buffer will only be returned to the allocator once the
  // operation is destroyed, but this won't happen until earlier operations
  // have completed, and if they are blocked waiting for buffers we may
  // deadlock.
  chunkRecvOps_.attemptTransition(
      opIter,
      /*from=*/ChunkRecvOperation::READING_READY_TO_SEND,
      /*to=*/ChunkRecvOperation::ALLOCATING_CPU_BUFFER,
      /*cond=*/!error_ && op.doneReadingReadyToSend && !op.isCpuBuffer &&
          prevOpState >= ChunkRecvOperation::ALLOCATING_CPU_BUFFER,
      /*actions=*/{&ChannelImpl::allocateRecvCpuBuffer});

  // See above for why this needs to go after previous op.
  chunkRecvOps_.attemptTransition(
      opIter,
      /*from=*/ChunkRecvOperation::ALLOCATING_CPU_BUFFER,
      /*to=*/ChunkRecvOperation::FINISHED,
      /*cond=*/error_ && op.doneAllocatingCpuStagingBuffer &&
          prevOpState >=
              ChunkRecvOperation::COPYING_FROM_CPU_TO_GPU_AND_INVOKED_CALLBACK,
      /*actions=*/
      {&ChannelImpl::callRecvCallback, &ChannelImpl::returnRecvCpuBuffer});

  // Needs to go after previous op to ensure predictable and consistent
  // ordering of recv calls on CPU channel.
  chunkRecvOps_.attemptTransition(
      opIter,
      /*from=*/ChunkRecvOperation::ALLOCATING_CPU_BUFFER,
      /*to=*/ChunkRecvOperation::RECEIVING_CPU_BUFFER,
      /*cond=*/!error_ && op.doneAllocatingCpuStagingBuffer &&
          prevOpState >= ChunkRecvOperation::RECEIVING_CPU_BUFFER,
      /*actions=*/{&ChannelImpl::receiveCpuBuffer});

  // See above for why this needs to go after previous op.
chunkRecvOps_.attemptTransition( opIter, /*from=*/ChunkRecvOperation::RECEIVING_CPU_BUFFER, /*to=*/ChunkRecvOperation::FINISHED, /*cond=*/error_ && op.doneReceivingCpuBuffer && !op.isCpuBuffer && prevOpState >= ChunkRecvOperation::COPYING_FROM_CPU_TO_GPU_AND_INVOKED_CALLBACK, /*actions=*/ {&ChannelImpl::callRecvCallback, &ChannelImpl::returnRecvCpuBuffer}); // This transition shortcuts the copy to GPU when receiving on CPU memory. chunkRecvOps_.attemptTransition( opIter, /*from=*/ChunkRecvOperation::RECEIVING_CPU_BUFFER, /*to=*/ChunkRecvOperation::FINISHED, /*cond=*/op.doneReceivingCpuBuffer && op.isCpuBuffer, /*actions=*/{&ChannelImpl::callRecvCallback}); chunkRecvOps_.attemptTransition( opIter, /*from=*/ChunkRecvOperation::RECEIVING_CPU_BUFFER, /*to=*/ChunkRecvOperation::COPYING_FROM_CPU_TO_GPU, /*cond=*/!error_ && op.doneReceivingCpuBuffer && !op.isCpuBuffer, /*actions=*/{&ChannelImpl::copyFromCpuToGpu}); // See above for why this needs to go after previous op. chunkRecvOps_.attemptTransition( opIter, /*from=*/ChunkRecvOperation::COPYING_FROM_CPU_TO_GPU, /*to=*/ChunkRecvOperation::COPYING_FROM_CPU_TO_GPU_AND_INVOKED_CALLBACK, /*cond=*/prevOpState >= ChunkRecvOperation::COPYING_FROM_CPU_TO_GPU_AND_INVOKED_CALLBACK, /*actions=*/{&ChannelImpl::callRecvCallback}); chunkRecvOps_.attemptTransition( opIter, /*from=*/ChunkRecvOperation::COPYING_FROM_CPU_TO_GPU_AND_INVOKED_CALLBACK, /*to=*/ChunkRecvOperation::FINISHED, /*cond=*/op.doneCopyingFromCpuToGpu, /*actions=*/{&ChannelImpl::returnRecvCpuBuffer}); } void ChannelImpl::readReadyToSend(ChunkRecvOpIter opIter) { ChunkRecvOperation& op = *opIter; TP_VLOG(6) << "Channel " << id_ << " is reading ready-to-send notification for chunk #" << op.chunkId << " of " << op.numChunks << " for buffer #" << op.bufferSequenceNumber; connection_->read(callbackWrapper_( [opIter]( ChannelImpl& impl, const void* /* unused */, size_t /* unused */) { TP_VLOG(6) << "Channel " << impl.id_ << " is done reading ready-to-send notification for 
chunk #" << opIter->chunkId << " of " << opIter->numChunks << " for buffer #" << opIter->bufferSequenceNumber; opIter->doneReadingReadyToSend = true; impl.chunkRecvOps_.advanceOperation(opIter); })); } void ChannelImpl::allocateRecvCpuBuffer(ChunkRecvOpIter opIter) { ChunkRecvOperation& op = *opIter; TP_VLOG(5) << "Channel " << id_ << " is allocating temporary memory for chunk #" << op.chunkId << " of " << op.numChunks << " for buffer #" << op.bufferSequenceNumber; Allocator& cudaHostAllocator = context_->getCudaHostRecvAllocator(op.deviceIdx); cudaHostAllocator.alloc( op.length, callbackWrapper_( [opIter]( ChannelImpl& impl, std::shared_ptr tmpBuffer) mutable { TP_VLOG(5) << "Channel " << impl.id_ << " is done allocating temporary memory for chunk #" << opIter->chunkId << " of " << opIter->numChunks << " for buffer #" << opIter->bufferSequenceNumber; opIter->doneAllocatingCpuStagingBuffer = true; if (!impl.error_) { opIter->tmpBuffer = std::move(tmpBuffer); } impl.chunkRecvOps_.advanceOperation(opIter); })); } void ChannelImpl::receiveCpuBuffer(ChunkRecvOpIter opIter) { ChunkRecvOperation& op = *opIter; TP_VLOG(6) << "Channel " << id_ << " is sending chunk #" << op.chunkId << " of " << op.numChunks << " for buffer #" << op.bufferSequenceNumber << " through CPU channel"; cpuChannel_->recv( CpuBuffer{.ptr = op.isCpuBuffer ? 
op.devicePtr : op.tmpBuffer.get()}, op.length, callbackWrapper_([opIter](ChannelImpl& impl) { TP_VLOG(6) << "Channel " << impl.id_ << " is done sending chunk #" << opIter->chunkId << " of " << opIter->numChunks << " for buffer #" << opIter->bufferSequenceNumber << " through CPU channel"; opIter->doneReceivingCpuBuffer = true; impl.chunkRecvOps_.advanceOperation(opIter); })); } void ChannelImpl::copyFromCpuToGpu(ChunkRecvOpIter opIter) { ChunkRecvOperation& op = *opIter; TP_VLOG(5) << "Channel " << id_ << " is copying chunk #" << op.chunkId << " of " << op.numChunks << " for buffer #" << op.bufferSequenceNumber << " from CPU to CUDA device"; cudaCopy( op.devicePtr, op.tmpBuffer.get(), op.length, op.deviceIdx, op.stream, callbackWrapper_([opIter](ChannelImpl& impl) { TP_VLOG(5) << "Channel " << impl.id_ << " is done copying chunk #" << opIter->chunkId << " of " << opIter->numChunks << " for buffer #" << opIter->bufferSequenceNumber << " from CPU to CUDA device"; opIter->doneCopyingFromCpuToGpu = true; impl.chunkRecvOps_.advanceOperation(opIter); })); } void ChannelImpl::callRecvCallback(ChunkRecvOpIter opIter) { ChunkRecvOperation& op = *opIter; if (op.callback) { op.callback(error_); // Reset callback to release the resources it was holding. op.callback = nullptr; } } void ChannelImpl::returnRecvCpuBuffer(ChunkRecvOpIter opIter) { ChunkRecvOperation& op = *opIter; // The pointer's deleter will return the buffer to the allocator. op.tmpBuffer = nullptr; } void ChannelImpl::setIdImpl() { cpuChannel_->setId(id_ + ".cpu"); } void ChannelImpl::handleErrorImpl() { chunkSendOps_.advanceAllOperations(); chunkRecvOps_.advanceAllOperations(); connection_->close(); cpuChannel_->close(); context_->unenroll(*this); } } // namespace cuda_basic } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cuda_basic/channel_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. 
and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { namespace cuda_basic { class ContextImpl; struct ChunkSendOperation { enum State { UNINITIALIZED, ALLOCATING_CPU_BUFFER, COPYING_FROM_GPU_TO_CPU, INVOKED_CALLBACK, SENDING_CPU_BUFFER, FINISHED }; // Fields used by the state machine uint64_t sequenceNumber{0}; State state{UNINITIALIZED}; // Arguments at creation uint64_t bufferSequenceNumber{0}; bool isCpuBuffer{false}; void* devicePtr{nullptr}; size_t chunkId{0}; size_t numChunks{0}; size_t length{0}; std::function callback; // For CUDA buffers cudaStream_t stream{cudaStreamDefault}; int deviceIdx{0}; // Data collected during processing std::shared_ptr tmpBuffer; // Progress flags bool doneAllocatingCpuStagingBuffer{false}; bool doneCopyingFromGpuToCpu{false}; bool doneSendingCpuBuffer{false}; }; struct ChunkRecvOperation { enum State { UNINITIALIZED, READING_READY_TO_SEND, ALLOCATING_CPU_BUFFER, RECEIVING_CPU_BUFFER, COPYING_FROM_CPU_TO_GPU, COPYING_FROM_CPU_TO_GPU_AND_INVOKED_CALLBACK, FINISHED }; // Fields used by the state machine uint64_t sequenceNumber{0}; State state{UNINITIALIZED}; // Arguments at creation uint64_t bufferSequenceNumber{0}; bool isCpuBuffer{false}; void* devicePtr{nullptr}; size_t chunkId{0}; size_t numChunks{0}; size_t length{0}; std::function callback; // For CUDA buffers cudaStream_t stream{cudaStreamDefault}; int deviceIdx{0}; // Data collected during processing std::shared_ptr tmpBuffer; // Progress flags bool doneReadingReadyToSend{false}; bool doneAllocatingCpuStagingBuffer{false}; bool doneReceivingCpuBuffer{false}; bool doneCopyingFromCpuToGpu{false}; }; class ChannelImpl final : public ChannelImplBoilerplate { public: ChannelImpl( ConstructorToken token, std::shared_ptr 
context, std::string id, std::shared_ptr connection, std::shared_ptr cpuChannel, CudaLoop& cudaLoop); protected: // Implement the entry points called by ChannelImplBoilerplate. void initImplFromLoop() override; void sendImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TSendCallback callback) override; void recvImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TRecvCallback callback) override; void handleErrorImpl() override; void setIdImpl() override; private: const std::shared_ptr connection_; const std::shared_ptr cpuChannel_; CudaLoop& cudaLoop_; // A sequence number for the chunks. uint64_t nextChunkBeingSent_{0}; uint64_t nextChunkBeingReceived_{0}; OpsStateMachine chunkSendOps_{ *this, &ChannelImpl::advanceChunkSendOperation}; using ChunkSendOpIter = decltype(chunkSendOps_)::Iter; OpsStateMachine chunkRecvOps_{ *this, &ChannelImpl::advanceChunkRecvOperation}; using ChunkRecvOpIter = decltype(chunkRecvOps_)::Iter; // State machines for send and recv ops. void advanceChunkSendOperation( ChunkSendOpIter opIter, ChunkSendOperation::State prevOpState); void advanceChunkRecvOperation( ChunkRecvOpIter opIter, ChunkRecvOperation::State prevOpState); // Actions (i.e., methods that begin a state transition). 
// For send operations: void allocateSendCpuBuffer(ChunkSendOpIter opIter); void copyFromGpuToCpu(ChunkSendOpIter opIter); void callSendCallback(ChunkSendOpIter opIter); void sendCpuBuffer(ChunkSendOpIter opIter); void writeReadyToSend(ChunkSendOpIter opIter); void returnSendCpuBuffer(ChunkSendOpIter opIter); // For recv operations: void readReadyToSend(ChunkRecvOpIter opIter); void allocateRecvCpuBuffer(ChunkRecvOpIter opIter); void receiveCpuBuffer(ChunkRecvOpIter opIter); void copyFromCpuToGpu(ChunkRecvOpIter opIter); void callRecvCallback(ChunkRecvOpIter opIter); void returnRecvCpuBuffer(ChunkRecvOpIter opIter); void cudaCopy( void* dst, const void* src, size_t length, int deviceIdx, cudaStream_t stream, std::function callback); }; } // namespace cuda_basic } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cuda_basic/constants.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include namespace tensorpipe { namespace channel { namespace cuda_basic { // FIXME Avoid this anonymous namespace and use inline variables in C++-17. namespace { // Define all three (redundant) values to make them explicit and avoid // misunderstandings due to miscalculations. static constexpr size_t kStagingAreaSize = 16 * 1024 * 1024; static constexpr size_t kSlotSize = 1024 * 1024; static constexpr size_t kNumSlots = 16; static_assert(kStagingAreaSize == kSlotSize * kNumSlots, ""); } // namespace } // namespace cuda_basic } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cuda_basic/context_impl.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. 
* All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { namespace cuda_basic { namespace { struct DeviceDescriptor { std::string deviceType; std::string descriptor; NOP_STRUCTURE(DeviceDescriptor, deviceType, descriptor); }; DeviceDescriptor deserializeDeviceDescriptor( const std::string& deviceDescriptor) { NopHolder nopHolder; loadDescriptor(nopHolder, deviceDescriptor); return std::move(nopHolder.getObject()); } } // namespace std::shared_ptr ContextImpl::create( std::shared_ptr cpuContext) { Error error; CudaLib cudaLib; std::tie(error, cudaLib) = CudaLib::create(); if (error) { TP_VLOG(5) << "CUDA basic channel is not viable because libcuda could not be loaded: " << error.what(); return nullptr; } if (cpuContext->deviceDescriptors().count(Device{kCpuDeviceType, 0}) == 0) { TP_THROW_ASSERT() << "CUDA basic channel needs a CPU channel"; return nullptr; } if (!cpuContext->isViable()) { return nullptr; } std::unordered_map deviceDescriptors; // NOTE: Assume there is only one CPU. 
TP_DCHECK_EQ( cpuContext->deviceDescriptors().count(Device{kCpuDeviceType, 0}), 1); const auto cpuDeviceDescriptor = cpuContext->deviceDescriptors().begin()->second; NopHolder nopHolder; DeviceDescriptor& deviceDescriptor = nopHolder.getObject(); deviceDescriptor.descriptor = cpuDeviceDescriptor; deviceDescriptor.deviceType = kCpuDeviceType; deviceDescriptors[Device{kCpuDeviceType, 0}] = saveDescriptor(nopHolder); for (const auto& device : getCudaDevices(cudaLib)) { deviceDescriptor.deviceType = kCudaDeviceType; deviceDescriptors[device] = saveDescriptor(nopHolder); } return std::make_shared( std::move(cudaLib), std::move(cpuContext), std::move(deviceDescriptors)); } ContextImpl::ContextImpl( CudaLib cudaLib, std::shared_ptr cpuContext, std::unordered_map deviceDescriptors) : ContextImplBoilerplate( std::move(deviceDescriptors)), cudaLib_(std::move(cudaLib)), cpuContext_(std::move(cpuContext)) {} std::shared_ptr ContextImpl::createChannel( std::vector> connections, Endpoint endpoint) { TP_DCHECK_EQ(numConnectionsNeeded(), connections.size()); auto conn = std::move(connections.back()); connections.pop_back(); auto cpuChannel = cpuContext_->createChannel(std::move(connections), endpoint); return createChannelInternal( std::move(conn), std::move(cpuChannel), cudaLoop_); } size_t ContextImpl::numConnectionsNeeded() const { return 1 + cpuContext_->numConnectionsNeeded(); } bool ContextImpl::canCommunicateWithRemote( const std::string& localDeviceDescriptor, const std::string& remoteDeviceDescriptor) const { DeviceDescriptor nopLocalDeviceDescriptor = deserializeDeviceDescriptor(localDeviceDescriptor); DeviceDescriptor nopRemoteDeviceDescriptor = deserializeDeviceDescriptor(remoteDeviceDescriptor); // Prevent CudaBasic from being mistakenly used for CPU to CPU transfers, as // there are always better options. 
  // Tail of canCommunicateWithRemote: refuse CPU-to-CPU pairs (there are
  // always better channels for those), otherwise require matching
  // descriptors.
  if (nopLocalDeviceDescriptor.deviceType == kCpuDeviceType &&
      nopRemoteDeviceDescriptor.deviceType == kCpuDeviceType) {
    return false;
  }
  return nopLocalDeviceDescriptor.descriptor ==
      nopRemoteDeviceDescriptor.descriptor;
}

const CudaLib& ContextImpl::getCudaLib() {
  return cudaLib_;
}

// Lazily creates the pinned-host staging allocator used by the send path.
// NOTE(review): deviceIdx is only honored on the very first call — once the
// allocator exists, subsequent calls with a *different* deviceIdx silently
// get the allocator pinned for the first device. Confirm this is intended
// (or that all callers pass the same index).
Allocator& ContextImpl::getCudaHostSendAllocator(int deviceIdx) {
  if (!cudaHostSendAllocator_.has_value()) {
    CudaPinnedBuffer buffer = makeCudaPinnedBuffer(kStagingAreaSize, deviceIdx);
    uint8_t* ptr = buffer.get();
    cudaHostSendAllocator_.emplace(CudaHostAllocator{
        std::move(buffer), Allocator(ptr, kNumSlots, kSlotSize)});
  }
  return cudaHostSendAllocator_->allocator;
}

// Lazily creates the pinned-host staging allocator used by the recv path.
// NOTE(review): same deviceIdx caveat as getCudaHostSendAllocator above.
Allocator& ContextImpl::getCudaHostRecvAllocator(int deviceIdx) {
  if (!cudaHostRecvAllocator_.has_value()) {
    CudaPinnedBuffer buffer = makeCudaPinnedBuffer(kStagingAreaSize, deviceIdx);
    uint8_t* ptr = buffer.get();
    cudaHostRecvAllocator_.emplace(CudaHostAllocator{
        std::move(buffer), Allocator(ptr, kNumSlots, kSlotSize)});
  }
  return cudaHostRecvAllocator_->allocator;
}

// Closes the wrapped CPU context, the CUDA loop, and (if created) both
// staging allocators.
void ContextImpl::handleErrorImpl() {
  if (cpuContext_ != nullptr) {
    cpuContext_->close();
  }
  cudaLoop_.close();
  if (cudaHostSendAllocator_.has_value()) {
    cudaHostSendAllocator_->allocator.close();
  }
  if (cudaHostRecvAllocator_.has_value()) {
    cudaHostRecvAllocator_->allocator.close();
  }
}

void ContextImpl::joinImpl() {
  if (cpuContext_ != nullptr) {
    cpuContext_->join();
  }
  cudaLoop_.join();
}

// DeferredExecutor interface: delegate to the on-demand loop.
bool ContextImpl::inLoop() const {
  return loop_.inLoop();
};

void ContextImpl::deferToLoop(std::function fn) {
  loop_.deferToLoop(std::move(fn));
};

void ContextImpl::setIdImpl() {
  cpuContext_->setId(id_ + ".cpu");
}

} // namespace cuda_basic
} // namespace channel
} // namespace tensorpipe

================================================
FILE: tensorpipe/channel/cuda_basic/context_impl.h
================================================
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
* * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { namespace cuda_basic { class ChannelImpl; class ContextImpl final : public ContextImplBoilerplate { public: static std::shared_ptr create( std::shared_ptr cpuContext); ContextImpl( CudaLib cudaLib, std::shared_ptr cpuContext, std::unordered_map deviceDescriptors); std::shared_ptr createChannel( std::vector> connections, Endpoint endpoint); size_t numConnectionsNeeded() const override; bool canCommunicateWithRemote( const std::string& localDeviceDescriptor, const std::string& remoteDeviceDescriptor) const override; const CudaLib& getCudaLib(); Allocator& getCudaHostSendAllocator(int deviceIdx); Allocator& getCudaHostRecvAllocator(int deviceIdx); // Implement the DeferredExecutor interface. bool inLoop() const override; void deferToLoop(std::function fn) override; protected: // Implement the entry points called by ContextImplBoilerplate. void handleErrorImpl() override; void joinImpl() override; void setIdImpl() override; private: OnDemandDeferredExecutor loop_; const CudaLib cudaLib_; const std::shared_ptr cpuContext_; // TODO: Lazy initialization of cuda loop. CudaLoop cudaLoop_; struct CudaHostAllocator { CudaPinnedBuffer buffer; Allocator allocator; }; optional cudaHostSendAllocator_; optional cudaHostRecvAllocator_; }; } // namespace cuda_basic } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cuda_basic/factory.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #include namespace tensorpipe { namespace channel { namespace cuda_basic { std::shared_ptr create(std::shared_ptr cpuContext) { return std::make_shared>( std::move(cpuContext)); } } // namespace cuda_basic } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cuda_basic/factory.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include namespace tensorpipe { namespace channel { namespace cuda_basic { std::shared_ptr create(std::shared_ptr cpuContext); } // namespace cuda_basic } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cuda_gdr/channel_impl.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { namespace cuda_gdr { namespace { size_t ceilOfRatio(size_t n, size_t d) { return (n + d - 1) / d; } } // namespace ChannelImpl::ChannelImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::shared_ptr descriptorConnection, std::shared_ptr readyToReceiveConnection) : ChannelImplBoilerplate( token, std::move(context), std::move(id)), descriptorConnection_(std::move(descriptorConnection)), readyToReceiveConnection_(std::move(readyToReceiveConnection)) {} void ChannelImpl::initImplFromLoop() { TP_DCHECK(context_->inLoop()); TP_DCHECK_EQ(state_, INITIALIZING); TP_DCHECK(!error_); context_->enroll(*this); localGpuToNic_ = context_->getGpuToNicMapping(); numLocalNics_ = *std::max_element(localGpuToNic_.begin(), localGpuToNic_.end()) + 1; auto nopHolderOut = std::make_shared>(); HandshakeNumNics& nopHandshakeNumNics = nopHolderOut->getObject(); nopHandshakeNumNics.numNics = numLocalNics_; TP_VLOG(6) << "Channel " << id_ << " is writing nop object (handshake num NICs)"; readyToReceiveConnection_->write( *nopHolderOut, callbackWrapper_([nopHolderOut](ChannelImpl& impl) { TP_VLOG(6) << "Channel " << impl.id_ << " done writing nop object (handshake num NICs)"; })); auto nopHolderIn = std::make_shared>(); TP_VLOG(6) << "Channel " << id_ << " is reading nop object (handshake num NICs)"; readyToReceiveConnection_->read( *nopHolderIn, callbackWrapper_([nopHolderIn](ChannelImpl& impl) { TP_VLOG(6) << "Channel " << impl.id_ << " done reading nop object (handshake num NICs)"; if (!impl.error_) { impl.onReadHandshakeNumNics(nopHolderIn->getObject()); } })); state_ = WAITING_FOR_HANDSHAKE_NUM_NICS; } void ChannelImpl::onReadHandshakeNumNics( const HandshakeNumNics& nopHandshakeNumNics) { TP_DCHECK(context_->inLoop()); TP_DCHECK_EQ(state_, WAITING_FOR_HANDSHAKE_NUM_NICS); TP_DCHECK(!error_); numRemoteNics_ = 
// NOTE(review): this chunk was re-formatted from a mechanically extracted
// dump; several template argument lists (e.g. "std::vector>",
// "std::make_shared>", "buffer.unwrap()") were stripped by the extraction
// and must be restored before this compiles. No non-comment tokens were
// altered here.

// (Tail of onReadHandshakeNumNics, whose head lies above this chunk: the
// remote NIC count advertised in handshake round one is being read off.)
nopHandshakeNumNics.numNics;

// Setup info for every local queue pair, indexed first by local NIC and then
// by remote NIC, to be shipped to the remote side in handshake round two.
std::vector> allSetupInfo;

// One queue pair for each (local NIC, remote NIC) combination.
queuePairs_.resize(numLocalNics_);
allSetupInfo.resize(numLocalNics_);
for (size_t localNicIdx = 0; localNicIdx < numLocalNics_; localNicIdx++) {
  queuePairs_[localNicIdx].resize(numRemoteNics_);
  allSetupInfo[localNicIdx].resize(numRemoteNics_);
  IbvNic& localNic = context_->getIbvNic(localNicIdx);
  for (size_t remoteNicIdx = 0; remoteNicIdx < numRemoteNics_;
       remoteNicIdx++) {
    // Reliable-connected QP sharing the NIC's completion queue for both
    // directions; capacities match the channel-wide send/recv slot counts.
    IbvLib::qp_init_attr initAttr;
    std::memset(&initAttr, 0, sizeof(initAttr));
    initAttr.qp_type = IbvLib::QPT_RC;
    initAttr.send_cq = localNic.getIbvCq().get();
    initAttr.recv_cq = localNic.getIbvCq().get();
    initAttr.cap.max_send_wr = kNumSends;
    initAttr.cap.max_send_sge = 1;
    initAttr.cap.max_recv_wr = kNumRecvs;
    initAttr.cap.max_recv_sge = 1;
    // Request a completion for every posted send.
    initAttr.sq_sig_all = 1;
    IbvQueuePair qp = createIbvQueuePair(
        context_->getIbvLib(), localNic.getIbvPd(), initAttr);

    transitionIbvQueuePairToInit(
        context_->getIbvLib(), qp, localNic.getIbvAddress());

    IbvSetupInformation setupInfo =
        makeIbvSetupInformation(localNic.getIbvAddress(), qp);

    // The maximum message size will be filled in later.
    queuePairs_[localNicIdx][remoteNicIdx] =
        QueuePair{std::move(qp), /*maximumMessageSize=*/0};
    allSetupInfo[localNicIdx][remoteNicIdx].fromIbvSetupInformation(
        setupInfo);
  }
}

// Handshake round two: exchange per-queue-pair setup info with the remote.
auto nopHolderOut = std::make_shared>();
HandshakeSetupInfo& nopHandshakeSetupInfo = nopHolderOut->getObject();
nopHandshakeSetupInfo.setupInfo = std::move(allSetupInfo);
TP_VLOG(6) << "Channel " << id_ << " is writing nop object (handshake two)";
readyToReceiveConnection_->write(
    *nopHolderOut, callbackWrapper_([nopHolderOut](ChannelImpl& impl) {
      TP_VLOG(6) << "Channel " << impl.id_
                 << " done writing nop object (handshake two)";
    }));

auto nopHolderIn = std::make_shared>();
TP_VLOG(6) << "Channel " << id_ << " is reading nop object (handshake two)";
readyToReceiveConnection_->read(
    *nopHolderIn, callbackWrapper_([nopHolderIn](ChannelImpl& impl) {
      TP_VLOG(6) << "Channel " << impl.id_
                 << " done reading nop object (handshake two)";
      if (!impl.error_) {
        impl.onReadHandshakeSetupInfo(nopHolderIn->getObject());
      }
    }));

state_ = WAITING_FOR_HANDSHAKE_SETUP_INFO;
}

// Handshake round two reply: bring every queue pair to a fully connected
// (ready-to-send) state using the remote's setup info, then unblock any
// send/recv operations that were queued while the handshake was in flight.
void ChannelImpl::onReadHandshakeSetupInfo(
    const HandshakeSetupInfo& nopHandshakeSetupInfo) {
  TP_DCHECK(context_->inLoop());
  TP_DCHECK_EQ(state_, WAITING_FOR_HANDSHAKE_SETUP_INFO);
  TP_DCHECK(!error_);

  // Note the transposed indexing below: the remote's outer index is *its*
  // local NIC, which from our perspective is the remote NIC.
  const std::vector>& remoteSetupInfo = nopHandshakeSetupInfo.setupInfo;

  TP_DCHECK_EQ(remoteSetupInfo.size(), numRemoteNics_);
  for (size_t remoteNicIdx = 0; remoteNicIdx < numRemoteNics_;
       remoteNicIdx++) {
    TP_DCHECK_EQ(remoteSetupInfo[remoteNicIdx].size(), numLocalNics_);
    for (size_t localNicIdx = 0; localNicIdx < numLocalNics_; localNicIdx++) {
      IbvNic& localNic = context_->getIbvNic(localNicIdx);
      IbvSetupInformation setupInfo =
          remoteSetupInfo[remoteNicIdx][localNicIdx].toIbvSetupInformation();

      const IbvAddress& localAddress = localNic.getIbvAddress();
      transitionIbvQueuePairToReadyToReceive(
          context_->getIbvLib(),
          queuePairs_[localNicIdx][remoteNicIdx].queuePair,
          localAddress,
          setupInfo);
      transitionIbvQueuePairToReadyToSend(
          context_->getIbvLib(),
          queuePairs_[localNicIdx][remoteNicIdx].queuePair);

      // Chunking limit for this QP: the stricter of the two sides' limits.
      queuePairs_[localNicIdx][remoteNicIdx].maximumMessageSize = std::min(
          localAddress.maximumMessageSize, setupInfo.maximumMessageSize);
    }
  }

  state_ = ESTABLISHED;
  sendOps_.advanceAllOperations();
  recvOps_.advanceAllOperations();
}

// Entry point for a send: resolve which GPU the buffer lives on and which NIC
// serves that GPU, enqueue the operation, and record a CUDA event on the
// producer stream so the transfer only starts once the data is ready.
void ChannelImpl::sendImplFromLoop(
    uint64_t sequenceNumber,
    Buffer buffer,
    size_t length,
    TSendCallback callback) {
  size_t localGpuIdx = cudaDeviceForPointer(
      context_->getCudaLib(), buffer.unwrap().ptr);
  size_t localNicIdx = context_->getGpuToNicMapping()[localGpuIdx];

  SendOpIter opIter = sendOps_.emplaceBack(
      sequenceNumber,
      buffer.unwrap(),
      length,
      std::move(callback),
      localGpuIdx,
      localNicIdx);
  opIter->event.record(buffer.unwrap().stream);

  sendOps_.advanceOperation(opIter);
}

// State machine driver for one send operation. Each attemptTransition either
// aborts the op early on error or moves it one step forward; the prevOpState
// guards keep operations strictly ordered where needed.
void ChannelImpl::advanceSendOperation(
    SendOpIter opIter,
    SendOperation::State prevOpState) {
  TP_DCHECK(context_->inLoop());
  SendOperation& op = *opIter;

  // Zero-length sends (or sends arriving after an error) complete at once.
  sendOps_.attemptTransition(
      opIter,
      /*from=*/SendOperation::UNINITIALIZED,
      /*to=*/SendOperation::FINISHED,
      /*cond=*/error_ || op.length == 0,
      /*actions=*/{&ChannelImpl::callSendCallback});

  // Needs to go after previous op to ensure predictable and consistent ordering
  // of write calls on the descriptor control connection and read calls on the
  // completion control connection.
// (Continuation of ChannelImpl::advanceSendOperation.)
  sendOps_.attemptTransition(
      opIter,
      /*from=*/SendOperation::UNINITIALIZED,
      /*to=*/SendOperation::READING_READY_TO_RECEIVE,
      /*cond=*/!error_ && state_ == ESTABLISHED &&
          prevOpState >= SendOperation::READING_READY_TO_RECEIVE,
      /*actions=*/
      {&ChannelImpl::writeDescriptor, &ChannelImpl::readReadyToReceive});

  sendOps_.attemptTransition(
      opIter,
      /*from=*/SendOperation::READING_READY_TO_RECEIVE,
      /*to=*/SendOperation::FINISHED,
      /*cond=*/error_ && op.doneReadingReadyToReceive,
      /*actions=*/{&ChannelImpl::callSendCallback});

  // This doesn't strictly need to go after the previous op, but it doesn't make
  // sense to busy poll multiple events if only one of them is actually able to
  // then make progress.
  sendOps_.attemptTransition(
      opIter,
      /*from=*/SendOperation::READING_READY_TO_RECEIVE,
      /*to=*/SendOperation::WAITING_FOR_CUDA_EVENT,
      /*cond=*/!error_ && op.doneReadingReadyToReceive &&
          prevOpState >= SendOperation::SENDING_OVER_IB,
      /*actions=*/{&ChannelImpl::waitForSendCudaEvent});

  sendOps_.attemptTransition(
      opIter,
      /*from=*/SendOperation::WAITING_FOR_CUDA_EVENT,
      /*to=*/SendOperation::FINISHED,
      /*cond=*/error_ && op.doneWaitingForCudaEvent,
      /*actions=*/{&ChannelImpl::callSendCallback});

  // Needs to go after previous op to ensure predictable and consistent ordering
  // of send calls on InfiniBand queue pair.
  sendOps_.attemptTransition(
      opIter,
      /*from=*/SendOperation::WAITING_FOR_CUDA_EVENT,
      /*to=*/SendOperation::SENDING_OVER_IB,
      /*cond=*/!error_ && op.doneWaitingForCudaEvent &&
          prevOpState >= SendOperation::SENDING_OVER_IB,
      /*actions=*/{&ChannelImpl::sendOverIb});

  sendOps_.attemptTransition(
      opIter,
      /*from=*/SendOperation::SENDING_OVER_IB,
      /*to=*/SendOperation::FINISHED,
      /*cond=*/op.numChunksBeingSent == 0,
      /*actions=*/{&ChannelImpl::callSendCallback});
}

// Tell the receiver which local NIC this send originates from, so it can pick
// the matching queue pair on its side.
// NOTE(review): "std::make_shared>" lost its template arguments in
// extraction (a NopHolder of the Descriptor nop struct).
void ChannelImpl::writeDescriptor(SendOpIter opIter) {
  TP_DCHECK(context_->inLoop());
  SendOperation& op = *opIter;

  auto nopHolder = std::make_shared>();
  Descriptor& nopDescriptor = nopHolder->getObject();
  nopDescriptor.originNicIdx = op.localNicIdx;

  TP_VLOG(6) << "Channel " << id_ << " is writing descriptor (#"
             << op.sequenceNumber << ")";
  descriptorConnection_->write(
      *nopHolder,
      callbackWrapper_([sequenceNumber{op.sequenceNumber},
                        nopHolder](ChannelImpl& impl) {
        TP_VLOG(6) << "Channel " << impl.id_
                   << " done writing descriptor (# " << sequenceNumber << ")";
      }));
}

// Wait for the receiver to announce that it has posted its recv work requests;
// its reply also tells us which remote NIC (and hence which QP) to send on.
void ChannelImpl::readReadyToReceive(SendOpIter opIter) {
  TP_DCHECK(context_->inLoop());
  SendOperation& op = *opIter;

  auto nopHolderIn = std::make_shared>();
  TP_VLOG(6) << "Channel " << id_ << " is reading ready-to-receive (#"
             << op.sequenceNumber << ")";
  readyToReceiveConnection_->read(
      *nopHolderIn, callbackWrapper_([opIter, nopHolderIn](ChannelImpl& impl) {
        TP_VLOG(6) << "Channel " << impl.id_
                   << " done reading ready-to-receive (# "
                   << opIter->sequenceNumber << ")";
        opIter->doneReadingReadyToReceive = true;
        if (!impl.error_) {
          const auto& readyToReceive = nopHolderIn->getObject();
          opIter->remoteNicIdx = readyToReceive.destinationNicIdx;
        }
        impl.sendOps_.advanceOperation(opIter);
      }));
}

// Busy-poll the CUDA event recorded on the producer stream so the IB transfer
// only starts after the data to send has actually been produced.
void ChannelImpl::waitForSendCudaEvent(SendOpIter opIter) {
  TP_DCHECK(context_->inLoop());
  SendOperation& op = *opIter;

  TP_VLOG(6) << "Channel " << id_ << " is waiting for CUDA event to send (#"
             << op.sequenceNumber << ")";
  context_->waitForCudaEvent(
      op.event,
      callbackWrapper_([opIter](ChannelImpl& impl) {
        TP_VLOG(6) << "Channel " << impl.id_
                   << " done waiting for CUDA event to send (# "
                   << opIter->sequenceNumber << ")";
        opIter->doneWaitingForCudaEvent = true;
        impl.sendOps_.advanceOperation(opIter);
      }));
}

// Split the tensor into chunks no larger than this QP's maximum message size
// and post one IB send per chunk, registering the buffer on first use.
// NOTE(review): "reinterpret_cast(op.buffer.ptr)" lost its target type in
// extraction; it must cast to a byte pointer for the chunk arithmetic.
void ChannelImpl::sendOverIb(SendOpIter opIter) {
  TP_DCHECK(context_->inLoop());
  SendOperation& op = *opIter;

  IbvNic& localNic = context_->getIbvNic(op.localNicIdx);
  IbvQueuePair& qp = queuePairs_[op.localNicIdx][op.remoteNicIdx].queuePair;
  size_t chunkSize =
      queuePairs_[op.localNicIdx][op.remoteNicIdx].maximumMessageSize;

  // This could be VEEERY slow the first time we encounter the buffer, but the
  // result will be cached and subsequent calls will be much faster.
  IbvMemoryRegion& mr = localNic.registerMemory(op.buffer);

  size_t numChunks = ceilOfRatio(op.length, chunkSize);
  for (size_t chunkIdx = 0; chunkIdx < numChunks; chunkIdx++) {
    IbvNic::SendInfo info;
    info.addr = reinterpret_cast(op.buffer.ptr) + chunkIdx * chunkSize;
    // The last chunk may be shorter than chunkSize.
    info.length = std::min(op.length - chunkIdx * chunkSize, chunkSize);
    info.lkey = mr->lkey;

    TP_VLOG(6) << "Channel " << id_ << " is sending chunk #" << chunkIdx
               << " (out of " << numChunks << ") of tensor #"
               << op.sequenceNumber << " on QP " << qp->qp_num;
    localNic.postSend(
        qp, info, callbackWrapper_([opIter, chunkIdx](ChannelImpl& impl) {
          TP_VLOG(6) << "Channel " << impl.id_ << " done sending chunk #"
                     << chunkIdx << " of tensor #" << opIter->sequenceNumber;
          opIter->numChunksBeingSent--;
          impl.sendOps_.advanceOperation(opIter);
          // Cleanup is deferred until all in-flight IB work has completed.
          impl.numSendsInFlight_--;
          impl.tryCleanup();
        }));
    op.numChunksBeingSent++;
    numSendsInFlight_++;
  }
}

// Deliver the final outcome of a send to the user.
void ChannelImpl::callSendCallback(SendOpIter opIter) {
  TP_DCHECK(context_->inLoop());
  SendOperation& op = *opIter;

  op.callback(error_);
  // Reset callback to release the resources it was holding.
// (Tail of ChannelImpl::callSendCallback: dropping the user callback releases
// any resources it captured.)
op.callback = nullptr;
}

// Entry point for a receive: resolve which GPU the buffer lives on and which
// NIC serves that GPU, enqueue the operation, and record a CUDA event on the
// consumer stream so the buffer is only handed to the NIC once prior work on
// that stream has been issued.
void ChannelImpl::recvImplFromLoop(
    uint64_t sequenceNumber,
    Buffer buffer,
    size_t length,
    TRecvCallback callback) {
  size_t localGpuIdx = cudaDeviceForPointer(
      context_->getCudaLib(), buffer.unwrap().ptr);
  size_t localNicIdx = context_->getGpuToNicMapping()[localGpuIdx];

  RecvOpIter opIter = recvOps_.emplaceBack(
      sequenceNumber,
      buffer.unwrap(),
      length,
      std::move(callback),
      localGpuIdx,
      localNicIdx);
  opIter->event.record(buffer.unwrap().stream);

  recvOps_.advanceOperation(opIter);
}

// State machine driver for one receive operation (mirror of
// advanceSendOperation).
void ChannelImpl::advanceRecvOperation(
    RecvOpIter opIter,
    RecvOperation::State prevOpState) {
  TP_DCHECK(context_->inLoop());
  RecvOperation& op = *opIter;

  // Zero-length receives (or ones arriving after an error) complete at once.
  recvOps_.attemptTransition(
      opIter,
      /*from=*/RecvOperation::UNINITIALIZED,
      /*to=*/RecvOperation::FINISHED,
      /*cond=*/error_ || op.length == 0,
      /*actions=*/{&ChannelImpl::callRecvCallback});

  // Needs to go after previous op to ensure predictable and consistent ordering
  // of write calls on the descriptor control connection.
  recvOps_.attemptTransition(
      opIter,
      /*from=*/RecvOperation::UNINITIALIZED,
      /*to=*/RecvOperation::READING_DESCRIPTOR,
      /*cond=*/!error_ && state_ == ESTABLISHED &&
          prevOpState >= RecvOperation::READING_DESCRIPTOR,
      /*actions=*/{&ChannelImpl::readDescriptor});

  recvOps_.attemptTransition(
      opIter,
      /*from=*/RecvOperation::READING_DESCRIPTOR,
      /*to=*/RecvOperation::FINISHED,
      /*cond=*/error_ && op.doneReadingDescriptor,
      /*actions=*/{&ChannelImpl::callRecvCallback});

  // This doesn't strictly need to go after the previous op, but it doesn't make
  // sense to busy poll multiple events if only one of them is actually able to
  // then make progress.
  recvOps_.attemptTransition(
      opIter,
      /*from=*/RecvOperation::READING_DESCRIPTOR,
      /*to=*/RecvOperation::WAITING_FOR_CUDA_EVENT,
      /*cond=*/!error_ && op.doneReadingDescriptor &&
          prevOpState >= RecvOperation::RECEIVING_OVER_IB,
      /*actions=*/{&ChannelImpl::waitForRecvCudaEvent});

  recvOps_.attemptTransition(
      opIter,
      /*from=*/RecvOperation::WAITING_FOR_CUDA_EVENT,
      /*to=*/RecvOperation::FINISHED,
      /*cond=*/error_ && op.doneWaitingForCudaEvent,
      /*actions=*/{&ChannelImpl::callRecvCallback});

  // Needs to go after previous op to ensure predictable and consistent ordering
  // of recv calls on InfiniBand queue pair and write calls on the completion
  // control connection.
  recvOps_.attemptTransition(
      opIter,
      /*from=*/RecvOperation::WAITING_FOR_CUDA_EVENT,
      /*to=*/RecvOperation::RECEIVING_OVER_IB,
      /*cond=*/!error_ && op.doneWaitingForCudaEvent &&
          prevOpState >= RecvOperation::RECEIVING_OVER_IB,
      /*actions=*/{&ChannelImpl::recvOverIbAndWriteReadyToRecive});

  recvOps_.attemptTransition(
      opIter,
      /*from=*/RecvOperation::RECEIVING_OVER_IB,
      /*to=*/RecvOperation::FINISHED,
      /*cond=*/op.numChunksBeingReceived == 0,
      /*actions=*/{&ChannelImpl::callRecvCallback});
}

// Learn from the sender which of its NICs the data will come from, so we know
// which of our queue pairs to post the receives on.
void ChannelImpl::readDescriptor(RecvOpIter opIter) {
  TP_DCHECK(context_->inLoop());
  RecvOperation& op = *opIter;

  TP_VLOG(6) << "Channel " << id_ << " is reading descriptor (#"
             << op.sequenceNumber << ")";
  auto nopHolderIn = std::make_shared>();
  descriptorConnection_->read(
      *nopHolderIn, callbackWrapper_([opIter, nopHolderIn](ChannelImpl& impl) {
        TP_VLOG(6) << "Channel " << impl.id_ << " done reading descriptor (# "
                   << opIter->sequenceNumber << ")";
        opIter->doneReadingDescriptor = true;
        if (!impl.error_) {
          Descriptor& nopDescriptor = nopHolderIn->getObject();
          opIter->remoteNicIdx = nopDescriptor.originNicIdx;
        }
        impl.recvOps_.advanceOperation(opIter);
      }));
}

// Busy-poll the CUDA event recorded on the consumer stream before exposing the
// target buffer to the NIC.
// NOTE(review): the log string below was split across a line break by
// extraction; it should read "... is waiting for CUDA event to recv (#".
void ChannelImpl::waitForRecvCudaEvent(RecvOpIter opIter) {
  TP_DCHECK(context_->inLoop());
  RecvOperation& op = *opIter;

  TP_VLOG(6) << "Channel " << id_ << " is waiting for CUDA event 
to recv (#" << op.sequenceNumber << ")";
  context_->waitForCudaEvent(
      op.event,
      callbackWrapper_([opIter](ChannelImpl& impl) {
        TP_VLOG(6) << "Channel " << impl.id_
                   << " done waiting for CUDA event to recv (# "
                   << opIter->sequenceNumber << ")";
        opIter->doneWaitingForCudaEvent = true;
        impl.recvOps_.advanceOperation(opIter);
      }));
}

// Post one IB recv per chunk on the QP matching the sender's NIC, then tell
// the sender we are ready (and on which of our NICs we expect the data).
// (The identifier misspells "Receive"; kept, as callers use this exact name.)
void ChannelImpl::recvOverIbAndWriteReadyToRecive(RecvOpIter opIter) {
  TP_DCHECK(context_->inLoop());
  RecvOperation& op = *opIter;

  IbvNic& localNic = context_->getIbvNic(op.localNicIdx);
  IbvQueuePair& qp = queuePairs_[op.localNicIdx][op.remoteNicIdx].queuePair;
  size_t chunkSize =
      queuePairs_[op.localNicIdx][op.remoteNicIdx].maximumMessageSize;

  // This could be VEEERY slow the first time we encounter the buffer, but the
  // result will be cached and subsequent calls will be much faster.
  IbvMemoryRegion& mr = localNic.registerMemory(op.buffer);

  size_t numChunks = ceilOfRatio(op.length, chunkSize);
  for (size_t chunkIdx = 0; chunkIdx < numChunks; chunkIdx++) {
    IbvNic::RecvInfo info;
    info.addr = reinterpret_cast(op.buffer.ptr) + chunkIdx * chunkSize;
    // The last chunk may be shorter than chunkSize.
    info.length = std::min(op.length - chunkIdx * chunkSize, chunkSize);
    info.lkey = mr->lkey;

    TP_VLOG(6) << "Channel " << id_ << " is receiving chunk #" << chunkIdx
               << " (out of " << numChunks << ") of tensor #"
               << op.sequenceNumber << " on QP " << qp->qp_num;
    localNic.postRecv(
        qp, info, callbackWrapper_([opIter, chunkIdx](ChannelImpl& impl) {
          TP_VLOG(6) << "Channel " << impl.id_ << " done receiving chunk #"
                     << chunkIdx << " of tensor #" << opIter->sequenceNumber;
          opIter->numChunksBeingReceived--;
          impl.recvOps_.advanceOperation(opIter);
          // Cleanup is deferred until all in-flight IB work has completed.
          impl.numRecvsInFlight_--;
          impl.tryCleanup();
        }));
    op.numChunksBeingReceived++;
    numRecvsInFlight_++;
  }

  // Recvs are posted; let the sender know it may start transmitting.
  auto nopHolderOut = std::make_shared>();
  ReadyToReceive& nopReadyToReceive = nopHolderOut->getObject();
  nopReadyToReceive.destinationNicIdx = op.localNicIdx;
  TP_VLOG(6) << "Channel " << id_ << " is writing ready-to-receive (#"
             << op.sequenceNumber << ")";
  readyToReceiveConnection_->write(
      *nopHolderOut,
      callbackWrapper_([sequenceNumber{opIter->sequenceNumber},
                        nopHolderOut](ChannelImpl& impl) {
        TP_VLOG(6) << "Channel " << impl.id_
                   << " done writing ready-to-receive (#" << sequenceNumber
                   << ")";
      }));
}

// Deliver the final outcome of a receive to the user.
void ChannelImpl::callRecvCallback(RecvOpIter opIter) {
  TP_DCHECK(context_->inLoop());
  RecvOperation& op = *opIter;

  op.callback(error_);
  // Reset callback to release the resources it was holding.
  op.callback = nullptr;
}

// Tear down on error: flush both state machines, push every queue pair into
// the error state so outstanding work requests flush out, and close the two
// control connections.
void ChannelImpl::handleErrorImpl() {
  sendOps_.advanceAllOperations();
  recvOps_.advanceAllOperations();

  for (size_t localNicIdx = 0; localNicIdx < numLocalNics_; localNicIdx++) {
    for (size_t remoteNicIdx = 0; remoteNicIdx < numRemoteNics_;
         remoteNicIdx++) {
      transitionIbvQueuePairToError(
          context_->getIbvLib(),
          queuePairs_[localNicIdx][remoteNicIdx].queuePair);
    }
  }

  tryCleanup();

  descriptorConnection_->close();
  readyToReceiveConnection_->close();
}

// Run cleanup only once all in-flight IB sends/recvs have completed.
void ChannelImpl::tryCleanup() {
  TP_DCHECK(context_->inLoop());

  if (error_) {
    if (numSendsInFlight_ == 0 && numRecvsInFlight_ == 0) {
      cleanup();
    } else {
      TP_VLOG(9) << "Connection " << id_
                 << " cannot proceed to cleanup because it has "
                 << numSendsInFlight_ << " pending send requests and "
                 << numRecvsInFlight_ << " pending recv requests";
    }
  }
}

// Release the queue pairs and detach from the context.
void ChannelImpl::cleanup() {
  TP_DCHECK(context_->inLoop());
  TP_VLOG(8) << "Connection " << id_ << " is cleaning up";

  queuePairs_.clear();
  context_->unenroll(*this);
}

} // namespace cuda_gdr
} // namespace channel
} // namespace tensorpipe

================================================
FILE: tensorpipe/channel/cuda_gdr/channel_impl.h
================================================
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

// NOTE(review): the header names of these includes were lost in extraction;
// they must be restored (standard library, CUDA, and tensorpipe headers).
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

namespace tensorpipe {
namespace channel {
namespace cuda_gdr {

class ContextImpl;

// Ideally we would use NOP_EXTERNAL_STRUCTURE instead of defining the following
// two structs, but we tried so in D26460332 and failed because a bug in GCC 5.5
// (and probably other versions) requires every nop structure used inside a
// std::vector to have an explicit non-defaulted default constructor, which is
// something we cannot do with NOP_EXTERNAL_STRUCTURE and forces us to re-define
// separate structs.

// Replicate the IbvLib::gid struct so we can serialize it with libnop.
struct NopIbvGid {
  uint64_t subnetPrefix;
  uint64_t interfaceId;
  NOP_STRUCTURE(NopIbvGid, subnetPrefix, interfaceId);

  // Copy the two 64-bit halves out of the ibverbs GID union.
  void fromIbvGid(const IbvLib::gid& globalIdentifier) {
    subnetPrefix = globalIdentifier.global.subnet_prefix;
    interfaceId = globalIdentifier.global.interface_id;
  }

  IbvLib::gid toIbvGid() const {
    IbvLib::gid globalIdentifier;
    globalIdentifier.global.subnet_prefix = subnetPrefix;
    globalIdentifier.global.interface_id = interfaceId;
    return globalIdentifier;
  }
};

// Replicate the IbvSetupInformation struct so we can serialize it with libnop.
struct NopIbvSetupInformation {
  // This pointless constructor is needed to work around a bug in GCC 5.5 (and
  // possibly other versions). It appears to be needed in the nop types that
  // are used inside std::vectors.
  NopIbvSetupInformation() {}

  uint32_t localIdentifier;
  NopIbvGid globalIdentifier;
  uint32_t queuePairNumber;
  IbvLib::mtu maximumTransmissionUnit;
  uint32_t maximumMessageSize;
  NOP_STRUCTURE(
      NopIbvSetupInformation,
      localIdentifier,
      globalIdentifier,
      queuePairNumber,
      maximumTransmissionUnit,
      maximumMessageSize);

  void fromIbvSetupInformation(const IbvSetupInformation& setupInfo) {
    localIdentifier = setupInfo.localIdentifier;
    globalIdentifier.fromIbvGid(setupInfo.globalIdentifier);
    queuePairNumber = setupInfo.queuePairNumber;
    maximumTransmissionUnit = setupInfo.maximumTransmissionUnit;
    maximumMessageSize = setupInfo.maximumMessageSize;
  }

  IbvSetupInformation toIbvSetupInformation() const {
    IbvSetupInformation setupInfo;
    setupInfo.localIdentifier = localIdentifier;
    setupInfo.globalIdentifier = globalIdentifier.toIbvGid();
    setupInfo.queuePairNumber = queuePairNumber;
    setupInfo.maximumTransmissionUnit = maximumTransmissionUnit;
    setupInfo.maximumMessageSize = maximumMessageSize;
    return setupInfo;
  }
};

// Bookkeeping for one in-flight send, driven by ChannelImpl's state machine.
struct SendOperation {
  enum State {
    UNINITIALIZED,
    READING_READY_TO_RECEIVE,
    WAITING_FOR_CUDA_EVENT,
    SENDING_OVER_IB,
    FINISHED
  };

  // Fields used by the state machine
  uint64_t sequenceNumber{0};
  State state{UNINITIALIZED};

  // Progress flags
  bool doneReadingReadyToReceive{false};
  bool doneWaitingForCudaEvent{false};
  uint64_t numChunksBeingSent{0};

  // Arguments at creation
  const CudaBuffer buffer;
  const size_t length;
  const size_t localNicIdx;
  TSendCallback callback;

  // Other stuff
  CudaEvent event;
  size_t remoteNicIdx;

  SendOperation(
      CudaBuffer buffer,
      size_t length,
      TSendCallback callback,
      size_t localGpuIdx,
      size_t localNicIdx)
      : buffer(buffer),
        length(length),
        localNicIdx(localNicIdx),
        callback(std::move(callback)),
        event(localGpuIdx) {}
};

// Bookkeeping for one in-flight receive.
// NOTE(review): the callback member is declared as TSendCallback rather than
// TRecvCallback — presumably the two aliases share a signature; confirm
// against the channel API header.
struct RecvOperation {
  enum State {
    UNINITIALIZED,
    READING_DESCRIPTOR,
    WAITING_FOR_CUDA_EVENT,
    RECEIVING_OVER_IB,
    FINISHED
  };

  // Fields used by the state machine
  uint64_t sequenceNumber{0};
  State state{UNINITIALIZED};

  // Progress flags
  bool doneReadingDescriptor{false};
  bool doneWaitingForCudaEvent{false};
  uint64_t numChunksBeingReceived{0};

  // Arguments at creation
  const CudaBuffer buffer;
  const size_t length;
  const size_t localNicIdx;
  TSendCallback callback;

  // Other stuff
  size_t remoteNicIdx;
  CudaEvent event;

  RecvOperation(
      CudaBuffer buffer,
      size_t length,
      TSendCallback callback,
      size_t deviceIdx,
      size_t localNicIdx)
      : buffer(buffer),
        length(length),
        localNicIdx(localNicIdx),
        callback(std::move(callback)),
        event(deviceIdx) {}
};

// First "round" of handshake.
struct HandshakeNumNics {
  size_t numNics;
  NOP_STRUCTURE(HandshakeNumNics, numNics);
};

// Second "round" of handshake.
// NOTE(review): the nested vector's element type (NopIbvSetupInformation)
// was stripped in extraction.
struct HandshakeSetupInfo {
  std::vector> setupInfo;
  NOP_STRUCTURE(HandshakeSetupInfo, setupInfo);
};

// From sender to receiver (through pipe).
struct Descriptor {
  size_t originNicIdx;
  NOP_STRUCTURE(Descriptor, originNicIdx);
};

// From receiver to sender (through channel's connection).
struct ReadyToReceive {
  size_t destinationNicIdx;
  NOP_STRUCTURE(ReadyToReceive, destinationNicIdx);
};

// NOTE(review): the base class's template arguments were stripped in
// extraction (ChannelImplBoilerplate<...>).
class ChannelImpl final : public ChannelImplBoilerplate {
 public:
  ChannelImpl(
      ConstructorToken token,
      std::shared_ptr context,
      std::string id,
      std::shared_ptr descriptorConnection,
      std::shared_ptr readyToReceiveConnection);

 protected:
  // Implement the entry points called by ChannelImplBoilerplate.
// (Interior of class ChannelImpl, continued.)
  void initImplFromLoop() override;
  void sendImplFromLoop(
      uint64_t sequenceNumber,
      Buffer buffer,
      size_t length,
      TSendCallback callback) override;
  void recvImplFromLoop(
      uint64_t sequenceNumber,
      Buffer buffer,
      size_t length,
      TRecvCallback callback) override;
  void handleErrorImpl() override;

 private:
  // NOTE(review): shared_ptr element types were stripped in extraction
  // (presumably transport::Connection — confirm against the .cc file).
  const std::shared_ptr descriptorConnection_;
  const std::shared_ptr readyToReceiveConnection_;

  // Lifecycle of the two-round handshake that precedes any data transfer.
  enum State {
    INITIALIZING = 1,
    WAITING_FOR_HANDSHAKE_NUM_NICS,
    WAITING_FOR_HANDSHAKE_SETUP_INFO,
    ESTABLISHED,
  };
  State state_{INITIALIZING};

  std::vector localGpuToNic_;
  size_t numLocalNics_{0};
  size_t numRemoteNics_{0};

  // This struct is used to bundle the queue pair with some additional metadata.
  struct QueuePair {
    IbvQueuePair queuePair;
    // The CUDA GDR channel could be asked to transmit arbitrarily large tensors
    // and in principle it could directly forward them to the NIC as they are.
    // However IB NICs have limits on the size of each message. Hence we
    // determine these sizes, one per queue pair (as the minimum of the local
    // and remote sizes) and then split our tensors in chunks of that size.
    uint32_t maximumMessageSize;
  };
  // Indexed by [localNicIdx][remoteNicIdx].
  std::vector> queuePairs_;

  OpsStateMachine sendOps_{
      *this, &ChannelImpl::advanceSendOperation};
  using SendOpIter = decltype(sendOps_)::Iter;
  OpsStateMachine recvOps_{
      *this, &ChannelImpl::advanceRecvOperation};
  using RecvOpIter = decltype(recvOps_)::Iter;

  // Cleanup is deferred until both of these counters drop to zero.
  uint32_t numSendsInFlight_{0};
  uint32_t numRecvsInFlight_{0};

  // Callbacks for the initial handshake phase.
  void onReadHandshakeNumNics(const HandshakeNumNics& nopHandshakeNumNics);
  void onReadHandshakeSetupInfo(
      const HandshakeSetupInfo& nopHandshakeSetupInfo);

  // Cleanup methods for teardown.
  void tryCleanup();
  void cleanup();

  // State machines for send and recv ops.
  void advanceSendOperation(
      SendOpIter opIter,
      SendOperation::State prevOpState);
  void advanceRecvOperation(
      RecvOpIter opIter,
      RecvOperation::State prevOpState);

  // Actions (i.e., methods that begin a state transition).
  // For send operations:
  void writeDescriptor(SendOpIter opIter);
  void readReadyToReceive(SendOpIter opIter);
  void waitForSendCudaEvent(SendOpIter opIter);
  void sendOverIb(SendOpIter opIter);
  void callSendCallback(SendOpIter opIter);
  // For recv operations:
  void readDescriptor(RecvOpIter opIter);
  void waitForRecvCudaEvent(RecvOpIter opIter);
  void recvOverIbAndWriteReadyToRecive(RecvOpIter opIter);
  void callRecvCallback(RecvOpIter opIter);
};

} // namespace cuda_gdr
} // namespace channel
} // namespace tensorpipe

================================================
FILE: tensorpipe/channel/cuda_gdr/constants.h
================================================
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

// NOTE(review): the header name of this include was lost in extraction.
#include

namespace tensorpipe {
namespace channel {
namespace cuda_gdr {
namespace {

// We should probably allow these to be user-configured. But, for now, we'll set
// them to the lowest value they can have, the rationale being that this way
// they will always be valid.
constexpr uint8_t kPortNum = 1;
constexpr uint8_t kGlobalIdentifierIndex = 0;

// FIXME Instead of hardcoding the next three values, we could use
// ibv_query_device to obtain max_cqe, max_qp_wr and max_srq_wr and deduce from
// them the maximum allowed values for these parameters.

constexpr uint32_t kNumRecvs = 1024;
constexpr uint32_t kNumSends = 1024;

// How many elements the completion queue should be able to hold. These elements
// will be either the completed receive requests of the SRQ, or the completed
// send requests from a connection's queue pair. We can bound the former value
// but not the latter, so we try to add some margin.
constexpr int kCompletionQueueSize = kNumRecvs + kNumSends;

// How many work completions to poll from the completion queue at each reactor
// iteration.
constexpr int kNumPolledWorkCompletions = 32;

} // namespace
} // namespace cuda_gdr
} // namespace channel
} // namespace tensorpipe

================================================
FILE: tensorpipe/channel/cuda_gdr/context_impl.cc
================================================
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

// NOTE(review): the header names of these includes were lost in extraction.
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

namespace tensorpipe {
namespace channel {
namespace cuda_gdr {

namespace {

// NOTE: This is an incomplete implementation of C++17's `std::apply`.
// It's intended to only work for methods of IbvNic.
// NOTE(review): the template parameter lists below (method type, args tuple,
// index pack) were stripped in extraction and must be restored.
template
auto applyFuncImpl(
    IbvNic& subject,
    TMethod&& method,
    TArgsTuple&& args,
    std::index_sequence /* unused */) {
  return ((subject).*(method))(std::get(std::forward(args))...);
}

template
auto applyFunc(IbvNic& subject, TMethod&& method, TArgsTuple&& args) {
  return applyFuncImpl(
      subject,
      std::forward(method),
      std::forward(args),
      std::make_index_sequence<
          std::tuple_size>::value>{});
}

// We can only pass CUDA pointers to InfiniBand (for example when registering
// some memory) if InfiniBand "knows about" CUDA. Those pointers refer to the
// section of the process's virtual address space that is being used by CUDA to
// represent device memory (as part of CUDA's unified memory approach). Thus
// InfiniBand needs to talk to CUDA to translate those pointers to physical PCIe
// hardware addresses.
// This is achieved by CUDA providing a so-called "peer memory client" and
// registering it with the InfiniBand kernel module. The peer memory client is
// itself a kernel module, see https://github.com/Mellanox/nv_peer_memory.
// The "catch" is that the whole "peer memory client" system is not part of the
// official Linux InfiniBand. It's provided by a Mellanox extension, and it's
// part of their "OpenFabrics Enterprise Distribution" (MLNX_OFED), see
// https://www.mellanox.com/products/infiniband-drivers/linux/mlnx_ofed. (In
// particular, on Ubuntu, this seems to be provided by the mlnx-ofed-kernel-dkms
// package). Note that this difference between "vanilla" InfiniBand and OFED is
// only in kernel space; from our perspective the two have the same API. Also
// note that Mellanox has tried at least a couple of time to upstream this, but
// apparently without success:
// https://lore.kernel.org/linux-rdma/1412602019-30659-1-git-send-email-yishaih@mellanox.com/
// https://lore.kernel.org/linux-rdma/1455207177-11949-1-git-send-email-artemyko@mellanox.com/
// The check we use to verify if the peer memory client is active is the same as
// NCCL's one, see
// https://github.com/NVIDIA/nccl/blob/ca8485b0d01ca6dfa02f4454932011e68b461175/src/transport/net_ib.cc#L216-L230
// Whereas TensorFlow does it slightly differently, see
// https://github.com/tensorflow/networking/blob/671e2548b602f93a6c6502432b8bc131b5cc4914/tensorflow_networking/gdr/gdr_memory_manager.cc#L43-L60
static std::string kNvMemModulePath =
    "/sys/kernel/mm/memory_peers/nv_mem/version";
static std::string kNvidiaPeermemModulePath =
    "/sys/kernel/mm/memory_peers/nvidia-peermem/version";

// True if either of the two known peer-memory kernel modules exposes its
// version file in sysfs, i.e., is loaded.
bool isNvidiaPeerMemoryClientActive() {
  int rv1 = ::access(kNvMemModulePath.c_str(), F_OK);
  int rv2 = ::access(kNvidiaPeermemModulePath.c_str(), F_OK);
  return rv1 >= 0 || rv2 >= 0;
}

// The PCI topology is a tree, with the root being the host bridge, the leaves
// being the devices, and the other nodes being switches. We want to match each
// GPU to the InfiniBand NIC with which it shares the longest "prefix" in this
// tree, as that will route the data transfer away from the most "central"
// switches and from the host bridge. We extract the "path" of a device in the
// PCI tree by obtaining its "canonical" path in Linux's sysfs, which contains
// one component for each other device that is traversed. The format of such a
// path is /sys/devices/pci0123:45(/0123:45:67.8)+");
// See https://www.kernel.org/doc/ols/2005/ols2005v1-pages-321-334.pdf for more
// info on sysfs.
const std::string kPciPathPrefix = "/sys/devices/pci";

// Resolve an IB NIC's sysfs device symlink to its canonical PCI path.
// NOTE(review): std::array's template arguments (char + buffer size) were
// stripped in extraction here and below.
std::string getPciPathForIbvNic(const std::string& nicName) {
  std::array pciPath;
  char* rv = ::realpath(
      ("/sys/class/infiniband/" + nicName + "/device").c_str(),
      pciPath.data());
  TP_THROW_SYSTEM_IF(rv == nullptr, errno);
  TP_DCHECK(rv == pciPath.data());
  std::string res(pciPath.data());
  TP_DCHECK(res.substr(0, kPciPathPrefix.size()) == kPciPathPrefix)
      << "Bad PCI path for InfiniBand NIC " << nicName << ": " << res;
  return res;
}

// Resolve a GPU's PCI bus ID (obtained from CUDA) to its canonical sysfs PCI
// path.
std::string getPciPathForGpu(int gpuIdx) {
  // The CUDA documentation says the ID will consist of a domain (16 bits), a
  // bus (8 bits), a device (5 bits) and a function (3 bits). When represented
  // as hex, including the separators and the null terminator, this takes up 13
  // bytes. However NCCL seems to suggests that sometimes the domain takes twice
  // that size, and hence 17 bytes are necessary.
  // https://github.com/NVIDIA/nccl/blob/c6dbdb00849027b4e2c277653cbef53729f7213d/src/misc/utils.cc#L49-L53
  std::array pciDeviceId;
  TP_CUDA_CHECK(
      cudaDeviceGetPCIBusId(pciDeviceId.data(), pciDeviceId.size(), gpuIdx));
  // Fun fact: CUDA seems to format hex letters as uppercase, but Linux's sysfs
  // expects them as lowercase.
// (Interior of getPciPathForGpu: lowercase the hex digits, then resolve the
// bus ID through sysfs to the canonical PCI path.)
  for (char& c : pciDeviceId) {
    if ('A' <= c && c <= 'F') {
      c = c - 'A' + 'a';
    }
  }
  std::array pciPath;
  char* rv = ::realpath(
      ("/sys/bus/pci/devices/" + std::string(pciDeviceId.data())).c_str(),
      pciPath.data());
  TP_THROW_SYSTEM_IF(rv == nullptr, errno);
  TP_DCHECK(rv == pciPath.data());
  std::string res(pciPath.data());
  TP_DCHECK(res.substr(0, kPciPathPrefix.size()) == kPciPathPrefix)
      << "Bad PCI path for GPU #" << gpuIdx << ": " << res;
  return res;
}

// Length of the longest common prefix of two strings.
size_t commonPrefixLength(const std::string& a, const std::string& b) {
  // The length of the longest common prefix is the index of the first char on
  // which the two strings differ.
  size_t maxLength = std::min(a.size(), b.size());
  for (size_t idx = 0; idx < maxLength; idx++) {
    if (a[idx] != b[idx]) {
      return idx;
    }
  }
  return maxLength;
}

// For each GPU pick the IB NIC whose sysfs PCI path shares the longest common
// prefix with the GPU's, i.e., the topologically closest NIC.
// NOTE(review): the std::vector element types (std::string) were stripped in
// extraction.
std::vector matchGpusToIbvNics(
    IbvLib& ibvLib,
    IbvDeviceList& deviceList) {
  struct NicInfo {
    std::string name;
    std::string pciPath;
  };
  std::vector nicInfos;
  for (size_t deviceIdx = 0; deviceIdx < deviceList.size(); deviceIdx++) {
    IbvLib::device& device = deviceList[deviceIdx];
    std::string deviceName(TP_CHECK_IBV_PTR(ibvLib.get_device_name(&device)));
    std::string pciPath = getPciPathForIbvNic(deviceName);
    TP_VLOG(5) << "Resolved InfiniBand NIC " << deviceName << " to PCI path "
               << pciPath;
    nicInfos.push_back(NicInfo{std::move(deviceName), std::move(pciPath)});
  }

  int numGpus;
  TP_CUDA_CHECK(cudaGetDeviceCount(&numGpus));
  std::vector gpuIdxToIbvNicName;
  for (int gpuIdx = 0; gpuIdx < numGpus; gpuIdx++) {
    std::string gpuPciPath = getPciPathForGpu(gpuIdx);
    TP_VLOG(5) << "Resolved GPU #" << gpuIdx << " to PCI path " << gpuPciPath;
    // Linear scan over the NICs keeping the longest-prefix match.
    ssize_t bestMatchLength = -1;
    const std::string* bestMatchName = nullptr;
    for (const auto& nicInfo : nicInfos) {
      ssize_t matchLength = commonPrefixLength(gpuPciPath, nicInfo.pciPath);
      if (matchLength > bestMatchLength) {
        bestMatchLength = matchLength;
        bestMatchName = &nicInfo.name;
      }
    }
    TP_DCHECK_GE(bestMatchLength, 0);
    TP_DCHECK(bestMatchName != nullptr);
    gpuIdxToIbvNicName.push_back(*bestMatchName);
  }

  return gpuIdxToIbvNicName;
}

// In GpuDirect, the way an InfiniBand NIC accesses the GPU's memory is by
// issuing a PCIe read to some address within the GPU's "base address register"
// (BAR), i.e., a slice of the "physical" PCIe address space that belongs to the
// GPU. BARs in principle provide only "windows" into a device's memory, and
// could be re-mapped over time. When a CUDA allocation is registered on
// InfiniBand, its backing memory is mapped into the BAR and its address is
// given to the InfiniBand driver. That mapping must remain in place until the
// registration is destroyed. See
// https://docs.nvidia.com/cuda/gpudirect-rdma/index.html#how-gpudirect-rdma-works.
// CUDA GDR doesn't work well with that, because:
// - It attempts to register the entire user allocation with InfiniBand, hence
//   allocations that exceed the BAR's size can never be transferred.
// - It "caches" (or "leaks") the InfiniBand registration, because creating it
//   is expensive, so that this can be done once and then reused. This means
//   that even if each tensor that is sent is smaller than the BAR, we'd start
//   seeing failures if their cumulative size exceeded the one of the BAR.
// On some GPUs though the BAR size spans the entire GPU memory. In such cases
// what CUDA GDR is doing should be "safe". In all other cases, however, it
// isn't, and it's better to thus disable CUDA GDR entirely in these scenarios,
// so that users end up using a fully functioning (but slower) CUDA channel.
// There are multiple BARs for each GPU, but from an experimental investigation
// it seems the one that maps to the device's memory is BAR1. The programmatic
// way that the Linux kernel offers to access information about PCIe and its
// BARs is through sysfs. See
// https://www.kernel.org/doc/html/latest/PCI/sysfs-pci.html.
size_t getBar1SizeOfGpu(int gpuIdx) { std::string pciPath = getPciPathForGpu(gpuIdx); pciPath += "/resource1"; struct stat bar1Stats; int rv = ::stat(pciPath.c_str(), &bar1Stats); TP_THROW_SYSTEM_IF(rv < 0, errno); return bar1Stats.st_size; } bool allGpusHaveEnoughBar1Size() { int numGpus; TP_CUDA_CHECK(cudaGetDeviceCount(&numGpus)); for (int gpuIdx = 0; gpuIdx < numGpus; gpuIdx++) { cudaDeviceProp gpuProps; TP_CUDA_CHECK(cudaGetDeviceProperties(&gpuProps, gpuIdx)); size_t memorySize = gpuProps.totalGlobalMem; size_t bar1Size = getBar1SizeOfGpu(gpuIdx); TP_VLOG(5) << "GPU #" << gpuIdx << " has " << memorySize << " bytes of memory and the size of its PCIe BAR1 is " << bar1Size << " bytes"; if (bar1Size < memorySize) { return false; } } return true; } } // namespace IbvNic::IbvNic( std::string name, IbvLib::device& device, const IbvLib& ibvLib, const CudaLib& cudaLib) : name_(std::move(name)), cudaLib_(cudaLib), ibvLib_(ibvLib) { ctx_ = createIbvContext(ibvLib_, device); pd_ = createIbvProtectionDomain(ibvLib_, ctx_); cq_ = createIbvCompletionQueue( ibvLib_, ctx_, kCompletionQueueSize, /*cq_context=*/nullptr, /*channel=*/nullptr, /*comp_vector=*/0); addr_ = makeIbvAddress(ibvLib_, ctx_, kPortNum, kGlobalIdentifierIndex); } bool IbvNic::pollOnce() { std::array wcs; auto rv = ibvLib_.poll_cq(cq_.get(), wcs.size(), wcs.data()); if (rv == 0) { return false; } TP_THROW_SYSTEM_IF(rv < 0, errno); int numSends = 0; int numRecvs = 0; for (int wcIdx = 0; wcIdx < rv; wcIdx++) { IbvLib::wc& wc = wcs[wcIdx]; TP_VLOG(6) << "Channel context " << id_ << " got work completion on device " << name_ << " for request " << wc.wr_id << " for QP " << wc.qp_num << " with status " << ibvLib_.wc_status_str(wc.status) << " and opcode " << ibvWorkCompletionOpcodeToStr(wc.opcode) << " (byte length: " << wc.byte_len << ")"; auto iter = requestsInFlight_.find(wc.wr_id); TP_THROW_ASSERT_IF(iter == requestsInFlight_.end()) << "Got work completion with unknown ID " << wc.wr_id; IbvLib::wc_opcode 
opcode = std::move(std::get<0>(iter->second)); std::function cb = std::move(std::get<1>(iter->second)); requestsInFlight_.erase(iter); if (wc.status != IbvLib::WC_SUCCESS) { cb(TP_CREATE_ERROR(IbvError, ibvLib_.wc_status_str(wc.status))); } else { cb(Error::kSuccess); } switch (opcode) { case IbvLib::WC_RECV: numRecvs++; break; case IbvLib::WC_SEND: numSends++; break; default: TP_THROW_ASSERT() << "Unknown opcode: " << opcode; } } numAvailableSendSlots_ += numSends; while (!sendsWaitingForSlots_.empty() && numAvailableSendSlots_ > 0) { applyFunc( *this, &IbvNic::postSend, std::move(sendsWaitingForSlots_.front())); sendsWaitingForSlots_.pop_front(); } numAvailableRecvSlots_ += numRecvs; while (!recvsWaitingForSlots_.empty() && numAvailableRecvSlots_ > 0) { applyFunc( *this, &IbvNic::postRecv, std::move(recvsWaitingForSlots_.front())); recvsWaitingForSlots_.pop_front(); } return true; } void IbvNic::postSend( IbvQueuePair& qp, SendInfo info, std::function cb) { if (numAvailableSendSlots_ > 0) { IbvLib::sge list; list.addr = reinterpret_cast(info.addr); list.length = info.length; list.lkey = info.lkey; IbvLib::send_wr wr; std::memset(&wr, 0, sizeof(wr)); wr.wr_id = nextRequestId_++; wr.sg_list = &list; wr.num_sge = 1; wr.opcode = IbvLib::WR_SEND; IbvLib::send_wr* badWr = nullptr; TP_VLOG(6) << "Channel context " << id_ << " posting send on device " << name_ << " for QP " << qp->qp_num; TP_CHECK_IBV_INT(ibvLib_.post_send(qp.get(), &wr, &badWr)); TP_THROW_ASSERT_IF(badWr != nullptr); numAvailableSendSlots_--; requestsInFlight_.emplace( wr.wr_id, std::make_tuple(IbvLib::WC_SEND, std::move(cb))); } else { TP_VLOG(6) << "Channel context " << id_ << " queueing up send on device " << name_ << " for QP " << qp->qp_num; sendsWaitingForSlots_.emplace_back(qp, info, std::move(cb)); } } void IbvNic::postRecv( IbvQueuePair& qp, RecvInfo info, std::function cb) { if (numAvailableRecvSlots_ > 0) { IbvLib::sge list; list.addr = reinterpret_cast(info.addr); list.length = info.length; 
list.lkey = info.lkey; IbvLib::recv_wr wr; std::memset(&wr, 0, sizeof(wr)); wr.wr_id = nextRequestId_++; wr.sg_list = &list; wr.num_sge = 1; IbvLib::recv_wr* badWr = nullptr; TP_VLOG(6) << "Channel context " << id_ << " posting recv on device " << name_ << " for QP " << qp->qp_num; TP_CHECK_IBV_INT(ibvLib_.post_recv(qp.get(), &wr, &badWr)); TP_THROW_ASSERT_IF(badWr != nullptr); numAvailableRecvSlots_--; requestsInFlight_.emplace( wr.wr_id, std::make_tuple(IbvLib::WC_RECV, std::move(cb))); } else { TP_VLOG(6) << "Channel context " << id_ << " queueing up recv on device " << name_ << " for QP " << qp->qp_num; recvsWaitingForSlots_.emplace_back(qp, info, std::move(cb)); } } IbvMemoryRegion& IbvNic::registerMemory(CudaBuffer buffer) { // FIXME Instead of re-querying the device, have the caller provide it. CudaDeviceGuard guard(cudaDeviceForPointer(cudaLib_, buffer.ptr)); CUdeviceptr basePtr; size_t allocSize; TP_CUDA_DRIVER_CHECK( cudaLib_, cudaLib_.memGetAddressRange( &basePtr, &allocSize, reinterpret_cast(buffer.ptr))); unsigned long long bufferId; TP_CUDA_DRIVER_CHECK( cudaLib_, cudaLib_.pointerGetAttribute( &bufferId, CU_POINTER_ATTRIBUTE_BUFFER_ID, basePtr)); auto iter = memoryRegions_.find(bufferId); if (iter != memoryRegions_.end()) { return iter->second; } std::tie(iter, std::ignore) = memoryRegions_.emplace( bufferId, createIbvMemoryRegion( ibvLib_, pd_, reinterpret_cast(basePtr), allocSize, IbvLib::ACCESS_LOCAL_WRITE)); return iter->second; } bool IbvNic::readyToClose() const { return requestsInFlight_.empty(); } void IbvNic::setId(std::string id) { id_ = std::move(id); } std::shared_ptr ContextImpl::create( optional> gpuIdxToNicName) { Error error; CudaLib cudaLib; std::tie(error, cudaLib) = CudaLib::create(); // FIXME Instead of throwing away the error and setting a bool, we should have // a way to set the context in an error state, and use that for viability. 
if (error) { TP_VLOG(5) << "CUDA GDR channel is not viable because libcuda could not be loaded: " << error.what(); return nullptr; } IbvLib ibvLib; std::tie(error, ibvLib) = IbvLib::create(); // FIXME Instead of throwing away the error and setting a bool, we should have // a way to set the context in an error state, and use that for viability. if (error) { TP_VLOG(5) << "CUDA GDR channel is not viable because libibverbs could not be loaded: " << error.what(); return nullptr; } if (!isNvidiaPeerMemoryClientActive()) { TP_VLOG(5) << "CUDA GDR channel is not viable because the nv_peer_mem kernel module isn't active"; return nullptr; } IbvDeviceList deviceList; std::tie(error, deviceList) = IbvDeviceList::create(ibvLib); if (error && error.isOfType() && error.castToType()->errorCode() == ENOSYS) { TP_VLOG(5) << "CUDA GDR channel couldn't get list of InfiniBand devices because the kernel module isn't " << "loaded"; return nullptr; } TP_THROW_ASSERT_IF(error) << "Couldn't get list of InfiniBand devices: " << error.what(); if (deviceList.size() == 0) { TP_VLOG(5) << "CUDA GDR channel is not viable because it couldn't find any InfiniBand NICs"; return nullptr; } // FIXME In principle we could just exclude the GPUs that violate this check // but keep working with the other ones (if any). 
if (!allGpusHaveEnoughBar1Size()) { TP_VLOG(5) << "CUDA GDR channel is not viable because some GPUs don't have a large enough PCIe BAR1 size"; return nullptr; } std::unordered_map deviceDescriptors; for (const auto& device : getCudaDevices(cudaLib)) { deviceDescriptors[device] = "*"; } return std::make_shared( std::move(deviceDescriptors), std::move(cudaLib), std::move(ibvLib), std::move(deviceList), std::move(gpuIdxToNicName)); } ContextImpl::ContextImpl( std::unordered_map deviceDescriptors, CudaLib cudaLib, IbvLib ibvLib, IbvDeviceList deviceList, optional> gpuIdxToNicName) : ContextImplBoilerplate( std::move(deviceDescriptors)), cudaLib_(std::move(cudaLib)), ibvLib_(std::move(ibvLib)) { std::vector actualGpuIdxToNicName; if (gpuIdxToNicName.has_value()) { int numGpus; TP_CUDA_CHECK(cudaGetDeviceCount(&numGpus)); TP_THROW_ASSERT_IF(numGpus != gpuIdxToNicName->size()) << "The mapping from GPUs to InfiniBand NICs contains an unexpected " << "number of items: found " << gpuIdxToNicName->size() << ", expected " << numGpus; actualGpuIdxToNicName = std::move(gpuIdxToNicName.value()); } else { actualGpuIdxToNicName = matchGpusToIbvNics(ibvLib, deviceList); } for (int gpuIdx = 0; gpuIdx < actualGpuIdxToNicName.size(); gpuIdx++) { TP_VLOG(5) << "CUDA GDR channel mapped GPU #" << gpuIdx << " to InfiniBand NIC " << actualGpuIdxToNicName[gpuIdx]; } std::unordered_set nicNames; for (const auto& nicName : actualGpuIdxToNicName) { nicNames.insert(nicName); } std::unordered_map nicNameToNicIdx; // The device index is among all available devices, the NIC index is among the // ones we will use. 
size_t nicIdx = 0; for (size_t deviceIdx = 0; deviceIdx < deviceList.size(); deviceIdx++) { IbvLib::device& device = deviceList[deviceIdx]; std::string deviceName(TP_CHECK_IBV_PTR(ibvLib.get_device_name(&device))); auto iter = nicNames.find(deviceName); if (iter != nicNames.end()) { TP_VLOG(5) << "CUDA GDR channel is using InfiniBand NIC " << deviceName << " as device #" << nicIdx; ibvNics_.emplace_back(*iter, device, ibvLib_, cudaLib_); nicNameToNicIdx[*iter] = nicIdx; nicIdx++; nicNames.erase(iter); } } TP_THROW_ASSERT_IF(!nicNames.empty()) << "Couldn't find all the devices I was supposed to use"; for (size_t gpuIdx = 0; gpuIdx < actualGpuIdxToNicName.size(); gpuIdx++) { gpuToNic_.push_back(nicNameToNicIdx[actualGpuIdxToNicName[gpuIdx]]); } startThread("TP_CUDA_GDR_loop"); } const CudaLib& ContextImpl::getCudaLib() { return cudaLib_; } const std::vector& ContextImpl::getGpuToNicMapping() { return gpuToNic_; } const IbvLib& ContextImpl::getIbvLib() { return ibvLib_; } IbvNic& ContextImpl::getIbvNic(size_t nicIdx) { TP_DCHECK_LT(nicIdx, ibvNics_.size()); return ibvNics_[nicIdx]; } bool ContextImpl::pollOnce() { for (IbvNic& ibvNic : ibvNics_) { if (ibvNic.pollOnce()) { return true; } } return pollCudaOnce(); } bool ContextImpl::pollCudaOnce() { bool any = false; for (auto iter = pendingCudaEvents_.begin(); iter != pendingCudaEvents_.end(); iter++) { const CudaEvent& event = std::get<0>(*iter); if (event.query()) { std::function cb = std::move(std::get<1>(*iter)); cb(Error::kSuccess); iter = pendingCudaEvents_.erase(iter); any = true; } } return any; } void ContextImpl::waitForCudaEvent( const CudaEvent& event, std::function cb) { deferToLoop([this, &event, cb{std::move(cb)}]() mutable { waitForCudaEventFromLoop(event, std::move(cb)); }); } void ContextImpl::waitForCudaEventFromLoop( const CudaEvent& event, std::function cb) { TP_DCHECK(inLoop()); pendingCudaEvents_.emplace_back(event, std::move(cb)); } bool ContextImpl::readyToClose() { for (const IbvNic& ibvNic : 
ibvNics_) { if (!ibvNic.readyToClose()) { return false; } } return pendingCudaEvents_.empty(); } void ContextImpl::handleErrorImpl() { stopBusyPolling(); } void ContextImpl::joinImpl() { joinThread(); // FIXME It would be nice if this could be done by the thread itself just // before it returns, rather than by the user. ibvNics_.clear(); } void ContextImpl::setIdImpl() { for (IbvNic& ibvNic : ibvNics_) { ibvNic.setId(id_); } } std::shared_ptr ContextImpl::createChannel( std::vector> connections, Endpoint /* unused */) { TP_DCHECK_EQ(numConnectionsNeeded(), connections.size()); return createChannelInternal( std::move(connections[0]), std::move(connections[1])); } size_t ContextImpl::numConnectionsNeeded() const { return 2; } } // namespace cuda_gdr } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cuda_gdr/context_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { namespace cuda_gdr { class ChannelImpl; class IbvNic { public: IbvNic( std::string name, IbvLib::device& device, const IbvLib& ibvLib, const CudaLib& cudaLib); IbvProtectionDomain& getIbvPd() { return pd_; } IbvCompletionQueue& getIbvCq() { return cq_; } const IbvAddress& getIbvAddress() { return addr_; } struct SendInfo { void* addr; size_t length; uint32_t lkey; }; void postSend( IbvQueuePair& qp, SendInfo info, std::function cb); struct RecvInfo { void* addr; size_t length; uint32_t lkey; }; void postRecv( IbvQueuePair& qp, RecvInfo info, std::function cb); bool pollOnce(); IbvMemoryRegion& registerMemory(CudaBuffer buffer); bool readyToClose() const; void setId(std::string id); private: // The ID of the context, for use in verbose logging. std::string id_{"N/A"}; // The name of the InfiniBand device. const std::string name_; const CudaLib& cudaLib_; const IbvLib& ibvLib_; IbvContext ctx_; IbvProtectionDomain pd_; IbvCompletionQueue cq_; IbvAddress addr_; size_t numAvailableRecvSlots_ = kNumRecvs; std::deque< std::tuple>> recvsWaitingForSlots_; size_t numAvailableSendSlots_ = kNumSends; std::deque< std::tuple>> sendsWaitingForSlots_; // We need one common map for both send and recv requests because in principle // we cannot access the opcode of a failed operation, meaning we couldn't // match it to its callback. However, we could group them by QP number or, in // fact, we could have the QP store these requests and we just wake it up when // a completion occurs. std::unordered_map< uint64_t, std::tuple>> requestsInFlight_; uint64_t nextRequestId_ = 0; // The ibverbs memory regions are indexed by the CUDA driver's buffer ID for // the GPU allocation, which is unique (within the process) and never reused. 
// This will prevent us from re-using the memory region if a buffer gets // deallocated and reallocated (although we will not clean up the old memory // region until we close the context). std::map memoryRegions_; }; class ContextImpl final : public BusyPollingLoop, public ContextImplBoilerplate { public: static std::shared_ptr create( optional> gpuIdxToNicName = nullopt); ContextImpl( std::unordered_map deviceDescriptors, CudaLib cudaLib, IbvLib ibvLib, IbvDeviceList deviceList, optional> gpuIdxToNicName); std::shared_ptr createChannel( std::vector> connections, Endpoint endpoint); size_t numConnectionsNeeded() const override; const CudaLib& getCudaLib(); const std::vector& getGpuToNicMapping(); const IbvLib& getIbvLib(); IbvNic& getIbvNic(size_t nicIdx); void waitForCudaEvent( const CudaEvent& event, std::function cb); protected: // Implement BusyPollingLoop hooks. bool pollOnce() override; bool readyToClose() override; // Implement the entry points called by ContextImplBoilerplate. void handleErrorImpl() override; void joinImpl() override; void setIdImpl() override; private: const CudaLib cudaLib_; const IbvLib ibvLib_; std::vector ibvNics_; std::vector gpuToNic_; std::list>> pendingCudaEvents_; bool pollCudaOnce(); void waitForCudaEventFromLoop( const CudaEvent& event, std::function cb); }; } // namespace cuda_gdr } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cuda_gdr/error.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include namespace tensorpipe { namespace channel { namespace cuda_gdr { class IbvError final : public BaseError { public: explicit IbvError(std::string error) : error_(error) {} std::string what() const override { return error_; } private: std::string error_; }; } // namespace cuda_gdr } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cuda_gdr/factory.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include namespace tensorpipe { namespace channel { namespace cuda_gdr { std::shared_ptr create( optional> gpuIdxToNicName) { return std::make_shared>( std::move(gpuIdxToNicName)); } } // namespace cuda_gdr } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cuda_gdr/factory.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include namespace tensorpipe { namespace channel { namespace cuda_gdr { std::shared_ptr create( optional> gpuIdxToNicName = nullopt); } // namespace cuda_gdr } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cuda_ipc/channel_impl.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { namespace cuda_ipc { NOP_EXTERNAL_STRUCTURE( ContextImpl::OutboxInfo, processIdentifier, memHandle, eventHandles); namespace { size_t ceilOfRatio(size_t n, size_t d) { return (n + d - 1) / d; } struct Descriptor { int deviceIdx; size_t slotIdx; nop::Optional outboxInfo; NOP_STRUCTURE(Descriptor, deviceIdx, slotIdx, outboxInfo); }; } // namespace ChunkSendOperation::ChunkSendOperation( uint64_t bufferSequenceNumber, size_t chunkId, size_t numChunks, TSendCallback callback, int deviceIdx, const void* ptr, size_t length, cudaStream_t stream) : bufferSequenceNumber(bufferSequenceNumber), chunkId(chunkId), numChunks(numChunks), ptr(ptr), length(length), deviceIdx(deviceIdx), stream(stream), callback(std::move(callback)) {} ChunkRecvOperation::ChunkRecvOperation( uint64_t bufferSequenceNumber, size_t chunkId, size_t numChunks, TRecvCallback callback, int deviceIdx, void* ptr, size_t length, cudaStream_t stream) : bufferSequenceNumber(bufferSequenceNumber), chunkId(chunkId), numChunks(numChunks), ptr(ptr), length(length), deviceIdx(deviceIdx), stream(stream), callback(std::move(callback)) {} ChannelImpl::ChannelImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::shared_ptr descriptorConnection, std::shared_ptr replyConnection) : ChannelImplBoilerplate( token, std::move(context), std::move(id)), descriptorConnection_(std::move(descriptorConnection)), replyConnection_(std::move(replyConnection)) {} void ChannelImpl::initImplFromLoop() { context_->enroll(*this); } void ChannelImpl::sendImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TSendCallback callback) { if (length == 0) { callback(error_); return; } int deviceIdx = cudaDeviceForPointer( context_->getCudaLib(), buffer.unwrap().ptr); const size_t numChunks = ceilOfRatio(length, kSlotSize); 
for (size_t chunkIdx = 0; chunkIdx < numChunks; chunkIdx += 1) { size_t offset = chunkIdx * kSlotSize; ChunkSendOpIter opIter = chunkSendOps_.emplaceBack( nextChunkBeingSent_++, sequenceNumber, chunkIdx, numChunks, chunkIdx == numChunks - 1 ? std::move(callback) : nullptr, deviceIdx, reinterpret_cast(buffer.unwrap().ptr) + offset, std::min(length - offset, kSlotSize), buffer.unwrap().stream); chunkSendOps_.advanceOperation(opIter); } } void ChannelImpl::advanceChunkSendOperation( ChunkSendOpIter opIter, ChunkSendOperation::State prevOpState) { TP_DCHECK(context_->inLoop()); ChunkSendOperation& op = *opIter; // Needs to go after previous op invoked its callback because the last chunk // in a series (that corresponds to one operation) must invoke its callback // only when all chunks in the series are done. chunkSendOps_.attemptTransition( opIter, /*from=*/ChunkSendOperation::UNINITIALIZED, /*to=*/ChunkSendOperation::FINISHED, /*cond=*/error_ && prevOpState >= ChunkSendOperation::FINISHED, /*actions=*/ {&ChannelImpl::callSendCallback}); // Needs to go after previous op to ensure later operations are not holding // events while earlier ones are still blocked waiting for them, because the // events will only be returned after the control messages have been written // and sent, and this won't happen for later operations until earlier ones // have reached that stage too, and if those are blocked waiting for events // then we may deadlock. chunkSendOps_.attemptTransition( opIter, /*from=*/ChunkSendOperation::UNINITIALIZED, /*to=*/ChunkSendOperation::ALLOCATING_STAGING_BUFFER, /*cond=*/!error_ && prevOpState >= ChunkSendOperation::ALLOCATING_STAGING_BUFFER, /*actions=*/ {&ChannelImpl::allocateStagingBuffer}); // See above for why this needs to go after previous op. 
chunkSendOps_.attemptTransition( opIter, /*from=*/ ChunkSendOperation::ALLOCATING_STAGING_BUFFER, /*to=*/ChunkSendOperation::FINISHED, /*cond=*/error_ && op.doneAllocatingStagingBuffer && prevOpState >= ChunkSendOperation::FINISHED, /*actions=*/ {&ChannelImpl::callSendCallback, &ChannelImpl::releaseStagingBuffer}); // Needs to go after previous op to ensure predictable and consistent ordering // of write calls on the descriptor control connection and read calls on the // reply control connection. chunkSendOps_.attemptTransition( opIter, /*from=*/ ChunkSendOperation::ALLOCATING_STAGING_BUFFER, /*to=*/ChunkSendOperation::READING_REPLY, /*cond=*/!error_ && op.doneAllocatingStagingBuffer && prevOpState >= ChunkSendOperation::READING_REPLY, /*actions=*/ {&ChannelImpl::copyFromSourceToStaging, &ChannelImpl::writeDescriptor, &ChannelImpl::readReply, &ChannelImpl::callSendCallback}); // See above for why this needs to go after previous op. chunkSendOps_.attemptTransition( opIter, /*from=*/ChunkSendOperation::READING_REPLY, /*to=*/ChunkSendOperation::FINISHED, /*cond=*/op.doneReadingReply && prevOpState >= ChunkSendOperation::FINISHED, /*actions=*/ {&ChannelImpl::releaseStagingBuffer}); } void ChannelImpl::allocateStagingBuffer(ChunkSendOpIter opIter) { ChunkSendOperation& op = *opIter; TP_VLOG(5) << "Channel " << id_ << " is allocating temporary memory for chunk #" << op.chunkId << " of " << op.numChunks << " for buffer #" << op.bufferSequenceNumber; context_->allocateSlot( op.deviceIdx, op.length, callbackWrapper_([opIter]( ChannelImpl& impl, size_t slotIdx, Allocator::TChunk buffer, CudaEvent* event) { TP_VLOG(5) << "Channel " << impl.id_ << " is done allocating temporary memory for chunk #" << opIter->chunkId << " of " << opIter->numChunks << " for buffer #" << opIter->bufferSequenceNumber; opIter->doneAllocatingStagingBuffer = true; if (!impl.error_) { opIter->slotIdx = slotIdx; opIter->stagingBuffer = std::move(buffer); opIter->event = event; } 
impl.chunkSendOps_.advanceOperation(opIter); })); } void ChannelImpl::copyFromSourceToStaging(ChunkSendOpIter opIter) { ChunkSendOperation& op = *opIter; op.event->wait(op.stream, op.deviceIdx); { CudaDeviceGuard guard(op.deviceIdx); TP_CUDA_CHECK(cudaMemcpyAsync( op.stagingBuffer.get(), op.ptr, op.length, cudaMemcpyDeviceToDevice, op.stream)); } op.event->record(op.stream); } void ChannelImpl::writeDescriptor(ChunkSendOpIter opIter) { ChunkSendOperation& op = *opIter; const CudaLib& cudaLib = context_->getCudaLib(); auto nopDescriptorHolder = std::make_shared>(); Descriptor& nopDescriptor = nopDescriptorHolder->getObject(); nopDescriptor.deviceIdx = op.deviceIdx; nopDescriptor.slotIdx = op.slotIdx; if (localOutboxesSent_.size() <= op.deviceIdx) { localOutboxesSent_.resize(op.deviceIdx + 1, false); } if (!localOutboxesSent_[op.deviceIdx]) { localOutboxesSent_[op.deviceIdx] = true; nopDescriptor.outboxInfo = context_->getLocalOutboxInfo(op.deviceIdx); } TP_VLOG(6) << "Channel " << id_ << " is writing nop object (descriptor #" << op.sequenceNumber << ")"; descriptorConnection_->write( *nopDescriptorHolder, callbackWrapper_([nopDescriptorHolder, sequenceNumber{op.sequenceNumber}](ChannelImpl& impl) { TP_VLOG(6) << "Channel " << impl.id_ << " done writing nop object (descriptor #" << sequenceNumber << ")"; })); } void ChannelImpl::readReply(ChunkSendOpIter opIter) { ChunkSendOperation& op = *opIter; TP_VLOG(6) << "Channel " << id_ << " is reading nop object (reply #" << op.sequenceNumber << ")"; replyConnection_->read( nullptr, 0, callbackWrapper_([opIter]( ChannelImpl& impl, const void* /* unused */, size_t /* unused */) { TP_VLOG(6) << "Channel " << impl.id_ << " done reading nop object (reply #" << opIter->sequenceNumber << ")"; opIter->doneReadingReply = true; impl.chunkSendOps_.advanceOperation(opIter); })); } void ChannelImpl::releaseStagingBuffer(ChunkSendOpIter opIter) { ChunkSendOperation& op = *opIter; op.stagingBuffer = nullptr; } void 
ChannelImpl::callSendCallback(ChunkSendOpIter opIter) { ChunkSendOperation& op = *opIter; if (op.callback) { op.callback(error_); // Reset callback to release the resources it was holding. op.callback = nullptr; } } void ChannelImpl::recvImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TRecvCallback callback) { if (length == 0) { callback(error_); return; } int deviceIdx = cudaDeviceForPointer( context_->getCudaLib(), buffer.unwrap().ptr); const size_t numChunks = ceilOfRatio(length, kSlotSize); for (size_t chunkIdx = 0; chunkIdx < numChunks; chunkIdx += 1) { size_t offset = chunkIdx * kSlotSize; ChunkRecvOpIter opIter = chunkRecvOps_.emplaceBack( nextChunkBeingReceived_++, sequenceNumber, chunkIdx, numChunks, chunkIdx == numChunks - 1 ? std::move(callback) : nullptr, deviceIdx, reinterpret_cast(buffer.unwrap().ptr) + offset, std::min(length - offset, kSlotSize), buffer.unwrap().stream); chunkRecvOps_.advanceOperation(opIter); } } void ChannelImpl::advanceChunkRecvOperation( ChunkRecvOpIter opIter, ChunkRecvOperation::State prevOpState) { TP_DCHECK(context_->inLoop()); ChunkRecvOperation& op = *opIter; // Needs to go after previous op invoked its callback because the last chunk // in a series (that corresponds to one operation) must invoke its callback // only when all chunks in the series are done. chunkRecvOps_.attemptTransition( opIter, /*from=*/ChunkRecvOperation::UNINITIALIZED, /*to=*/ChunkRecvOperation::FINISHED, /*cond=*/error_ && prevOpState >= ChunkRecvOperation::FINISHED, /*actions=*/{&ChannelImpl::callRecvCallback}); // Needs to go after previous op to ensure predictable and consistent ordering // of read calls on descriptor control connection. 
chunkRecvOps_.attemptTransition( opIter, /*from=*/ChunkRecvOperation::UNINITIALIZED, /*to=*/ChunkRecvOperation::READING_DESCRIPTOR, /*cond=*/!error_ && prevOpState >= ChunkRecvOperation::READING_DESCRIPTOR, /*actions=*/{&ChannelImpl::readDescriptor}); // See above for why this needs to go after previous op. chunkRecvOps_.attemptTransition( opIter, /*from=*/ChunkRecvOperation::READING_DESCRIPTOR, /*to=*/ChunkRecvOperation::FINISHED, /*cond=*/error_ && op.doneReadingDescriptor && prevOpState >= ChunkRecvOperation::FINISHED, /*actions=*/{&ChannelImpl::callRecvCallback}); // Needs to go after previous op to ensure predictable and consistent ordering // of write calls on reply control connection. chunkRecvOps_.attemptTransition( opIter, /*from=*/ChunkRecvOperation::READING_DESCRIPTOR, /*to=*/ChunkRecvOperation::FINISHED, /*cond=*/!error_ && op.doneReadingDescriptor && prevOpState >= ChunkRecvOperation::FINISHED, /*actions=*/ {&ChannelImpl::copyFromStagingToTarget, &ChannelImpl::writeReply, &ChannelImpl::callRecvCallback}); } void ChannelImpl::readDescriptor(ChunkRecvOpIter opIter) { ChunkRecvOperation& op = *opIter; TP_VLOG(6) << "Channel " << id_ << " is reading nop object (descriptor #" << op.sequenceNumber << ")"; auto nopDescriptorHolder = std::make_shared>(); descriptorConnection_->read( *nopDescriptorHolder, callbackWrapper_([opIter, nopDescriptorHolder](ChannelImpl& impl) { TP_VLOG(6) << "Channel " << impl.id_ << " done reading nop object (descriptor #" << opIter->sequenceNumber << ")"; opIter->doneReadingDescriptor = true; if (!impl.error_) { Descriptor& nopDescriptor = nopDescriptorHolder->getObject(); opIter->remoteDeviceIdx = nopDescriptor.deviceIdx; opIter->remoteSlotIdx = nopDescriptor.slotIdx; if (!nopDescriptor.outboxInfo.empty()) { if (impl.remoteOutboxesReceived_.size() <= opIter->remoteDeviceIdx) { impl.remoteOutboxesReceived_.resize(opIter->remoteDeviceIdx + 1); } TP_DCHECK(!impl.remoteOutboxesReceived_[opIter->remoteDeviceIdx] .has_value()); 
impl.remoteOutboxesReceived_[opIter->remoteDeviceIdx] = std::move(nopDescriptor.outboxInfo.take()); } } impl.chunkRecvOps_.advanceOperation(opIter); })); } void ChannelImpl::copyFromStagingToTarget(ChunkRecvOpIter opIter) { ChunkRecvOperation& op = *opIter; if (remoteOutboxesOpened_.size() <= op.remoteDeviceIdx) { remoteOutboxesOpened_.resize(op.remoteDeviceIdx + 1); } if (remoteOutboxesOpened_[op.remoteDeviceIdx].size() <= op.deviceIdx) { remoteOutboxesOpened_[op.remoteDeviceIdx].resize(op.deviceIdx + 1, nullptr); } if (remoteOutboxesOpened_[op.remoteDeviceIdx][op.deviceIdx] == nullptr) { remoteOutboxesOpened_[op.remoteDeviceIdx][op.deviceIdx] = &context_->openRemoteOutbox( op.deviceIdx, op.remoteDeviceIdx, remoteOutboxesReceived_[op.remoteDeviceIdx].value()); } const ContextImpl::RemoteOutboxHandle& outbox = *remoteOutboxesOpened_[op.remoteDeviceIdx][op.deviceIdx]; TP_VLOG(6) << "Channel " << id_ << " is copying payload (#" << op.sequenceNumber << ")"; outbox.events[op.remoteSlotIdx]->wait(op.stream, op.deviceIdx); { CudaDeviceGuard guard(op.deviceIdx); TP_CUDA_CHECK(cudaMemcpyAsync( op.ptr, outbox.buffer.ptr() + kSlotSize * op.remoteSlotIdx, op.length, cudaMemcpyDeviceToDevice, op.stream)); } outbox.events[op.remoteSlotIdx]->record(op.stream); TP_VLOG(6) << "Channel " << id_ << " done copying payload (#" << op.sequenceNumber << ")"; } void ChannelImpl::callRecvCallback(ChunkRecvOpIter opIter) { ChunkRecvOperation& op = *opIter; if (op.callback) { op.callback(error_); // Reset callback to release the resources it was holding. 
op.callback = nullptr; } } void ChannelImpl::writeReply(ChunkRecvOpIter opIter) { ChunkRecvOperation& op = *opIter; TP_VLOG(6) << "Channel " << id_ << " is writing reply notification (#" << op.sequenceNumber << ")"; replyConnection_->write( nullptr, 0, callbackWrapper_([sequenceNumber{op.sequenceNumber}](ChannelImpl& impl) { TP_VLOG(6) << "Channel " << impl.id_ << " done writing reply notification (#" << sequenceNumber << ")"; })); } void ChannelImpl::handleErrorImpl() { chunkSendOps_.advanceAllOperations(); chunkRecvOps_.advanceAllOperations(); descriptorConnection_->close(); replyConnection_->close(); context_->unenroll(*this); } } // namespace cuda_ipc } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cuda_ipc/channel_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { namespace cuda_ipc { class ContextImpl; struct ChunkSendOperation { enum State { UNINITIALIZED, ALLOCATING_STAGING_BUFFER, READING_REPLY, FINISHED }; // Fields used by the state machine uint64_t sequenceNumber{0}; State state{UNINITIALIZED}; // Progress flags bool doneAllocatingStagingBuffer{false}; bool doneReadingReply{false}; // Arguments at creation const uint64_t bufferSequenceNumber; const size_t chunkId; const size_t numChunks; const void* const ptr; const size_t length; const int deviceIdx; const cudaStream_t stream; TSendCallback callback; // Other data size_t slotIdx{static_cast(-1)}; Allocator::TChunk stagingBuffer; CudaEvent* event{nullptr}; ChunkSendOperation( uint64_t bufferSequenceNumber, size_t chunkId, size_t numChunks, TSendCallback callback, int deviceIdx, const void* ptr, size_t length, cudaStream_t stream); }; struct ChunkRecvOperation { enum State { UNINITIALIZED, READING_DESCRIPTOR, FINISHED }; // Fields used by the state machine uint64_t sequenceNumber{0}; State state{UNINITIALIZED}; // Progress flags bool doneReadingDescriptor{false}; bool doneRequestingEvent{false}; bool doneReadingAck{false}; // Arguments at creation const uint64_t bufferSequenceNumber; const size_t chunkId; const size_t numChunks; void* const ptr; const size_t length; const int deviceIdx; const cudaStream_t stream; TRecvCallback callback; // Other data int remoteDeviceIdx; size_t remoteSlotIdx; ChunkRecvOperation( uint64_t bufferSequenceNumber, size_t chunkId, size_t numChunks, TRecvCallback callback, int deviceIdx, void* ptr, size_t length, cudaStream_t stream); }; class ChannelImpl final : public ChannelImplBoilerplate { public: ChannelImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::shared_ptr descriptorConnection, std::shared_ptr replyConnection); protected: // Implement the entry 
points called by ChannelImplBoilerplate. void initImplFromLoop() override; void sendImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TSendCallback callback) override; void recvImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TRecvCallback callback) override; void handleErrorImpl() override; private: const std::shared_ptr descriptorConnection_; const std::shared_ptr replyConnection_; // For each local device, whether we've already sent the information about the // device's outbox to the remote, who needs it to open a handle to the outbox. // Used during the send path. std::vector localOutboxesSent_; // For each remote device, the information about the remote's outbox for that // device (or nullopt, if we haven't received it yet). We store it because we // will only receive it once (for the first buffer coming from that device) // but we might need it multiple time, as we need to open it for every local // target device where it might be needed. Used during the receive path. std::vector> remoteOutboxesReceived_; // For each remote and local device, the handle to the opened remote outbox // for that device (or nullptr if we haven't opened it yet). Used during the // receive path. std::vector> remoteOutboxesOpened_; // A sequence number for the chunks. uint64_t nextChunkBeingSent_{0}; uint64_t nextChunkBeingReceived_{0}; OpsStateMachine chunkSendOps_{ *this, &ChannelImpl::advanceChunkSendOperation}; using ChunkSendOpIter = decltype(chunkSendOps_)::Iter; OpsStateMachine chunkRecvOps_{ *this, &ChannelImpl::advanceChunkRecvOperation}; using ChunkRecvOpIter = decltype(chunkRecvOps_)::Iter; // State machines for send and recv ops. void advanceChunkSendOperation( ChunkSendOpIter opIter, ChunkSendOperation::State prevOpState); void advanceChunkRecvOperation( ChunkRecvOpIter opIter, ChunkRecvOperation::State prevOpState); // Actions (i.e., methods that begin a state transition). 
// For send operations: void allocateStagingBuffer(ChunkSendOpIter opIter); void copyFromSourceToStaging(ChunkSendOpIter opIter); void writeDescriptor(ChunkSendOpIter opIter); void readReply(ChunkSendOpIter opIter); void releaseStagingBuffer(ChunkSendOpIter opIter); void callSendCallback(ChunkSendOpIter opIter); // For recv operations: void readDescriptor(ChunkRecvOpIter opIter); void copyFromStagingToTarget(ChunkRecvOpIter opIter); void callRecvCallback(ChunkRecvOpIter opIter); void writeReply(ChunkRecvOpIter opIter); }; } // namespace cuda_ipc } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cuda_ipc/constants.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include namespace tensorpipe { namespace channel { namespace cuda_ipc { // FIXME Avoid this anonymous namespace and use inline variables in C++-17. namespace { // Define all three (redundant) values to make them explicit and avoid // misunderstandings due to miscalculations. static constexpr size_t kStagingAreaSize = 32 * 1024 * 1024; static constexpr size_t kSlotSize = 8 * 1024 * 1024; static constexpr size_t kNumSlots = 4; static_assert(kStagingAreaSize == kSlotSize * kNumSlots, ""); } // namespace } // namespace cuda_ipc } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cuda_ipc/context_impl.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { namespace cuda_ipc { namespace { std::tuple, std::vector>> getGlobalUuidsAndP2pSupport(const NvmlLib& nvmlLib) { unsigned int numDevices; TP_NVML_CHECK(nvmlLib, nvmlLib.deviceGetCount_v2(&numDevices)); std::vector devices(numDevices); std::vector uuids(numDevices); for (unsigned int devIdx = 0; devIdx < numDevices; devIdx++) { TP_NVML_CHECK( nvmlLib, nvmlLib.deviceGetHandleByIndex_v2(devIdx, &devices[devIdx])); // NVML_DEVICE_UUID_V2_BUFFER_SIZE was introduced in CUDA 11.0. #ifdef NVML_DEVICE_UUID_V2_BUFFER_SIZE std::array uuid; #else std::array uuid; #endif TP_NVML_CHECK( nvmlLib, nvmlLib.deviceGetUUID(devices[devIdx], uuid.data(), uuid.size())); std::string uuidStr(uuid.data()); TP_THROW_ASSERT_IF(uuidStr.substr(0, 4) != "GPU-") << "Couldn't obtain valid UUID for GPU #" << devIdx << " from CUDA driver. Got: " << uuidStr; uuidStr = uuidStr.substr(4); TP_THROW_ASSERT_IF(!isValidUuid(uuidStr)) << "Couldn't obtain valid UUID for GPU #" << devIdx << " from NVML. 
Got: " << uuidStr; uuids[devIdx] = std::move(uuidStr); } std::vector> p2pSupport(numDevices); for (int devIdx = 0; devIdx < numDevices; devIdx++) { p2pSupport[devIdx].resize(numDevices); for (int otherDevIdx = 0; otherDevIdx < numDevices; otherDevIdx++) { if (devIdx == otherDevIdx) { p2pSupport[devIdx][otherDevIdx] = true; continue; } nvmlGpuP2PStatus_t p2pStatus; TP_NVML_CHECK( nvmlLib, nvmlLib.deviceGetP2PStatus( devices[devIdx], devices[otherDevIdx], NVML_P2P_CAPS_INDEX_READ, &p2pStatus)); p2pSupport[devIdx][otherDevIdx] = (p2pStatus == NVML_P2P_STATUS_OK); } } return std::make_tuple(std::move(uuids), std::move(p2pSupport)); } int globalIdxForDevice( const std::vector& globalUuids, const std::string& uuid) { auto iter = std::find(globalUuids.begin(), globalUuids.end(), uuid); TP_THROW_ASSERT_IF(iter == globalUuids.end()) << "Couldn't find GPU with UUID " << uuid; return iter - globalUuids.begin(); } struct DeviceDescriptor { std::string bootId; int64_t pid; std::string deviceUuid; NOP_STRUCTURE(DeviceDescriptor, bootId, pid, deviceUuid); }; DeviceDescriptor deserializeDeviceDescriptor( const std::string& deviceDescriptor) { NopHolder nopHolder; loadDescriptor(nopHolder, deviceDescriptor); return std::move(nopHolder.getObject()); } std::string generateBootId() { auto bootID = getBootID(); TP_THROW_ASSERT_IF(!bootID) << "Unable to read boot_id"; return bootID.value(); } // FIXME We'd want this to return a std::vector, but CudaEvents // aren't default-constructible nor movable. Hence either we make them such, // or we use some pointer magic (like placement new). For now, we work around // this by using a unique_ptr and wrapping them in optional<>, but it's silly. std::unique_ptr[]> createIpcEventArray( int deviceIdx, size_t numEvents) { auto events = std::make_unique[]>(numEvents); // The CUDA driver has a bug where creating and/or destroying IPC events // sometimes causes a deadlock (it's unclear which of the two steps is the // cause). 
The deadlock tends to manifest as a cudaStreamSynchronize call // never returning. Just to be safe, and to catch such a deadlock early and // clearly, let's add extra syncs here. (The bug is fixed in v460). { CudaDeviceGuard guard(deviceIdx); TP_CUDA_CHECK(cudaDeviceSynchronize()); } for (size_t idx = 0; idx < numEvents; idx++) { events[idx].emplace(deviceIdx, true); // One day we might get tempted to have CudaEvent lazily initialize its // cudaEvent_t, just like PyTorch does. However here we explicitly want to // eagerly initialize IPC events, as creating them late might deadlock with // old CUDA driver versions. This check should hopefully catch if the event // is lazy-initialized. TP_THROW_ASSERT_IF(events[idx]->raw() == nullptr); } { CudaDeviceGuard guard(deviceIdx); TP_CUDA_CHECK(cudaDeviceSynchronize()); } return events; } std::vector getIpcHandlesForEventArray( optional events[], size_t numEvents) { std::vector eventHandles(numEvents); for (size_t idx = 0; idx < numEvents; idx++) { eventHandles[idx] = events[idx]->getIpcHandle(); } return eventHandles; } } // namespace ContextImpl::Outbox::Outbox(int deviceIdx) : buffer(kStagingAreaSize, deviceIdx), events(createIpcEventArray(deviceIdx, kNumSlots)), handle(this->buffer.getIpcHandle()), eventHandles(getIpcHandlesForEventArray(this->events.get(), kNumSlots)), allocator(this->buffer.ptr(), kNumSlots, kSlotSize) {} ContextImpl::Outbox::~Outbox() { // The CUDA driver has a bug where creating and/or destroying IPC events // sometimes causes a deadlock (it's unclear which of the two steps is the // cause). The deadlock tends to manifest as a cudaStreamSynchronize call // never returning. Just to be safe, and to catch such a deadlock early and // clearly, let's add extra syncs here. (The bug is fixed in v460). 
{ CudaDeviceGuard guard(buffer.deviceIdx()); TP_CUDA_CHECK(cudaDeviceSynchronize()); } events.reset(); { CudaDeviceGuard guard(buffer.deviceIdx()); TP_CUDA_CHECK(cudaDeviceSynchronize()); } } std::shared_ptr ContextImpl::create() { Error error; CudaLib cudaLib; std::tie(error, cudaLib) = CudaLib::create(); if (error) { TP_VLOG(5) << "CUDA IPC channel is not viable because libcuda could not be loaded: " << error.what(); return nullptr; } NvmlLib nvmlLib; std::tie(error, nvmlLib) = NvmlLib::create(); if (error) { TP_VLOG(5) << "CUDA IPC channel is not viable because libnvidia-ml could not be loaded: " << error.what(); return nullptr; } const std::string bootId = generateBootId(); const pid_t pid = ::getpid(); std::unordered_map deviceDescriptors; for (const auto& device : getCudaDevices(cudaLib)) { // This part is largely inspired from // https://github.com/NVIDIA/cuda-samples/blob/master/Samples/simpleIPC/simpleIPC.cu. cudaDeviceProp props; TP_CUDA_CHECK(cudaGetDeviceProperties(&props, device.index)); // Unified addressing is required for IPC. if (!props.unifiedAddressing) { TP_VLOG(4) << "CUDA IPC channel is not viable because CUDA device " << device.index << " does not have unified addressing"; return nullptr; } // The other two compute modes are "exclusive" and "prohibited", both of // which prevent access from an other process. 
int computeMode = -1; TP_CUDA_CHECK(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, device.index)); if (computeMode != cudaComputeModeDefault) { TP_VLOG(4) << "CUDA IPC channel is not viable because CUDA device " << device.index << " is not in default compute mode"; return nullptr; } NopHolder nopHolder; DeviceDescriptor& deviceDescriptor = nopHolder.getObject(); deviceDescriptor.bootId = bootId; deviceDescriptor.pid = static_cast(pid); deviceDescriptor.deviceUuid = getUuidOfDevice(cudaLib, device.index); deviceDescriptors[device] = saveDescriptor(nopHolder); } std::vector globalUuids; std::vector> p2pSupport; std::tie(globalUuids, p2pSupport) = getGlobalUuidsAndP2pSupport(nvmlLib); TP_VLOG(4) << "The UUIDs of all the GPUs found by the CUDA IPC channel are " << joinStrs(globalUuids); TP_VLOG(4) << "The peer-to-peer support found by the CUDA IPC channel is " << formatMatrix(p2pSupport); std::ostringstream oss; optional nsId = getLinuxNamespaceId(LinuxNamespace::kPid); if (!nsId.has_value()) { TP_VLOG(4) << "CUDA IPC channel is not viable because it couldn't determine the PID namespace ID"; return nullptr; } oss << nsId.value() << "_" << pid; std::string processIdentifier = oss.str(); return std::make_shared( std::move(deviceDescriptors), std::move(cudaLib), std::move(nvmlLib), std::move(globalUuids), std::move(p2pSupport), std::move(processIdentifier)); } ContextImpl::ContextImpl( std::unordered_map deviceDescriptors, CudaLib cudaLib, NvmlLib nvmlLib, std::vector globalUuids, std::vector> p2pSupport, std::string processIdentifier) : ContextImplBoilerplate( std::move(deviceDescriptors)), cudaLib_(std::move(cudaLib)), nvmlLib_(std::move(nvmlLib)), globalUuids_(std::move(globalUuids)), p2pSupport_(std::move(p2pSupport)), processIdentifier_(processIdentifier) {} std::shared_ptr ContextImpl::createChannel( std::vector> connections, Endpoint /* unused */) { TP_DCHECK_EQ(numConnectionsNeeded(), connections.size()); return createChannelInternal( 
std::move(connections[0]), std::move(connections[1])); } size_t ContextImpl::numConnectionsNeeded() const { // The control connection needs to carry two unrelated streams in each // direction (the descriptors and the replies), and it's thus simpler to just // use two such connections. return 2; } bool ContextImpl::canCommunicateWithRemote( const std::string& localDeviceDescriptor, const std::string& remoteDeviceDescriptor) const { DeviceDescriptor nopLocalDeviceDescriptor = deserializeDeviceDescriptor(localDeviceDescriptor); DeviceDescriptor nopRemoteDeviceDescriptor = deserializeDeviceDescriptor(remoteDeviceDescriptor); if (nopLocalDeviceDescriptor.bootId != nopRemoteDeviceDescriptor.bootId) { return false; } // Disable CudaIpc when both endpoints are in the same process, as a CUDA IPC // handle cannot be opened in the same process in which it was created. if (nopLocalDeviceDescriptor.pid == nopRemoteDeviceDescriptor.pid) { return false; } int localGlobalIdx = globalIdxForDevice(globalUuids_, nopLocalDeviceDescriptor.deviceUuid); int remoteGlobalIdx = globalIdxForDevice(globalUuids_, nopRemoteDeviceDescriptor.deviceUuid); return p2pSupport_[localGlobalIdx][remoteGlobalIdx] && p2pSupport_[remoteGlobalIdx][localGlobalIdx]; } const CudaLib& ContextImpl::getCudaLib() { return cudaLib_; } void ContextImpl::allocateSlot( int deviceIdx, size_t length, SlotAllocCallback callback) { if (outboxes_.size() <= deviceIdx) { outboxes_.resize(deviceIdx + 1); } if (outboxes_[deviceIdx] == nullptr) { outboxes_[deviceIdx] = std::make_unique(deviceIdx); } // We don't need to wrap this callback with the callbackWrapper_ because the // callback that was passed to this method already is, and because all we're // doing here is wrap that callback and do read-only accesses to the outbox. 
Outbox& outbox = *outboxes_[deviceIdx]; outboxes_[deviceIdx]->allocator.alloc( length, [&outbox, callback{std::move(callback)}]( const Error& error, Allocator::TChunk chunk) { if (error) { callback(error, 0, std::move(chunk), nullptr); return; } size_t slotIdx = (chunk.get() - outbox.buffer.ptr()) / kSlotSize; callback( error, slotIdx, std::move(chunk), &outbox.events[slotIdx].value()); }); } ContextImpl::OutboxInfo ContextImpl::getLocalOutboxInfo(int deviceIdx) { TP_DCHECK(outboxes_.size() > deviceIdx); TP_DCHECK(outboxes_[deviceIdx] != nullptr); OutboxInfo info; info.processIdentifier = processIdentifier_; info.memHandle = std::string( reinterpret_cast(&outboxes_[deviceIdx]->handle), sizeof(cudaIpcMemHandle_t)); info.eventHandles.reserve(kNumSlots); for (size_t slotIdx = 0; slotIdx < kNumSlots; slotIdx++) { info.eventHandles.emplace_back( reinterpret_cast( &outboxes_[deviceIdx]->eventHandles[slotIdx]), sizeof(cudaIpcEventHandle_t)); } return info; } const ContextImpl::RemoteOutboxHandle& ContextImpl::openRemoteOutbox( int localDeviceIdx, int remoteDeviceIdx, OutboxInfo remoteOutboxInfo) { RemoteOutboxKey key{ std::move(remoteOutboxInfo.processIdentifier), remoteDeviceIdx, localDeviceIdx}; decltype(remoteOutboxes_)::iterator iter; bool didntExist; std::tie(iter, didntExist) = remoteOutboxes_.emplace(std::move(key), RemoteOutboxHandle{}); RemoteOutboxHandle& outbox = iter->second; if (didntExist) { CudaDeviceGuard guard(localDeviceIdx); outbox.buffer = CudaIpcBuffer( localDeviceIdx, *reinterpret_cast( remoteOutboxInfo.memHandle.data())); outbox.events = std::make_unique[]>(kNumSlots); for (size_t slotIdx = 0; slotIdx < kNumSlots; slotIdx++) { outbox.events[slotIdx].emplace( localDeviceIdx, *reinterpret_cast( remoteOutboxInfo.eventHandles[slotIdx].data())); } } return outbox; } void ContextImpl::handleErrorImpl() { for (std::unique_ptr& outbox : outboxes_) { if (outbox != nullptr) { outbox->allocator.close(); } } } void ContextImpl::joinImpl() {} bool 
ContextImpl::inLoop() const { return loop_.inLoop(); }; void ContextImpl::deferToLoop(std::function fn) { loop_.deferToLoop(std::move(fn)); }; } // namespace cuda_ipc } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cuda_ipc/context_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { namespace cuda_ipc { class ChannelImpl; class ContextImpl final : public ContextImplBoilerplate { public: static std::shared_ptr create(); ContextImpl( std::unordered_map deviceDescriptors, CudaLib cudaLib, NvmlLib nvmlLib, std::vector globalUuids, std::vector> p2pSupport, std::string processIdentifier); std::shared_ptr createChannel( std::vector> connections, Endpoint endpoint); size_t numConnectionsNeeded() const override; bool canCommunicateWithRemote( const std::string& localDeviceDescriptor, const std::string& remoteDeviceDescriptor) const override; const CudaLib& getCudaLib(); // Takes the index of the slot, the (smart) pointer to the slot, and the (raw) // pointer to the event for the slot. using SlotAllocCallback = std::function; void allocateSlot(int deviceIdx, size_t length, SlotAllocCallback callback); struct OutboxInfo { std::string processIdentifier; std::string memHandle; std::vector eventHandles; }; OutboxInfo getLocalOutboxInfo(int deviceIdx); struct RemoteOutboxHandle { CudaIpcBuffer buffer; std::unique_ptr[]> events; }; const RemoteOutboxHandle& openRemoteOutbox( int localDeviceIdx, int remoteDeviceIdx, OutboxInfo remoteOutboxInfo); // Implement the DeferredExecutor interface. 
bool inLoop() const override; void deferToLoop(std::function fn) override; protected: // Implement the entry points called by ContextImplBoilerplate. void handleErrorImpl() override; void joinImpl() override; private: OnDemandDeferredExecutor loop_; const CudaLib cudaLib_; const NvmlLib nvmlLib_; const std::vector globalUuids_; const std::vector> p2pSupport_; // A combination of the process's PID namespace and its PID, which combined // with the device index allows us to uniquely identify each staging buffer on // the current machine. const std::string processIdentifier_; // A CUDA on-device allocation that acts as the outbox for all the channels of // this context. We cannot directly get and open IPC handles of the user's // buffers, as this will fail if the user already opened such a handle (this // limitation was lifted in CUDA 11.1). Moreover, since we "leak" the opened // IPC handles (i.e., we leave them open, and close them all when the context // closes), if we opened an IPC handle to a user buffer and the user freed // that buffer we would prevent CUDA from really making that memory available // again (this is an undocumented behavior which was observed experimentally). // As a solution, we create our own allocation and get and open an IPC handle // to that, as we can guarantee its lifetime and that no other IPC handle // exists. We then use it as a staging ground for outgoing transfers, copying // chunks to it from source buffers, and having the remote copy them to the // target buffer. 
struct Outbox { const CudaDeviceBuffer buffer; std::unique_ptr[]> events; const cudaIpcMemHandle_t handle; const std::vector eventHandles; Allocator allocator; explicit Outbox(int deviceIdx); ~Outbox(); }; std::vector> outboxes_; struct RemoteOutboxKey { std::string processIdentifier; int remoteDeviceIdx; int localDeviceIdx; bool operator==(const RemoteOutboxKey& other) const noexcept { return processIdentifier == other.processIdentifier && remoteDeviceIdx == other.remoteDeviceIdx && localDeviceIdx == other.localDeviceIdx; } }; struct RemoteOutboxKeyHash { size_t operator()(const RemoteOutboxKey& key) const noexcept { size_t h1 = std::hash{}(key.processIdentifier); size_t h2 = std::hash{}(key.remoteDeviceIdx); size_t h3 = std::hash{}(key.localDeviceIdx); // Byte-shift hashes in order to "capture" the order of members. // FIXME Should we use a proper hash combiner? We can copy Boost's one. return h1 ^ (h2 << 1) ^ (h3 << 2); } }; std::unordered_map remoteOutboxes_; }; } // namespace cuda_ipc } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cuda_ipc/factory.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include namespace tensorpipe { namespace channel { namespace cuda_ipc { std::shared_ptr create() { return std::make_shared>(); } } // namespace cuda_ipc } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cuda_ipc/factory.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include namespace tensorpipe { namespace channel { namespace cuda_ipc { std::shared_ptr create(); } // namespace cuda_ipc } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cuda_xth/channel_impl.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { namespace cuda_xth { namespace { struct Descriptor { uintptr_t startEvent; uintptr_t srcPtr; int srcDeviceIdx; uintptr_t srcStream; NOP_STRUCTURE(Descriptor, startEvent, srcPtr, srcDeviceIdx, srcStream); }; } // namespace SendOperation::SendOperation( int deviceIdx, void* ptr, size_t length, cudaStream_t stream, TSendCallback callback) : deviceIdx(deviceIdx), ptr(ptr), length(length), stream(stream), callback(std::move(callback)), startEv(deviceIdx) { startEv.record(stream); } RecvOperation::RecvOperation( int deviceIdx, CudaBuffer buffer, size_t length, TRecvCallback callback) : ptr(buffer.ptr), length(length), deviceIdx(deviceIdx), stream(buffer.stream), callback(std::move(callback)) {} void RecvOperation::process() { { CudaDeviceGuard guard(deviceIdx); TP_CUDA_CHECK(cudaStreamWaitEvent(stream, startEvent, 0)); TP_CUDA_CHECK( cudaMemcpyAsync(ptr, srcPtr, length, cudaMemcpyDeviceToDevice, stream)); } CudaEvent stopEv(deviceIdx); stopEv.record(stream); stopEv.wait(srcStream, srcDeviceIdx); } ChannelImpl::ChannelImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::shared_ptr descriptorConnection, std::shared_ptr completionConnection) : ChannelImplBoilerplate( token, std::move(context), std::move(id)), descriptorConnection_(std::move(descriptorConnection)), 
completionConnection_(std::move(completionConnection)) {} void ChannelImpl::initImplFromLoop() { context_->enroll(*this); } void ChannelImpl::sendImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TSendCallback callback) { int deviceIdx = cudaDeviceForPointer( context_->getCudaLib(), buffer.unwrap().ptr); SendOpIter opIter = sendOps_.emplaceBack( sequenceNumber, deviceIdx, buffer.unwrap().ptr, length, buffer.unwrap().stream, std::move(callback)); sendOps_.advanceOperation(opIter); } void ChannelImpl::advanceSendOperation( SendOpIter opIter, SendOperation::State prevOpState) { TP_DCHECK(context_->inLoop()); SendOperation& op = *opIter; sendOps_.attemptTransition( opIter, /*from=*/SendOperation::UNINITIALIZED, /*to=*/SendOperation::FINISHED, /*cond=*/error_ || op.length == 0, /*actions=*/{&ChannelImpl::callSendCallback}); // Needs to go after previous op to ensure predictable and consistent ordering // of write calls on the descriptor control connection and read calls on the // completion control connection. 
sendOps_.attemptTransition( opIter, /*from=*/SendOperation::UNINITIALIZED, /*to=*/SendOperation::READING_COMPLETION, /*cond=*/!error_ && prevOpState >= SendOperation::READING_COMPLETION, /*actions=*/ {&ChannelImpl::writeDescriptor, &ChannelImpl::readCompletion}); sendOps_.attemptTransition( opIter, /*from=*/SendOperation::READING_COMPLETION, /*to=*/SendOperation::FINISHED, /*cond=*/op.doneReadingCompletion, /*actions=*/{&ChannelImpl::callSendCallback}); } void ChannelImpl::writeDescriptor(SendOpIter opIter) { SendOperation& op = *opIter; auto nopHolder = std::make_shared>(); Descriptor& nopDescriptor = nopHolder->getObject(); static_assert(std::is_pointer::value, ""); static_assert(std::is_pointer::value, ""); nopDescriptor.startEvent = reinterpret_cast(op.startEv.raw()); nopDescriptor.srcDeviceIdx = op.deviceIdx; nopDescriptor.srcPtr = reinterpret_cast(op.ptr); nopDescriptor.srcStream = reinterpret_cast(op.stream); TP_VLOG(6) << "Channel " << id_ << " is writing descriptor (#" << op.sequenceNumber << ")"; descriptorConnection_->write( *nopHolder, callbackWrapper_([sequenceNumber{op.sequenceNumber}, nopHolder](ChannelImpl& impl) { TP_VLOG(6) << "Channel " << impl.id_ << " done writing descriptor (#" << sequenceNumber << ")"; })); } void ChannelImpl::readCompletion(SendOpIter opIter) { SendOperation& op = *opIter; TP_VLOG(6) << "Channel " << id_ << " is reading completion (#" << op.sequenceNumber << ")"; completionConnection_->read( nullptr, 0, callbackWrapper_([opIter]( ChannelImpl& impl, const void* /* unused */, size_t /* unused */) { TP_VLOG(6) << "Channel " << impl.id_ << " done reading completion (#" << opIter->sequenceNumber << ")"; opIter->doneReadingCompletion = true; impl.sendOps_.advanceOperation(opIter); })); } void ChannelImpl::callSendCallback(SendOpIter opIter) { SendOperation& op = *opIter; op.callback(error_); // Reset callback to release the resources it was holding. 
op.callback = nullptr; } void ChannelImpl::recvImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TRecvCallback callback) { int deviceIdx = cudaDeviceForPointer( context_->getCudaLib(), buffer.unwrap().ptr); RecvOpIter opIter = recvOps_.emplaceBack( sequenceNumber, deviceIdx, buffer.unwrap(), length, std::move(callback)); recvOps_.advanceOperation(opIter); } void ChannelImpl::advanceRecvOperation( RecvOpIter opIter, RecvOperation::State prevOpState) { TP_DCHECK(context_->inLoop()); RecvOperation& op = *opIter; recvOps_.attemptTransition( opIter, /*from=*/RecvOperation::UNINITIALIZED, /*to=*/RecvOperation::FINISHED, /*cond=*/error_ || op.length == 0, /*actions=*/{&ChannelImpl::callRecvCallback}); // Needs to go after previous op to ensure predictable and consistent ordering // of read calls on the descriptor control connection. recvOps_.attemptTransition( opIter, /*from=*/RecvOperation::UNINITIALIZED, /*to=*/RecvOperation::READING_DESCRIPTOR, /*cond=*/!error_ && prevOpState >= RecvOperation::READING_DESCRIPTOR, /*actions=*/{&ChannelImpl::readDescriptor}); recvOps_.attemptTransition( opIter, /*from=*/RecvOperation::READING_DESCRIPTOR, /*to=*/RecvOperation::FINISHED, /*cond=*/error_ && op.doneReadingDescriptor, /*actions=*/{&ChannelImpl::callRecvCallback}); // Needs to go after previous op to ensure predictable and consistent ordering // of write calls on the completion control connection. 
recvOps_.attemptTransition( opIter, /*from=*/RecvOperation::READING_DESCRIPTOR, /*to=*/RecvOperation::FINISHED, /*cond=*/!error_ && op.doneReadingDescriptor && prevOpState >= RecvOperation::FINISHED, /*actions=*/ {&ChannelImpl::waitOnStartEventAndCopyAndSyncWithSourceStream, &ChannelImpl::callRecvCallback, &ChannelImpl::writeCompletion}); } void ChannelImpl::readDescriptor(RecvOpIter opIter) { RecvOperation& op = *opIter; TP_VLOG(6) << "Channel " << id_ << " is reading descriptor (#" << op.sequenceNumber << ")"; auto nopHolderIn = std::make_shared>(); descriptorConnection_->read( *nopHolderIn, callbackWrapper_([opIter, nopHolderIn](ChannelImpl& impl) { TP_VLOG(6) << "Channel " << impl.id_ << " done reading descriptor (#" << opIter->sequenceNumber << ")"; opIter->doneReadingDescriptor = true; if (!impl.error_) { Descriptor& nopDescriptor = nopHolderIn->getObject(); static_assert(std::is_pointer::value, ""); static_assert(std::is_pointer::value, ""); opIter->startEvent = reinterpret_cast(nopDescriptor.startEvent); opIter->srcPtr = reinterpret_cast(nopDescriptor.srcPtr); opIter->srcDeviceIdx = nopDescriptor.srcDeviceIdx; opIter->srcStream = reinterpret_cast(nopDescriptor.srcStream); } impl.recvOps_.advanceOperation(opIter); })); } void ChannelImpl::waitOnStartEventAndCopyAndSyncWithSourceStream( RecvOpIter opIter) { RecvOperation& op = *opIter; TP_VLOG(6) << "Channel " << id_ << " is copying payload (#" << op.sequenceNumber << ")"; op.process(); TP_VLOG(6) << "Channel " << id_ << " done copying payload (#" << op.sequenceNumber << ")"; } void ChannelImpl::callRecvCallback(RecvOpIter opIter) { RecvOperation& op = *opIter; op.callback(error_); // Reset callback to release the resources it was holding. 
op.callback = nullptr; } void ChannelImpl::writeCompletion(RecvOpIter opIter) { RecvOperation& op = *opIter; TP_VLOG(6) << "Channel " << id_ << " is writing completion (#" << op.sequenceNumber << ")"; completionConnection_->write( nullptr, 0, callbackWrapper_([sequenceNumber{op.sequenceNumber}](ChannelImpl& impl) { TP_VLOG(6) << "Channel " << impl.id_ << " done writing completion (#" << sequenceNumber << ")"; })); } void ChannelImpl::handleErrorImpl() { sendOps_.advanceAllOperations(); recvOps_.advanceAllOperations(); descriptorConnection_->close(); completionConnection_->close(); context_->unenroll(*this); } } // namespace cuda_xth } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cuda_xth/channel_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { namespace cuda_xth { class ContextImpl; struct SendOperation { enum State { UNINITIALIZED, READING_COMPLETION, FINISHED }; // Fields used by the state machine uint64_t sequenceNumber{0}; State state{UNINITIALIZED}; // Progress flags bool doneReadingCompletion{false}; // Arguments at creation int deviceIdx; void* ptr; size_t length; cudaStream_t stream; TSendCallback callback; // Other stuff CudaEvent startEv; SendOperation( int deviceIdx, void* ptr, size_t length, cudaStream_t stream, TSendCallback callback); }; struct RecvOperation { enum State { UNINITIALIZED, READING_DESCRIPTOR, FINISHED }; // Fields used by the state machine uint64_t sequenceNumber{0}; State state{UNINITIALIZED}; // Progress flags bool doneReadingDescriptor{false}; // Arguments at creation void* const ptr; const size_t length; const int deviceIdx; const cudaStream_t stream; TRecvCallback callback; // Other data cudaEvent_t startEvent; const void* srcPtr; int srcDeviceIdx; cudaStream_t srcStream; RecvOperation( int deviceIdx, CudaBuffer buffer, size_t length, TRecvCallback callback); void process(); }; class ChannelImpl final : public ChannelImplBoilerplate { public: ChannelImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::shared_ptr descriptorConnection, std::shared_ptr completionConnection); protected: // Implement the entry points called by ChannelImplBoilerplate. 
void initImplFromLoop() override; void sendImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TSendCallback callback) override; void recvImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TRecvCallback callback) override; void handleErrorImpl() override; private: const std::shared_ptr descriptorConnection_; const std::shared_ptr completionConnection_; OpsStateMachine sendOps_{ *this, &ChannelImpl::advanceSendOperation}; using SendOpIter = decltype(sendOps_)::Iter; OpsStateMachine recvOps_{ *this, &ChannelImpl::advanceRecvOperation}; using RecvOpIter = decltype(recvOps_)::Iter; // State machines for send and recv ops. void advanceSendOperation( SendOpIter opIter, SendOperation::State prevOpState); void advanceRecvOperation( RecvOpIter opIter, RecvOperation::State prevOpState); // Actions (i.e., methods that begin a state transition). // For send operations: void writeDescriptor(SendOpIter opIter); void readCompletion(SendOpIter opIter); void callSendCallback(SendOpIter opIter); // For recv operations: void readDescriptor(RecvOpIter opIter); void waitOnStartEventAndCopyAndSyncWithSourceStream(RecvOpIter opIter); void callRecvCallback(RecvOpIter opIter); void writeCompletion(RecvOpIter opIter); }; } // namespace cuda_xth } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cuda_xth/context_impl.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { namespace cuda_xth { std::shared_ptr ContextImpl::create() { Error error; CudaLib cudaLib; std::tie(error, cudaLib) = CudaLib::create(); if (error) { TP_VLOG(5) << "CUDA XTH channel is not viable because libcuda could not be loaded: " << error.what(); return nullptr; } std::ostringstream oss; auto bootID = getBootID(); TP_THROW_ASSERT_IF(!bootID) << "Unable to read boot_id"; auto nsID = getLinuxNamespaceId(LinuxNamespace::kPid); if (!nsID) { TP_VLOG(5) << "CUDA XTH channel is not viable because it couldn't determine the PID namespace ID"; return nullptr; } oss << bootID.value() << "_" << nsID.value() << "_" << ::getpid(); const std::string domainDescriptor = oss.str(); std::unordered_map deviceDescriptors; for (const auto& device : getCudaDevices(cudaLib)) { cudaDeviceProp props; TP_CUDA_CHECK(cudaGetDeviceProperties(&props, device.index)); // Unified addressing is required for cross-device `cudaMemcpyAsync()`. We // could lift this requirement by adding a fallback to // `cudaMemcpyPeerAsync()`. 
if (!props.unifiedAddressing) { TP_VLOG(4) << "CUDA XTH channel is not viable because CUDA device " << device.index << " does not have unified addressing"; return nullptr; } deviceDescriptors[device] = domainDescriptor; } if (deviceDescriptors.empty()) { return nullptr; } return std::make_shared( std::move(cudaLib), std::move(deviceDescriptors)); } ContextImpl::ContextImpl( CudaLib cudaLib, std::unordered_map deviceDescriptors) : ContextImplBoilerplate( std::move(deviceDescriptors)), cudaLib_(std::move(cudaLib)) {} std::shared_ptr ContextImpl::createChannel( std::vector> connections, Endpoint /* unused */) { TP_DCHECK_EQ(numConnectionsNeeded(), connections.size()); return createChannelInternal( std::move(connections[0]), std::move(connections[1])); } size_t ContextImpl::numConnectionsNeeded() const { return 2; } const CudaLib& ContextImpl::getCudaLib() { return cudaLib_; } void ContextImpl::handleErrorImpl() {} void ContextImpl::joinImpl() {} bool ContextImpl::inLoop() const { return loop_.inLoop(); }; void ContextImpl::deferToLoop(std::function fn) { loop_.deferToLoop(std::move(fn)); }; } // namespace cuda_xth } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cuda_xth/context_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include #include #include namespace tensorpipe { namespace channel { namespace cuda_xth { class ChannelImpl; class ContextImpl final : public ContextImplBoilerplate { public: static std::shared_ptr create(); ContextImpl( CudaLib cudaLib, std::unordered_map deviceDescriptors); std::shared_ptr createChannel( std::vector> connections, Endpoint endpoint); size_t numConnectionsNeeded() const override; const CudaLib& getCudaLib(); // Implement the DeferredExecutor interface. bool inLoop() const override; void deferToLoop(std::function fn) override; protected: // Implement the entry points called by ContextImplBoilerplate. void handleErrorImpl() override; void joinImpl() override; private: OnDemandDeferredExecutor loop_; const CudaLib cudaLib_; }; } // namespace cuda_xth } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cuda_xth/factory.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include namespace tensorpipe { namespace channel { namespace cuda_xth { std::shared_ptr create() { return std::make_shared>(); } } // namespace cuda_xth } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/cuda_xth/factory.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include namespace tensorpipe { namespace channel { namespace cuda_xth { std::shared_ptr create(); } // namespace cuda_xth } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/error.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include namespace tensorpipe { namespace channel { std::string ContextClosedError::what() const { return "context closed"; } std::string ChannelClosedError::what() const { return "channel closed"; } std::string ContextNotViableError::what() const { return "context not viable"; } } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/error.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include namespace tensorpipe { namespace channel { class ContextClosedError final : public BaseError { public: ContextClosedError() {} std::string what() const override; }; class ChannelClosedError final : public BaseError { public: ChannelClosedError() {} std::string what() const override; }; class ContextNotViableError final : public BaseError { public: ContextNotViableError() {} std::string what() const override; }; } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/helpers.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include namespace tensorpipe { namespace channel { std::string saveDescriptor(const AbstractNopHolder& object) { const size_t len = object.getSize(); std::string out(len, '\0'); NopWriter writer( const_cast(reinterpret_cast(out.data())), len); nop::Status status = object.write(writer); TP_THROW_ASSERT_IF(status.has_error()) << "Error saving descriptor: " << status.GetErrorMessage(); return out; } void loadDescriptor(AbstractNopHolder& object, const std::string& in) { const size_t len = in.size(); NopReader reader(reinterpret_cast(in.data()), len); nop::Status status = object.read(reader); TP_THROW_ASSERT_IF(status.has_error()) << "Error loading descriptor: " << status.GetErrorMessage(); } } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/helpers.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once // Note: never include this file from headers! #include #include namespace tensorpipe { namespace channel { std::string saveDescriptor(const AbstractNopHolder& object); void loadDescriptor(AbstractNopHolder& object, const std::string& in); } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/mpt/channel_impl.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { namespace mpt { ChannelImpl::ChannelImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::shared_ptr connection, Endpoint endpoint, uint64_t numLanes) : ChannelImplBoilerplate( token, std::move(context), std::move(id)), connection_(std::move(connection)), endpoint_(endpoint), numLanes_(numLanes), lanes_(numLanes_) {} void ChannelImpl::initImplFromLoop() { context_->enroll(*this); TP_DCHECK_EQ(state_, UNINITIALIZED); if (endpoint_ == Endpoint::kConnect) { state_ = CLIENT_READING_HELLO; auto nopHolderIn = std::make_shared>(); TP_VLOG(6) << "Channel " << id_ << " reading nop object (server hello)"; connection_->read( *nopHolderIn, callbackWrapper_([nopHolderIn](ChannelImpl& impl) { TP_VLOG(6) << "Channel " << impl.id_ << " done reading nop object (server hello)"; if (!impl.error_) { impl.onClientReadHelloOnConnection(nopHolderIn->getObject()); } })); } else if (endpoint_ == Endpoint::kListen) { state_ = SERVER_ACCEPTING_LANES; const std::vector& addresses = context_->addresses(); TP_DCHECK_EQ(addresses.size(), numLanes_); auto nopHolderOut = std::make_shared>(); Packet& nopPacket = nopHolderOut->getObject(); nopPacket.Become(nopPacket.index_of()); ServerHello& nopServerHello = *nopPacket.get(); for (uint64_t laneIdx = 0; laneIdx < numLanes_; ++laneIdx) { nopServerHello.laneAdvertisements.emplace_back(); LaneAdvertisement& nopLaneAdvertisement = nopServerHello.laneAdvertisements.back(); nopLaneAdvertisement.address = addresses[laneIdx]; TP_VLOG(6) << "Channel " << id_ << " requesting connection (for lane " << laneIdx << ")"; uint64_t token = context_->registerConnectionRequest( laneIdx, callbackWrapper_( [laneIdx]( ChannelImpl& impl, std::shared_ptr connection) { TP_VLOG(6) << "Channel " << impl.id_ << " done requesting connection (for lane " << laneIdx << ")"; if (!impl.error_) { impl.onServerAcceptOfLane(laneIdx, 
std::move(connection)); } })); laneRegistrationIds_.emplace(laneIdx, token); nopLaneAdvertisement.registrationId = token; numLanesBeingAccepted_++; } TP_VLOG(6) << "Channel " << id_ << " writing nop object (server hello)"; connection_->write( *nopHolderOut, callbackWrapper_([nopHolderOut](ChannelImpl& impl) { TP_VLOG(6) << "Channel " << impl.id_ << " done writing nop object (server hello)"; })); } else { TP_THROW_ASSERT() << "unknown endpoint"; } } void ChannelImpl::onClientReadHelloOnConnection(const Packet& nopPacketIn) { TP_DCHECK(context_->inLoop()); TP_DCHECK_EQ(state_, CLIENT_READING_HELLO); TP_DCHECK_EQ(nopPacketIn.index(), nopPacketIn.index_of()); const ServerHello& nopServerHello = *nopPacketIn.get(); TP_DCHECK_EQ(nopServerHello.laneAdvertisements.size(), numLanes_); lanes_.resize(numLanes_); for (uint64_t laneIdx = 0; laneIdx < numLanes_; ++laneIdx) { const LaneAdvertisement& nopLaneAdvertisement = nopServerHello.laneAdvertisements[laneIdx]; std::shared_ptr lane = context_->connect(laneIdx, nopLaneAdvertisement.address); auto nopHolderOut = std::make_shared>(); Packet& nopPacket = nopHolderOut->getObject(); nopPacket.Become(nopPacket.index_of()); ClientHello& nopClientHello = *nopPacket.get(); nopClientHello.registrationId = nopLaneAdvertisement.registrationId; TP_VLOG(6) << "Channel " << id_ << " writing nop object (client hello) on lane " << laneIdx; lane->write( *nopHolderOut, callbackWrapper_([laneIdx, nopHolderOut](ChannelImpl& impl) { TP_VLOG(6) << "Channel " << impl.id_ << " done writing nop object (client hello) on lane " << laneIdx; })); lanes_[laneIdx] = std::move(lane); } state_ = ESTABLISHED; sendOps_.advanceAllOperations(); recvOps_.advanceAllOperations(); } void ChannelImpl::onServerAcceptOfLane( uint64_t laneIdx, std::shared_ptr connection) { TP_DCHECK(context_->inLoop()); TP_DCHECK_EQ(state_, SERVER_ACCEPTING_LANES); TP_DCHECK(!lanes_[laneIdx]); TP_DCHECK_LT(laneIdx, lanes_.size()); lanes_[laneIdx] = std::move(connection); auto 
laneRegistrationIter = laneRegistrationIds_.find(laneIdx); TP_DCHECK(laneRegistrationIter != laneRegistrationIds_.end()); context_->unregisterConnectionRequest(laneRegistrationIter->second); laneRegistrationIds_.erase(laneRegistrationIter); numLanesBeingAccepted_--; if (numLanesBeingAccepted_ == 0) { state_ = ESTABLISHED; sendOps_.advanceAllOperations(); recvOps_.advanceAllOperations(); } } void ChannelImpl::sendImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TSendCallback callback) { SendOpIter opIter = sendOps_.emplaceBack(sequenceNumber); SendOperation& op = *opIter; op.ptr = buffer.unwrap().ptr; op.length = length; op.callback = std::move(callback); sendOps_.advanceOperation(opIter); } void ChannelImpl::advanceSendOperation( SendOpIter opIter, SendOperation::State prevOpState) { TP_DCHECK(context_->inLoop()); SendOperation& op = *opIter; sendOps_.attemptTransition( opIter, /*from=*/SendOperation::UNINITIALIZED, /*to=*/SendOperation::FINISHED, /*cond=*/error_ || op.length == 0, /*actions=*/{&ChannelImpl::callSendCallback}); // Needs to go after previous op to ensure predictable and consistent ordering // of write calls on lanes. sendOps_.attemptTransition( opIter, /*from=*/SendOperation::UNINITIALIZED, /*to=*/SendOperation::WRITING_CHUNKS, /*cond=*/!error_ && state_ == ESTABLISHED && prevOpState >= SendOperation::WRITING_CHUNKS, /*actions=*/{&ChannelImpl::writeChunks}); sendOps_.attemptTransition( opIter, /*from=*/SendOperation::WRITING_CHUNKS, /*to=*/SendOperation::FINISHED, /*cond=*/op.numChunksBeingWritten == 0, /*actions=*/{&ChannelImpl::callSendCallback}); } void ChannelImpl::writeChunks(SendOpIter opIter) { SendOperation& op = *opIter; for (uint64_t laneIdx = 0; laneIdx < lanes_.size(); laneIdx++) { // Insert "cutpoints" at equally-spaced intervals in the buffer, rounding // them down if they don't end up being at an integer position. 
uint64_t offsetStart = op.length * laneIdx / lanes_.size();
    uint64_t offsetEnd = op.length * (laneIdx + 1) / lanes_.size();
    // As void "has no size" we cannot do pointer arithmetic on it. We need to
    // temporarily convert the pointer to a type that has a size of 1 byte.
    // NOTE(review): the cast's template argument appears to have been lost in
    // extraction (presumably const uint8_t*) -- confirm against upstream.
    const void* ptr = reinterpret_cast(op.ptr) + offsetStart;
    uint64_t length = offsetEnd - offsetStart;

    // Write payload.
    TP_VLOG(6) << "Channel " << id_ << " writing payload #" << op.sequenceNumber
               << " on lane " << laneIdx;
    lanes_[laneIdx]->write(
        ptr, length, callbackWrapper_([opIter, laneIdx](ChannelImpl& impl) {
          TP_VLOG(6) << "Channel " << impl.id_ << " done writing payload #"
                     << opIter->sequenceNumber << " on lane " << laneIdx;
          // One fewer chunk in flight; re-advance the op so it can reach
          // FINISHED once the count hits zero.
          --opIter->numChunksBeingWritten;
          impl.sendOps_.advanceOperation(opIter);
        }));
    ++op.numChunksBeingWritten;
  }
}

// Invokes, then releases, the user's send callback with the channel's current
// error state.
void ChannelImpl::callSendCallback(SendOpIter opIter) {
  SendOperation& op = *opIter;
  op.callback(error_);
  // Reset callback to release the resources it was holding.
  op.callback = nullptr;
}

// Records a new recv operation and runs its state machine once.
void ChannelImpl::recvImplFromLoop(
    uint64_t sequenceNumber,
    Buffer buffer,
    size_t length,
    TRecvCallback callback) {
  RecvOpIter opIter = recvOps_.emplaceBack(sequenceNumber);
  RecvOperation& op = *opIter;
  op.ptr = buffer.unwrap().ptr;
  op.length = length;
  op.callback = std::move(callback);

  recvOps_.advanceOperation(opIter);
}

// State machine for recv ops: error/empty buffers finish immediately;
// otherwise chunks are read on all lanes once the channel is established.
void ChannelImpl::advanceRecvOperation(
    RecvOpIter opIter,
    RecvOperation::State prevOpState) {
  TP_DCHECK(context_->inLoop());
  RecvOperation& op = *opIter;

  recvOps_.attemptTransition(
      opIter,
      /*from=*/RecvOperation::UNINITIALIZED,
      /*to=*/RecvOperation::FINISHED,
      /*cond=*/error_ || op.length == 0,
      /*actions=*/{&ChannelImpl::callRecvCallback});

  // Needs to go after previous op to ensure predictable and consistent ordering
  // of read calls on lanes.
recvOps_.attemptTransition(
      opIter,
      /*from=*/RecvOperation::UNINITIALIZED,
      /*to=*/RecvOperation::READING_CHUNKS,
      /*cond=*/!error_ && state_ == ESTABLISHED &&
          prevOpState >= RecvOperation::READING_CHUNKS,
      /*actions=*/{&ChannelImpl::readChunks});

  // The op is done once every per-lane chunk read has completed.
  recvOps_.attemptTransition(
      opIter,
      /*from=*/RecvOperation::READING_CHUNKS,
      /*to=*/RecvOperation::FINISHED,
      /*cond=*/op.numChunksBeingRead == 0,
      /*actions=*/{&ChannelImpl::callRecvCallback});
}

// Splits the destination buffer into one contiguous chunk per lane and issues
// a read on each lane, mirroring writeChunks on the send side. The cutpoint
// arithmetic must match writeChunks exactly so both sides agree on chunk
// boundaries.
void ChannelImpl::readChunks(RecvOpIter opIter) {
  RecvOperation& op = *opIter;
  for (uint64_t laneIdx = 0; laneIdx < lanes_.size(); laneIdx++) {
    // Insert "cutpoints" at equally-spaced intervals in the buffer, rounding
    // them down if they don't end up being at an integer position.
    uint64_t offsetStart = op.length * laneIdx / lanes_.size();
    uint64_t offsetEnd = op.length * (laneIdx + 1) / lanes_.size();
    // As void "has no size" we cannot do pointer arithmetic on it. We need to
    // temporarily convert the pointer to a type that has a size of 1 byte.
    // NOTE(review): the cast's template argument appears to have been lost in
    // extraction (presumably uint8_t*) -- confirm against upstream.
    void* ptr = reinterpret_cast(op.ptr) + offsetStart;
    uint64_t length = offsetEnd - offsetStart;

    // Read payload.
    TP_VLOG(6) << "Channel " << id_ << " reading payload #" << op.sequenceNumber
               << " on lane " << laneIdx;
    lanes_[laneIdx]->read(
        ptr,
        length,
        callbackWrapper_([opIter, laneIdx](
                             ChannelImpl& impl,
                             const void* /* unused */,
                             size_t /* unused */) {
          TP_VLOG(6) << "Channel " << impl.id_ << " done reading payload #"
                     << opIter->sequenceNumber << " on lane " << laneIdx;
          // One fewer chunk in flight; re-advance the op so it can reach
          // FINISHED once the count hits zero.
          --opIter->numChunksBeingRead;
          impl.recvOps_.advanceOperation(opIter);
        }));
    ++op.numChunksBeingRead;
  }
}

// Invokes, then releases, the user's recv callback with the channel's current
// error state.
void ChannelImpl::callRecvCallback(RecvOpIter opIter) {
  RecvOperation& op = *opIter;
  op.callback(error_);
  // Reset callback to release the resources it was holding.
  op.callback = nullptr;
}

// Error path: drain both state machines, then abort all in-flight I/O.
void ChannelImpl::handleErrorImpl() {
  sendOps_.advanceAllOperations();
  recvOps_.advanceAllOperations();

  // Close the connections so that all current operations will be aborted.
This // will cause their callbacks to be invoked, and only then we'll invoke ours. connection_->close(); for (auto& lane : lanes_) { if (lane) { lane->close(); } } for (const auto& iter : laneRegistrationIds_) { context_->unregisterConnectionRequest(iter.second); } context_->unenroll(*this); } // TODO Implement setIdImpl to propagate the ID to the connections } // namespace mpt } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/mpt/channel_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { namespace mpt { class ContextImpl; // State capturing a single send operation. struct SendOperation { enum State { UNINITIALIZED, WRITING_CHUNKS, FINISHED }; // Fields used by the state machine uint64_t sequenceNumber{0}; State state{UNINITIALIZED}; // Progress flags int64_t numChunksBeingWritten{0}; // Arguments at creation const void* ptr; size_t length; TSendCallback callback; }; // State capturing a single recv operation. struct RecvOperation { enum State { UNINITIALIZED, READING_CHUNKS, FINISHED }; // Fields used by the state machine uint64_t sequenceNumber{0}; State state{UNINITIALIZED}; // Progress flags int64_t numChunksBeingRead{0}; // Arguments at creation void* ptr; size_t length; TRecvCallback callback; }; class ChannelImpl final : public ChannelImplBoilerplate { public: ChannelImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::shared_ptr connection, Endpoint endpoint, uint64_t numLanes); protected: // Implement the entry points called by ChannelImplBoilerplate. 
void initImplFromLoop() override; void sendImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TSendCallback callback) override; void recvImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TRecvCallback callback) override; void handleErrorImpl() override; private: enum State { UNINITIALIZED, CLIENT_READING_HELLO, SERVER_ACCEPTING_LANES, ESTABLISHED, }; // Called when client reads the server's hello on backbone connection void onClientReadHelloOnConnection(const Packet& nopPacketIn); // Called when server accepts new client connection for lane void onServerAcceptOfLane( uint64_t laneIdx, std::shared_ptr connection); const std::shared_ptr connection_; const Endpoint endpoint_; State state_{UNINITIALIZED}; const uint64_t numLanes_; uint64_t numLanesBeingAccepted_{0}; std::vector> lanes_; std::unordered_map laneRegistrationIds_; OpsStateMachine sendOps_{ *this, &ChannelImpl::advanceSendOperation}; using SendOpIter = decltype(sendOps_)::Iter; OpsStateMachine recvOps_{ *this, &ChannelImpl::advanceRecvOperation}; using RecvOpIter = decltype(recvOps_)::Iter; // State machines for send and recv ops. void advanceSendOperation( SendOpIter opIter, SendOperation::State prevOpState); void advanceRecvOperation( RecvOpIter opIter, RecvOperation::State prevOpState); // Actions (i.e., methods that begin a state transition). // For send operations: void writeChunks(SendOpIter opIter); void callSendCallback(SendOpIter opIter); // For recv operations: void readChunks(RecvOpIter opIter); void callRecvCallback(RecvOpIter opIter); }; } // namespace mpt } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/mpt/context_impl.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { namespace mpt { namespace { std::string generateDomainDescriptor( const std::vector>& contexts) { // FIXME Escape the contexts' domain descriptors in case they contain a colon? // Or put them all in a nop object, that'll do the escaping for us. // But is it okay to compare nop objects by equality bitwise? std::ostringstream ss; ss << contexts.size(); for (const auto& context : contexts) { ss << ":" << context->domainDescriptor(); } return ss.str(); } } // namespace std::shared_ptr ContextImpl::create( std::vector> contexts, std::vector> listeners) { for (const auto& context : contexts) { if (!context->isViable()) { return nullptr; } } std::unordered_map deviceDescriptors = { {Device{kCpuDeviceType, 0}, generateDomainDescriptor(contexts)}}; return std::make_shared( std::move(contexts), std::move(listeners), std::move(deviceDescriptors)); } ContextImpl::ContextImpl( std::vector> contexts, std::vector> listeners, std::unordered_map deviceDescriptors) : ContextImplBoilerplate( std::move(deviceDescriptors)), contexts_(std::move(contexts)), listeners_(std::move(listeners)) { TP_THROW_ASSERT_IF(contexts_.size() != listeners_.size()); numLanes_ = contexts_.size(); addresses_.reserve(numLanes_); for (const auto& listener : listeners_) { addresses_.emplace_back(listener->addr()); } } void ContextImpl::initImplFromLoop() { for (uint64_t laneIdx = 0; laneIdx < numLanes_; ++laneIdx) { acceptLane(laneIdx); } } std::shared_ptr ContextImpl::createChannel( std::vector> connections, Endpoint endpoint) { TP_DCHECK_EQ(numConnectionsNeeded(), connections.size()); return createChannelInternal(std::move(connections[0]), endpoint, numLanes_); } const std::vector& ContextImpl::addresses() const { // As this is an immutable member (after it has been initialized in // the constructor), we'll access it without deferring to the loop. 
return addresses_; } uint64_t ContextImpl::registerConnectionRequest( uint64_t laneIdx, connection_request_callback_fn fn) { TP_DCHECK(loop_.inLoop()); uint64_t registrationId = nextConnectionRequestRegistrationId_++; TP_VLOG(4) << "Channel context " << id_ << " received a connection request registration (#" << registrationId << ") on lane " << laneIdx; fn = [this, registrationId, fn{std::move(fn)}]( const Error& error, std::shared_ptr connection) { TP_VLOG(4) << "Channel context " << id_ << " calling a connection request registration callback (#" << registrationId << ")"; fn(error, std::move(connection)); TP_VLOG(4) << "Channel context " << id_ << " done calling a connection request registration callback (#" << registrationId << ")"; }; if (error_) { fn(error_, std::shared_ptr()); } else { connectionRequestRegistrations_.emplace(registrationId, std::move(fn)); } return registrationId; } void ContextImpl::unregisterConnectionRequest(uint64_t registrationId) { TP_DCHECK(loop_.inLoop()); TP_VLOG(4) << "Channel context " << id_ << " received a connection request de-registration (#" << registrationId << ")"; connectionRequestRegistrations_.erase(registrationId); } std::shared_ptr ContextImpl::connect( uint64_t laneIdx, std::string address) { TP_VLOG(4) << "Channel context " << id_ << " opening connection on lane " << laneIdx; return contexts_[laneIdx]->connect(std::move(address)); } void ContextImpl::acceptLane(uint64_t laneIdx) { TP_DCHECK(loop_.inLoop()); TP_VLOG(6) << "Channel context " << id_ << " accepting connection on lane " << laneIdx; listeners_[laneIdx]->accept( callbackWrapper_([laneIdx]( ContextImpl& impl, std::shared_ptr connection) { TP_VLOG(6) << "Channel context " << impl.id_ << " done accepting connection on lane " << laneIdx; if (impl.error_) { return; } impl.onAcceptOfLane(std::move(connection)); impl.acceptLane(laneIdx); })); } void ContextImpl::onAcceptOfLane( std::shared_ptr connection) { TP_DCHECK(loop_.inLoop()); // Keep it alive until we figure 
out what to do with it. connectionsWaitingForHello_.insert(connection); auto npHolderIn = std::make_shared>(); TP_VLOG(6) << "Channel context " << id_ << " reading nop object (client hello)"; connection->read( *npHolderIn, callbackWrapper_([npHolderIn, connection](ContextImpl& impl) mutable { TP_VLOG(6) << "Channel context " << impl.id_ << " done reading nop object (client hello)"; if (impl.error_) { return; } impl.connectionsWaitingForHello_.erase(connection); impl.onReadClientHelloOnLane( std::move(connection), npHolderIn->getObject()); })); } void ContextImpl::onReadClientHelloOnLane( std::shared_ptr connection, const Packet& nopPacketIn) { TP_DCHECK(loop_.inLoop()); TP_DCHECK_EQ(nopPacketIn.index(), nopPacketIn.index_of()); const ClientHello& nopClientHello = *nopPacketIn.get(); uint64_t registrationId = nopClientHello.registrationId; auto iter = connectionRequestRegistrations_.find(registrationId); // The connection request may have already been deregistered, for example // because the channel may have been closed. 
if (iter != connectionRequestRegistrations_.end()) { auto fn = std::move(iter->second); connectionRequestRegistrations_.erase(iter); fn(Error::kSuccess, std::move(connection)); } } void ContextImpl::handleErrorImpl() { for (auto& iter : connectionRequestRegistrations_) { connection_request_callback_fn fn = std::move(iter.second); fn(error_, std::shared_ptr()); } connectionRequestRegistrations_.clear(); for (const auto& connection : connectionsWaitingForHello_) { connection->close(); } connectionsWaitingForHello_.clear(); for (auto& listener : listeners_) { listener->close(); } for (auto& context : contexts_) { context->close(); } } void ContextImpl::setIdImpl() { for (uint64_t laneIdx = 0; laneIdx < numLanes_; ++laneIdx) { contexts_[laneIdx]->setId(id_ + ".ctx_" + std::to_string(laneIdx)); listeners_[laneIdx]->setId( id_ + ".ctx_" + std::to_string(laneIdx) + ".l_" + std::to_string(laneIdx)); } } void ContextImpl::joinImpl() { for (auto& context : contexts_) { context->join(); } } bool ContextImpl::inLoop() const { return loop_.inLoop(); }; void ContextImpl::deferToLoop(std::function fn) { loop_.deferToLoop(std::move(fn)); }; } // namespace mpt } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/mpt/context_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { namespace mpt { class ChannelImpl; class ContextImpl final : public ContextImplBoilerplate { public: static std::shared_ptr create( std::vector> contexts, std::vector> listeners); ContextImpl( std::vector> contexts, std::vector> listeners, std::unordered_map deviceDescriptors); std::shared_ptr createChannel( std::vector> connections, Endpoint endpoint); // Implement the DeferredExecutor interface. bool inLoop() const override; void deferToLoop(std::function fn) override; using connection_request_callback_fn = std::function)>; const std::vector& addresses() const; uint64_t registerConnectionRequest( uint64_t laneIdx, connection_request_callback_fn fn); void unregisterConnectionRequest(uint64_t registrationId); std::shared_ptr connect( uint64_t laneIdx, std::string address); protected: // Implement the entry points called by ContextImplBoilerplate. void initImplFromLoop() override; void handleErrorImpl() override; void joinImpl() override; void setIdImpl() override; private: OnDemandDeferredExecutor loop_; void acceptLane(uint64_t laneIdx); void onAcceptOfLane(std::shared_ptr connection); void onReadClientHelloOnLane( std::shared_ptr connection, const Packet& nopPacketIn); const std::vector> contexts_; const std::vector> listeners_; uint64_t numLanes_{0}; std::vector addresses_; uint64_t nextConnectionRequestRegistrationId_{0}; // Needed to keep them alive. std::unordered_set> connectionsWaitingForHello_; std::unordered_map connectionRequestRegistrations_; }; } // namespace mpt } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/mpt/factory.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include namespace tensorpipe { namespace channel { namespace mpt { std::shared_ptr create( std::vector> contexts, std::vector> listeners) { return std::make_shared>( std::move(contexts), std::move(listeners)); } } // namespace mpt } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/mpt/factory.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include namespace tensorpipe { namespace channel { namespace mpt { std::shared_ptr create( std::vector> contexts, std::vector> listeners); } // namespace mpt } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/mpt/nop_types.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include namespace tensorpipe { namespace channel { namespace mpt { struct LaneAdvertisement { // This pointless constructor is needed to work around a bug in GCC 5.5 (and // possibly other versions). It appears to be needed in the nop types that are // used inside std::vectors. 
LaneAdvertisement() {} std::string address; uint64_t registrationId; NOP_STRUCTURE(LaneAdvertisement, address, registrationId); }; struct ServerHello { std::vector laneAdvertisements; NOP_STRUCTURE(ServerHello, laneAdvertisements); }; struct ClientHello { uint64_t registrationId; NOP_STRUCTURE(ClientHello, registrationId); }; using Packet = nop::Variant; } // namespace mpt } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/xth/channel_impl.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { namespace xth { namespace { struct Descriptor { uint64_t ptr; NOP_STRUCTURE(Descriptor, ptr); }; } // namespace ChannelImpl::ChannelImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::shared_ptr descriptorConnection, std::shared_ptr completionConnection) : ChannelImplBoilerplate( token, std::move(context), std::move(id)), descriptorConnection_(std::move(descriptorConnection)), completionConnection_(std::move(completionConnection)) {} void ChannelImpl::initImplFromLoop() { context_->enroll(*this); } void ChannelImpl::sendImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TSendCallback callback) { SendOpIter opIter = sendOps_.emplaceBack(sequenceNumber); SendOperation& op = *opIter; op.ptr = buffer.unwrap().ptr; op.length = length; op.callback = std::move(callback); sendOps_.advanceOperation(opIter); } void ChannelImpl::advanceSendOperation( SendOpIter opIter, SendOperation::State prevOpState) { TP_DCHECK(context_->inLoop()); SendOperation& op = *opIter; sendOps_.attemptTransition( opIter, 
/*from=*/SendOperation::UNINITIALIZED, /*to=*/SendOperation::FINISHED, /*cond=*/error_ || op.length == 0, /*actions=*/{&ChannelImpl::callSendCallback}); // Needs to go after previous op to ensure predictable and consistent ordering // of write calls on the descriptor control connection and read calls on the // completion control connection. sendOps_.attemptTransition( opIter, /*from=*/SendOperation::UNINITIALIZED, /*to=*/SendOperation::READING_COMPLETION, /*cond=*/!error_ && prevOpState >= SendOperation::READING_COMPLETION, /*actions=*/ {&ChannelImpl::writeDescriptor, &ChannelImpl::readCompletion}); sendOps_.attemptTransition( opIter, /*from=*/SendOperation::READING_COMPLETION, /*to=*/SendOperation::FINISHED, /*cond=*/op.doneReadingCompletion, /*actions=*/{&ChannelImpl::callSendCallback}); } void ChannelImpl::writeDescriptor(SendOpIter opIter) { SendOperation& op = *opIter; auto nopHolder = std::make_shared>(); Descriptor& nopDescriptor = nopHolder->getObject(); nopDescriptor.ptr = reinterpret_cast(op.ptr); TP_VLOG(6) << "Channel " << id_ << " is writing descriptor (#" << op.sequenceNumber << ")"; descriptorConnection_->write( *nopHolder, callbackWrapper_([sequenceNumber{op.sequenceNumber}, nopHolder](ChannelImpl& impl) { TP_VLOG(6) << "Channel " << impl.id_ << " done writing descriptor (#" << sequenceNumber << ")"; })); } void ChannelImpl::readCompletion(SendOpIter opIter) { SendOperation& op = *opIter; TP_VLOG(6) << "Channel " << id_ << " is reading completion (#" << op.sequenceNumber << ")"; completionConnection_->read( nullptr, 0, callbackWrapper_([opIter]( ChannelImpl& impl, const void* /* unused */, size_t /* unused */) { TP_VLOG(6) << "Channel " << impl.id_ << " done reading completion (#" << opIter->sequenceNumber << ")"; opIter->doneReadingCompletion = true; impl.sendOps_.advanceOperation(opIter); })); } void ChannelImpl::callSendCallback(SendOpIter opIter) { SendOperation& op = *opIter; op.callback(error_); // Reset callback to release the resources it 
was holding. op.callback = nullptr; } void ChannelImpl::recvImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TRecvCallback callback) { RecvOpIter opIter = recvOps_.emplaceBack(sequenceNumber); RecvOperation& op = *opIter; op.ptr = buffer.unwrap().ptr; op.length = length; op.callback = std::move(callback); recvOps_.advanceOperation(opIter); } void ChannelImpl::advanceRecvOperation( RecvOpIter opIter, RecvOperation::State prevOpState) { TP_DCHECK(context_->inLoop()); RecvOperation& op = *opIter; recvOps_.attemptTransition( opIter, /*from=*/RecvOperation::UNINITIALIZED, /*to=*/RecvOperation::FINISHED, /*cond=*/error_ || op.length == 0, /*actions=*/{&ChannelImpl::callRecvCallback}); // Needs to go after previous op to ensure predictable and consistent ordering // of read calls on the descriptor control connection. recvOps_.attemptTransition( opIter, /*from=*/RecvOperation::UNINITIALIZED, /*to=*/RecvOperation::READING_DESCRIPTOR, /*cond=*/!error_ && prevOpState >= RecvOperation::READING_DESCRIPTOR, /*actions=*/{&ChannelImpl::readDescriptor}); recvOps_.attemptTransition( opIter, /*from=*/RecvOperation::READING_DESCRIPTOR, /*to=*/RecvOperation::FINISHED, /*cond=*/error_ && op.doneReadingDescriptor, /*actions=*/{&ChannelImpl::callRecvCallback}); recvOps_.attemptTransition( opIter, /*from=*/RecvOperation::READING_DESCRIPTOR, /*to=*/RecvOperation::COPYING, /*cond=*/!error_ && op.doneReadingDescriptor, /*actions=*/{&ChannelImpl::copy}); recvOps_.attemptTransition( opIter, /*from=*/RecvOperation::COPYING, /*to=*/RecvOperation::FINISHED, /*cond=*/error_ && op.doneCopying, /*actions=*/{&ChannelImpl::callRecvCallback}); // Needs to go after previous op to ensure predictable and consistent ordering // of write calls on the completion control connection. 
recvOps_.attemptTransition( opIter, /*from=*/RecvOperation::COPYING, /*to=*/RecvOperation::FINISHED, /*cond=*/!error_ && op.doneCopying && prevOpState >= RecvOperation::FINISHED, /*actions=*/ {&ChannelImpl::callRecvCallback, &ChannelImpl::writeCompletion}); } void ChannelImpl::readDescriptor(RecvOpIter opIter) { RecvOperation& op = *opIter; TP_VLOG(6) << "Channel " << id_ << " is reading descriptor (#" << op.sequenceNumber << ")"; auto nopHolderIn = std::make_shared>(); descriptorConnection_->read( *nopHolderIn, callbackWrapper_([opIter, nopHolderIn](ChannelImpl& impl) { TP_VLOG(6) << "Channel " << impl.id_ << " done reading descriptor (#" << opIter->sequenceNumber << ")"; opIter->doneReadingDescriptor = true; if (!impl.error_) { Descriptor& nopDescriptor = nopHolderIn->getObject(); opIter->remotePtr = reinterpret_cast(nopDescriptor.ptr); } impl.recvOps_.advanceOperation(opIter); })); } void ChannelImpl::copy(RecvOpIter opIter) { RecvOperation& op = *opIter; TP_VLOG(6) << "Channel " << id_ << " is copying payload (#" << op.sequenceNumber << ")"; context_->requestCopy( op.remotePtr, op.ptr, op.length, callbackWrapper_([opIter](ChannelImpl& impl) { TP_VLOG(6) << "Channel " << impl.id_ << " done copying payload (#" << opIter->sequenceNumber << ")"; opIter->doneCopying = true; impl.recvOps_.advanceOperation(opIter); })); } void ChannelImpl::callRecvCallback(RecvOpIter opIter) { RecvOperation& op = *opIter; op.callback(error_); // Reset callback to release the resources it was holding. 
op.callback = nullptr; } void ChannelImpl::writeCompletion(RecvOpIter opIter) { RecvOperation& op = *opIter; TP_VLOG(6) << "Channel " << id_ << " is writing completion (#" << op.sequenceNumber << ")"; completionConnection_->write( nullptr, 0, callbackWrapper_([sequenceNumber{op.sequenceNumber}](ChannelImpl& impl) { TP_VLOG(6) << "Channel " << impl.id_ << " done writing completion (#" << sequenceNumber << ")"; })); } void ChannelImpl::handleErrorImpl() { sendOps_.advanceAllOperations(); recvOps_.advanceAllOperations(); descriptorConnection_->close(); completionConnection_->close(); context_->unenroll(*this); } } // namespace xth } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/xth/channel_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include #include #include #include namespace tensorpipe { namespace channel { namespace xth { class ContextImpl; struct SendOperation { enum State { UNINITIALIZED, READING_COMPLETION, FINISHED }; // Fields used by the state machine uint64_t sequenceNumber{0}; State state{UNINITIALIZED}; // Progress flags bool doneReadingCompletion{false}; // Arguments at creation void* ptr; size_t length; TSendCallback callback; }; struct RecvOperation { enum State { UNINITIALIZED, READING_DESCRIPTOR, COPYING, FINISHED }; // Fields used by the state machine uint64_t sequenceNumber{0}; State state{UNINITIALIZED}; // Progress flags bool doneReadingDescriptor{false}; bool doneCopying{false}; // Arguments at creation void* ptr; size_t length; TRecvCallback callback; // Other data void* remotePtr; }; class ChannelImpl final : public ChannelImplBoilerplate { public: ChannelImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::shared_ptr descriptorConnection, std::shared_ptr completionConnection); protected: // Implement the entry points called by ChannelImplBoilerplate. void initImplFromLoop() override; void sendImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TSendCallback callback) override; void recvImplFromLoop( uint64_t sequenceNumber, Buffer buffer, size_t length, TRecvCallback callback) override; void handleErrorImpl() override; private: const std::shared_ptr descriptorConnection_; const std::shared_ptr completionConnection_; OpsStateMachine sendOps_{ *this, &ChannelImpl::advanceSendOperation}; using SendOpIter = decltype(sendOps_)::Iter; OpsStateMachine recvOps_{ *this, &ChannelImpl::advanceRecvOperation}; using RecvOpIter = decltype(recvOps_)::Iter; // State machines for send and recv ops. void advanceSendOperation( SendOpIter opIter, SendOperation::State prevOpState); void advanceRecvOperation( RecvOpIter opIter, RecvOperation::State prevOpState); // Actions (i.e., methods that begin a state transition). 
// For send operations: void writeDescriptor(SendOpIter opIter); void readCompletion(SendOpIter opIter); void callSendCallback(SendOpIter opIter); // For recv operations: void readDescriptor(RecvOpIter opIter); void copy(RecvOpIter opIter); void callRecvCallback(RecvOpIter opIter); void writeCompletion(RecvOpIter opIter); }; } // namespace xth } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/xth/context_impl.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { namespace xth { std::shared_ptr ContextImpl::create() { std::ostringstream oss; auto bootID = getBootID(); TP_THROW_ASSERT_IF(!bootID) << "Unable to read boot_id"; auto nsID = getLinuxNamespaceId(LinuxNamespace::kPid); if (!nsID.has_value()) { TP_VLOG(5) << "XTH channel is not viable because it couldn't determine the PID namespace ID"; return nullptr; } oss << bootID.value() << "_" << nsID.value() << "_" << ::getpid(); const std::string domainDescriptor = oss.str(); std::unordered_map deviceDescriptors = { {Device{kCpuDeviceType, 0}, domainDescriptor}}; return std::make_shared(std::move(deviceDescriptors)); } ContextImpl::ContextImpl( std::unordered_map deviceDescriptors) : ContextImplBoilerplate( std::move(deviceDescriptors)), requests_(std::numeric_limits::max()) { thread_ = std::thread(&ContextImpl::handleCopyRequests, this); } std::shared_ptr ContextImpl::createChannel( std::vector> connections, Endpoint /* unused */) { TP_DCHECK_EQ(numConnectionsNeeded(), connections.size()); return createChannelInternal( std::move(connections[0]), std::move(connections[1])); } size_t 
ContextImpl::numConnectionsNeeded() const {
  // The XTH channel uses two control connections per channel: one carrying
  // descriptors, one carrying completion notifications (see createChannel).
  return 2;
}

void ContextImpl::handleErrorImpl() {
  // Push an empty optional: handleCopyRequests treats it as the sentinel
  // that makes the copy thread exit its loop.
  requests_.push(nullopt);
}

void ContextImpl::joinImpl() {
  // handleErrorImpl has already queued the shutdown sentinel, so the copy
  // thread is guaranteed to terminate.
  thread_.join();
  // TP_DCHECK(requests_.empty());
}

bool ContextImpl::inLoop() const {
  return loop_.inLoop();
};

// NOTE(review): extraction stripped std::function's template arguments here
// and in requestCopy below — restore from the original file.
void ContextImpl::deferToLoop(std::function fn) {
  loop_.deferToLoop(std::move(fn));
};

// Queues an asynchronous memory copy of `length` bytes from `remotePtr` to
// `localPtr`; `fn` fires on the dedicated copy thread when the copy is done.
void ContextImpl::requestCopy(
    void* remotePtr,
    void* localPtr,
    size_t length,
    std::function fn) {
  // Atomic counter: requests may be issued from outside the loop.
  uint64_t requestId = nextRequestId_++;
  TP_VLOG(4) << "Channel context " << id_ << " received a copy request (#"
             << requestId << ")";
  // Wrap the user callback with entry/exit logging for debuggability.
  fn = [this, requestId, fn{std::move(fn)}](const Error& error) {
    TP_VLOG(4) << "Channel context " << id_
               << " is calling a copy request callback (#" << requestId << ")";
    fn(error);
    TP_VLOG(4) << "Channel context " << id_
               << " done calling a copy request callback (#" << requestId
               << ")";
  };
  requests_.push(CopyRequest{remotePtr, localPtr, length, std::move(fn)});
}

// Main body of the copy thread: pops requests in FIFO order until the
// nullopt sentinel (pushed by handleErrorImpl) is observed.
void ContextImpl::handleCopyRequests() {
  setThreadName("TP_XTH_loop");
  while (true) {
    auto maybeRequest = requests_.pop();
    if (!maybeRequest.has_value()) {
      break;
    }
    CopyRequest request = std::move(maybeRequest).value();
    // Don't even call memcpy on a length of 0 to avoid issues with the pointer
    // possibly being null.
    if (request.length > 0) {
      // Perform copy.
      std::memcpy(request.localPtr, request.remotePtr, request.length);
    }
    request.callback(Error::kSuccess);
  }
}

} // namespace xth
} // namespace channel
} // namespace tensorpipe

================================================
FILE: tensorpipe/channel/xth/context_impl.h
================================================
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
*/ #pragma once #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace channel { namespace xth { class ChannelImpl; class ContextImpl final : public ContextImplBoilerplate { public: static std::shared_ptr create(); explicit ContextImpl( std::unordered_map deviceDescriptors); std::shared_ptr createChannel( std::vector> connections, Endpoint endpoint); size_t numConnectionsNeeded() const override; // Implement the DeferredExecutor interface. bool inLoop() const override; void deferToLoop(std::function fn) override; using copy_request_callback_fn = std::function; void requestCopy( void* remotePtr, void* localPtr, size_t length, copy_request_callback_fn fn); protected: // Implement the entry points called by ContextImplBoilerplate. void handleErrorImpl() override; void joinImpl() override; private: OnDemandDeferredExecutor loop_; struct CopyRequest { void* remotePtr; void* localPtr; size_t length; copy_request_callback_fn callback; }; std::thread thread_; Queue> requests_; // This is atomic because it may be accessed from outside the loop. std::atomic nextRequestId_{0}; void handleCopyRequests(); }; } // namespace xth } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/xth/factory.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include namespace tensorpipe { namespace channel { namespace xth { std::shared_ptr create() { return std::make_shared>(); } } // namespace xth } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/channel/xth/factory.h ================================================ /* * Copyright (c) Meta Platforms, Inc. 
and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include namespace tensorpipe { namespace channel { namespace xth { std::shared_ptr create(); } // namespace xth } // namespace channel } // namespace tensorpipe ================================================ FILE: tensorpipe/common/address.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include namespace tensorpipe { std::tuple splitSchemeOfURL(const std::string& url) { std::string::size_type endOfScheme = url.find("://"); if (endOfScheme == std::string::npos) { TP_THROW_EINVAL() << "url has no scheme: " << url; } return std::make_tuple( url.substr(0, endOfScheme), url.substr(endOfScheme + 3)); } } // namespace tensorpipe ================================================ FILE: tensorpipe/common/address.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include namespace tensorpipe { std::tuple splitSchemeOfURL(const std::string& url); } ================================================ FILE: tensorpipe/common/allocator.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/

// NOTE(review): extraction stripped the three #include targets (presumably
// the allocator/defs/error-macro headers) — TODO restore.
#include #include #include

namespace tensorpipe {

// Fixed-pool chunk allocator: hands out equally-sized chunks carved out of a
// caller-owned buffer, queueing allocation callbacks while all chunks are in
// use. Not internally synchronized; callers provide any needed locking.
Allocator::Allocator(uint8_t* data, size_t numChunks, size_t chunkSize)
    : numChunks_(numChunks),
      chunkSize_(chunkSize),
      data_(data),
      chunkAvailable_(numChunks, true) {}

// Destruction closes the allocator, failing any still-queued callbacks.
Allocator::~Allocator() {
  close();
}

// Requests one chunk of at most chunkSize_ bytes; the callback fires with a
// chunk (possibly immediately, from inside this call) once one is free, or
// with AllocatorClosedError if the allocator is (or gets) closed.
void Allocator::alloc(size_t size, TAllocCallback callback) {
  // All chunks are the same size; a request may not exceed it.
  TP_DCHECK(size <= chunkSize_);
  pendingAllocations_.push_back(std::move(callback));
  processAllocations();
}

size_t Allocator::getChunkLength() const {
  return chunkSize_;
}

// Idempotent: marks the allocator closed and fails every queued request.
void Allocator::close() {
  if (closed_) {
    return;
  }
  closed_ = true;
  processAllocations();
}

// Drains the pending queue in FIFO order. When open, it stops at the first
// request that cannot be served (no free chunk); when closed, it fails every
// remaining request.
void Allocator::processAllocations() {
  while (!pendingAllocations_.empty()) {
    auto& callback = pendingAllocations_.front();
    if (closed_) {
      callback(TP_CREATE_ERROR(AllocatorClosedError), nullptr);
    } else {
      TChunk ptr = getAvailableChunk();
      if (!ptr) {
        // No chunk free: leave this and later requests queued.
        break;
      }
      callback(Error::kSuccess, std::move(ptr));
    }
    pendingAllocations_.pop_front();
  }
}

// Linear scan for a free chunk. The returned shared pointer's custom deleter
// routes destruction through releaseChunk, which recycles the slot.
Allocator::TChunk Allocator::getAvailableChunk() {
  for (size_t curChunk = 0; curChunk < numChunks_; ++curChunk) {
    if (chunkAvailable_[curChunk]) {
      chunkAvailable_[curChunk] = false;
      ++allocatedChunks_;
      return TChunk(data_ + curChunk * chunkSize_, [this](uint8_t* ptr) {
        releaseChunk(ptr);
      });
    }
  }
  return nullptr;
}

// Chunk deleter: recovers the chunk index from the pointer's offset into the
// pool, marks it free, and immediately retries queued allocations.
void Allocator::releaseChunk(uint8_t* ptr) {
  size_t chunkId = (ptr - data_) / chunkSize_;
  chunkAvailable_[chunkId] = true;
  --allocatedChunks_;
  processAllocations();
}

} // namespace tensorpipe

================================================
FILE: tensorpipe/common/allocator.h
================================================
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
*/ #pragma once #include #include #include #include #include #include #include namespace tensorpipe { class AllocatorClosedError final : public BaseError { std::string what() const override { return "allocator closed"; } }; class Allocator { public: // Note: this is a std::shared_ptr semantically. A shared_ptr with // array type is supported in C++17 and higher. using TChunk = std::shared_ptr; using TAllocCallback = std::function; explicit Allocator(uint8_t* data, size_t numChunks, size_t chunkSize); ~Allocator(); void alloc(size_t size, TAllocCallback callback); size_t getChunkLength() const; void close(); private: const size_t numChunks_; const size_t chunkSize_; uint8_t* const data_; std::vector chunkAvailable_; size_t allocatedChunks_{0}; std::deque pendingAllocations_; bool closed_{false}; void processAllocations(); TChunk getAvailableChunk(); void releaseChunk(uint8_t* ptr); }; } // namespace tensorpipe ================================================ FILE: tensorpipe/common/buffer.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include #include #include #include #include namespace tensorpipe { class Buffer { class AbstractBufferWrapper { public: virtual Device device() const = 0; virtual void copyConstructInto(void* ptr) const = 0; virtual void moveConstructInto(void* ptr) = 0; virtual ~AbstractBufferWrapper() = default; }; template class BufferWrapper : public AbstractBufferWrapper { static_assert( std::is_trivially_copyable::value, "wrapping non-trivially copyable class"); public: TBuffer buffer; explicit BufferWrapper(TBuffer buffer) : buffer(std::move(buffer)) {} Device device() const override { return buffer.getDevice(); } void copyConstructInto(void* ptr) const override { new (ptr) BufferWrapper(*this); } void moveConstructInto(void* ptr) override { new (ptr) BufferWrapper(std::move(*this)); } }; public: template /* implicit */ Buffer(TBuffer b) { static_assert( sizeof(BufferWrapper) <= kStructSize, "kStructSize too small"); static_assert( alignof(BufferWrapper) <= kStructAlign, "kStructAlign too small"); new (&raw_) BufferWrapper(std::move(b)); } Buffer() : Buffer(CpuBuffer{}) {} Buffer(const Buffer& other) { other.ptr()->copyConstructInto(&raw_); } Buffer& operator=(const Buffer& other) { if (this != &other) { ptr()->~AbstractBufferWrapper(); other.ptr()->copyConstructInto(&raw_); } return *this; } Buffer(Buffer&& other) noexcept { other.ptr()->moveConstructInto(&raw_); } Buffer& operator=(Buffer&& other) { if (this != &other) { ptr()->~AbstractBufferWrapper(); other.ptr()->moveConstructInto(&raw_); } return *this; } ~Buffer() { ptr()->~AbstractBufferWrapper(); } template TBuffer& unwrap() { BufferWrapper* wrapperPtr = dynamic_cast*>(ptr()); if (wrapperPtr == nullptr) { throw std::runtime_error("Invalid unwrapping of tensorpipe::Buffer"); } return wrapperPtr->buffer; } template const TBuffer& unwrap() const { const BufferWrapper* wrapperPtr = dynamic_cast*>(ptr()); if (wrapperPtr == nullptr) { throw std::runtime_error("Invalid unwrapping of 
tensorpipe::Buffer");
    }
    return wrapperPtr->buffer;
  }

  // Device (e.g. CPU vs. CUDA ordinal) of the wrapped buffer, obtained via
  // the type-erased wrapper.
  Device device() const {
    return ptr()->device();
  }

 private:
  // Inline storage for the type-erased wrapper; its size/alignment are
  // validated by the static_asserts in the converting constructor.
  static constexpr int kStructSize = 32;
  static constexpr int kStructAlign = 8;
  // NOTE(review): extraction stripped std::aligned_storage's template
  // arguments (presumably <kStructSize, kStructAlign>) — TODO restore.
  std::aligned_storage::type raw_{};

  const AbstractBufferWrapper* ptr() const {
    // FIXME: Once we go C++17, use std::launder on the returned pointer.
    // NOTE(review): reinterpret_cast also lost its template argument here.
    return reinterpret_cast(&raw_);
  }

  AbstractBufferWrapper* ptr() {
    // FIXME: Once we go C++17, use std::launder on the returned pointer.
    return reinterpret_cast(&raw_);
  }
};

} // namespace tensorpipe

================================================
FILE: tensorpipe/common/busy_polling_loop.h
================================================
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

// NOTE(review): extraction stripped the six #include targets — TODO restore.
#include #include #include #include #include #include

namespace tensorpipe {

// Event-loop executor whose thread busy-polls: it repeatedly calls
// pollOnce() and, when there is nothing to poll, runs deferred functions or
// yields the CPU. Subclasses supply the polling and shutdown predicates.
class BusyPollingLoop : public EventLoopDeferredExecutor {
 protected:
  // One polling pass; returns true if progress was made (keeps spinning hot).
  virtual bool pollOnce() = 0;

  // True once the loop may terminate after stopBusyPolling() was called.
  virtual bool readyToClose() = 0;

  // Requests termination; the loop exits once readyToClose() also holds.
  void stopBusyPolling() {
    closed_ = true;
    // No need to wake up the thread, since it is busy-waiting.
  }

  void eventLoop() override {
    while (!closed_ || !readyToClose()) {
      if (pollOnce()) {
        // continue
      } else if (deferredFunctionCount_ > 0) {
        // Run queued deferred functions and decrement by how many ran.
        deferredFunctionCount_ -= runDeferredFunctionsFromEventLoop();
      } else {
        // Nothing to do: be polite to other threads on this core.
        std::this_thread::yield();
      }
    }
  }

  void wakeupEventLoopToDeferFunction() override {
    ++deferredFunctionCount_;
    // No need to wake up the thread, since it is busy-waiting.
  }

 private:
  // NOTE(review): extraction stripped the std::atomic template arguments
  // (presumably <bool> and an integer type) — TODO restore.
  std::atomic closed_{false};
  std::atomic deferredFunctionCount_{0};
};

} // namespace tensorpipe

================================================
FILE: tensorpipe/common/callback.h
================================================
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
* * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace { // NOTE: This is an incomplete implementation of C++17's `std::apply`. template auto cbApply(F&& f, T&& t, std::index_sequence /*unused*/) { return f(std::get(std::forward(t))...); } template auto cbApply(F&& f, T&& t) { return cbApply( std::move(f), std::forward(t), std::make_index_sequence::value>{}); } } // namespace // A wrapper for a callback that "burns out" after it fires and thus needs to be // rearmed every time. Invocations that are triggered while the callback is // unarmed are stashed and will be delayed until a callback is provided again. template class RearmableCallback { using TFn = std::function; using TStoredArgs = std::tuple::type...>; public: void arm(TFn fn) { if (!args_.empty()) { TStoredArgs args{std::move(args_.front())}; args_.pop_front(); cbApply(std::move(fn), std::move(args)); } else { callbacks_.push_back(std::move(fn)); } } void trigger(Args... args) { if (!callbacks_.empty()) { TFn fn{std::move(callbacks_.front())}; callbacks_.pop_front(); cbApply(std::move(fn), std::tuple(std::forward(args)...)); } else { args_.emplace_back(std::forward(args)...); } } // This method is intended for "flushing" the callback, for example when an // error condition is reached which means that no more callbacks will be // processed but the current ones still must be honored. void triggerAll(std::function()> generator) { while (!callbacks_.empty()) { TFn fn{std::move(callbacks_.front())}; callbacks_.pop_front(); cbApply(std::move(fn), generator()); } } private: std::deque callbacks_; std::deque args_; }; // This class provides some boilerplate that is used by the pipe, the listener // and others when passing a callback to some lower-level component. 
// It will acquire a shared_ptr to the object (thus preventing the object from // being destroyed until the callback has been fired) and in case of error it // will deal with it but it will still end up invoking the actual callback. template class CallbackWrapper { public: CallbackWrapper( std::enable_shared_from_this& subject, DeferredExecutor& loop) : subject_(subject), loop_(loop) {} template auto operator()(TBoundFn fn) { return [this, subject{subject_.shared_from_this()}, fn{std::move(fn)}]( const Error& error, auto&&... args) mutable { this->entryPoint( std::move(subject), std::move(fn), error, std::forward(args)...); }; } private: std::enable_shared_from_this& subject_; DeferredExecutor& loop_; template void entryPoint( std::shared_ptr subject, TBoundFn fn, const Error& error, Args&&... args) { // Do *NOT* move subject into the lambda's closure, as the shared_ptr we're // holding may be the last one keeping subject alive, in which case it would // die once the lambda runs, and it might kill the loop in turn too, _while_ // the loop's deferToLoop method is running. That's bad. So copy it instead. // FIXME We're copying the args here... loop_.deferToLoop( [this, subject, fn{std::move(fn)}, error{error}, args...]() mutable { entryPointFromLoop( *subject, std::move(fn), error, std::forward(args)...); }); } template void entryPointFromLoop( TSubject& subject, TBoundFn fn, const Error& error, Args&&... args) { TP_DCHECK(loop_.inLoop()); subject.setError(error); // Proceed regardless of any error: this is why it's called "eager". fn(subject, std::forward(args)...); } }; } // namespace tensorpipe ================================================ FILE: tensorpipe/common/cpu_buffer.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include namespace tensorpipe { struct CpuBuffer { void* ptr{nullptr}; Device getDevice() const { return Device{kCpuDeviceType, 0}; } }; } // namespace tensorpipe ================================================ FILE: tensorpipe/common/cuda.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include #include #include #include #include #include #include #define TP_CUDA_CHECK(a) \ do { \ cudaError_t error = (a); \ TP_THROW_ASSERT_IF(cudaSuccess != error) \ << __TP_EXPAND_OPD(a) << " " << cudaGetErrorName(error) << " (" \ << cudaGetErrorString(error) << ")"; \ } while (false) namespace tensorpipe { class CudaError final : public BaseError { public: explicit CudaError(cudaError_t error) : error_(error) {} std::string what() const override { return std::string(cudaGetErrorString(error_)); } private: cudaError_t error_; }; class CudaDeviceGuard { public: CudaDeviceGuard() = delete; CudaDeviceGuard(const CudaDeviceGuard&) = delete; CudaDeviceGuard(CudaDeviceGuard&&) = delete; CudaDeviceGuard& operator=(const CudaDeviceGuard&) = delete; CudaDeviceGuard& operator=(CudaDeviceGuard&&) = delete; explicit CudaDeviceGuard(int device) { TP_CUDA_CHECK(cudaGetDevice(&device_)); TP_CUDA_CHECK(cudaSetDevice(device)); } ~CudaDeviceGuard() { TP_CUDA_CHECK(cudaSetDevice(device_)); } private: int device_; }; class CudaEvent { public: CudaEvent() = delete; CudaEvent(const CudaEvent&) = delete; CudaEvent(CudaEvent&&) = delete; CudaEvent& operator=(const CudaEvent&) = delete; CudaEvent& operator=(CudaEvent&&) = delete; explicit CudaEvent(int device, bool interprocess = false) : deviceIdx_(device) { CudaDeviceGuard guard(deviceIdx_); int flags = cudaEventDisableTiming; if (interprocess) { flags |= 
cudaEventInterprocess; } TP_CUDA_CHECK(cudaEventCreateWithFlags(&ev_, flags)); } explicit CudaEvent(int device, cudaIpcEventHandle_t handle) : deviceIdx_(device) { // It could crash if we don't set device when creating events from handles CudaDeviceGuard guard(deviceIdx_); TP_CUDA_CHECK(cudaIpcOpenEventHandle(&ev_, handle)); } void record(cudaStream_t stream) { CudaDeviceGuard guard(deviceIdx_); TP_CUDA_CHECK(cudaEventRecord(ev_, stream)); } void wait(cudaStream_t stream, int device) { CudaDeviceGuard guard(device); TP_CUDA_CHECK(cudaStreamWaitEvent(stream, ev_, 0)); } bool query() const { CudaDeviceGuard guard(deviceIdx_); cudaError_t res = cudaEventQuery(ev_); if (res == cudaErrorNotReady) { return false; } TP_CUDA_CHECK(res); return true; } cudaEvent_t raw() { return ev_; } cudaIpcEventHandle_t getIpcHandle() const { CudaDeviceGuard guard(deviceIdx_); cudaIpcEventHandle_t handle; TP_CUDA_CHECK(cudaIpcGetEventHandle(&handle, ev_)); return handle; } std::string serializedHandle() { cudaIpcEventHandle_t handle = getIpcHandle(); return std::string(reinterpret_cast(&handle), sizeof(handle)); } ~CudaEvent() { CudaDeviceGuard guard(deviceIdx_); TP_CUDA_CHECK(cudaEventDestroy(ev_)); } private: cudaEvent_t ev_; int deviceIdx_; }; inline int cudaDeviceForPointer(const CudaLib& cudaLib, const void* ptr) { // When calling cudaSetDevice(0) when device 0 hasn't been initialized yet // the CUDA runtime sets the current context of the CUDA driver to what's // apparently an invalid non-null value. This causes cudaPointerGetAttributes // to misbehave (possibly other functions too, but this is the only function // that we call outside of a device guard). In fact, device guards are likely // the reason we call cudaSetDevice(0) at all, because at destruction they // reset the current device to the value it had before construction, and that // will be zero if no other device guard was active at that point. 
// The ugly workaround is to manually undo the runtime's errors, by clearing // the driver's current context. In a sense, by creating a "reverse" guard. CUcontext ctx; TP_CUDA_DRIVER_CHECK(cudaLib, cudaLib.ctxGetCurrent(&ctx)); TP_CUDA_DRIVER_CHECK(cudaLib, cudaLib.ctxSetCurrent(nullptr)); int deviceIdx; TP_CUDA_DRIVER_CHECK( cudaLib, cudaLib.pointerGetAttribute( &deviceIdx, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, reinterpret_cast(ptr))); TP_CUDA_DRIVER_CHECK(cudaLib, cudaLib.ctxSetCurrent(ctx)); return deviceIdx; } class CudaPinnedMemoryDeleter { public: explicit CudaPinnedMemoryDeleter(int deviceIdx) : deviceIdx_(deviceIdx) {} void operator()(uint8_t* ptr) { CudaDeviceGuard guard(deviceIdx_); TP_CUDA_CHECK(cudaFreeHost(ptr)); } private: const int deviceIdx_; }; using CudaPinnedBuffer = std::unique_ptr; inline CudaPinnedBuffer makeCudaPinnedBuffer(size_t length, int deviceIdx) { CudaDeviceGuard guard(deviceIdx); uint8_t* ptr; TP_CUDA_CHECK(cudaMallocHost(&ptr, length)); return CudaPinnedBuffer(ptr, CudaPinnedMemoryDeleter(deviceIdx)); } class CudaDeviceBuffer { public: CudaDeviceBuffer() = default; CudaDeviceBuffer(size_t length, int deviceIdx) { CudaDeviceGuard guard(deviceIdx); uint8_t* ptr; TP_CUDA_CHECK(cudaMalloc(&ptr, length)); ptr_ = {ptr, Deleter{deviceIdx}}; } uint8_t* ptr() const { return ptr_.get(); } int deviceIdx() const { return ptr_.get_deleter().deviceIdx; } void reset() { ptr_.reset(); } cudaIpcMemHandle_t getIpcHandle() const { CudaDeviceGuard guard(deviceIdx()); cudaIpcMemHandle_t handle; TP_CUDA_CHECK(cudaIpcGetMemHandle(&handle, ptr_.get())); return handle; } private: struct Deleter { int deviceIdx; void operator()(uint8_t* ptr) { CudaDeviceGuard guard(deviceIdx); TP_CUDA_CHECK(cudaFree(ptr)); } }; std::unique_ptr ptr_; }; class CudaIpcBuffer { public: CudaIpcBuffer() = default; CudaIpcBuffer(int deviceIdx, const cudaIpcMemHandle_t& handle) { CudaDeviceGuard guard(deviceIdx); void* ptr; TP_CUDA_CHECK( cudaIpcOpenMemHandle(&ptr, handle, 
cudaIpcMemLazyEnablePeerAccess)); ptr_ = {reinterpret_cast(ptr), Deleter{deviceIdx}}; } uint8_t* ptr() const { return ptr_.get(); } int deviceIdx() const { return ptr_.get_deleter().deviceIdx; } void reset() { ptr_.reset(); } private: struct Deleter { int deviceIdx; void operator()(uint8_t* ptr) { CudaDeviceGuard guard(deviceIdx); TP_CUDA_CHECK(cudaIpcCloseMemHandle(ptr)); } }; std::unique_ptr ptr_; }; inline std::string getUuidOfDevice(const CudaLib& cudaLib, int deviceIdx) { CUdevice device; TP_CUDA_DRIVER_CHECK(cudaLib, cudaLib.deviceGet(&device, deviceIdx)); CUuuid uuid; TP_CUDA_DRIVER_CHECK(cudaLib, cudaLib.deviceGetUuid(&uuid, device)); // The CUDA driver and NVML choose two different format for UUIDs, hence we // need to reconcile them. We do so using the most human readable format, that // is "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee" (8-4-4-4-12). std::ostringstream uuidSs; uuidSs << std::hex << std::setfill('0'); for (int j = 0; j < 16; ++j) { // The bitmask is required otherwise a negative value will get promoted to // (signed) int with sign extension if char is signed. uuidSs << std::setw(2) << (uuid.bytes[j] & 0xff); if (j == 3 || j == 5 || j == 7 || j == 9) { uuidSs << '-'; } } std::string uuidStr = uuidSs.str(); TP_THROW_ASSERT_IF(!isValidUuid(uuidStr)) << "Couldn't obtain valid UUID for GPU #" << deviceIdx << " from CUDA driver. 
Got: " << uuidStr; return uuidStr; } inline std::vector getUuidsOfVisibleDevices( const CudaLib& cudaLib) { int deviceCount; TP_CUDA_DRIVER_CHECK(cudaLib, cudaLib.deviceGetCount(&deviceCount)); std::vector result(deviceCount); for (int devIdx = 0; devIdx < deviceCount; ++devIdx) { result[devIdx] = getUuidOfDevice(cudaLib, devIdx); } return result; } inline std::vector getCudaDevices(const CudaLib& cudaLib) { int deviceCount; TP_CUDA_DRIVER_CHECK(cudaLib, cudaLib.deviceGetCount(&deviceCount)); std::vector result(deviceCount); for (int devIdx = 0; devIdx < deviceCount; ++devIdx) { result[devIdx] = Device{kCudaDeviceType, devIdx}; } return result; } } // namespace tensorpipe ================================================ FILE: tensorpipe/common/cuda_buffer.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include namespace tensorpipe { Device CudaBuffer::getDevice() const { static CudaLib cudaLib = []() { Error error; CudaLib lib; std::tie(error, lib) = CudaLib::create(); TP_THROW_ASSERT_IF(error) << "Cannot get CUDA device for pointer because libcuda could not be loaded: " << error.what(); return lib; }(); return Device{kCudaDeviceType, cudaDeviceForPointer(cudaLib, ptr)}; } } // namespace tensorpipe ================================================ FILE: tensorpipe/common/cuda_buffer.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include namespace tensorpipe { struct CudaBuffer { void* ptr{nullptr}; cudaStream_t stream{cudaStreamDefault}; Device getDevice() const; }; } // namespace tensorpipe ================================================ FILE: tensorpipe/common/cuda_lib.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #define TP_CUDA_DRIVER_CHECK(cuda_lib, a) \ do { \ CUresult error = (a); \ if (error != CUDA_SUCCESS) { \ CUresult res; \ const char* errorName; \ const char* errorStr; \ res = cuda_lib.getErrorName(error, &errorName); \ TP_THROW_ASSERT_IF(res != CUDA_SUCCESS); \ res = cuda_lib.getErrorString(error, &errorStr); \ TP_THROW_ASSERT_IF(res != CUDA_SUCCESS); \ TP_THROW_ASSERT() << __TP_EXPAND_OPD(a) << " " << errorName << " (" \ << errorStr << ")"; \ } \ } while (false) namespace tensorpipe { class NoDevicesError final : public BaseError { public: std::string what() const override { return "The CUDA driver failed to init because it didn't find any device"; } }; // Master list of all symbols we care about from libcuda. #define TP_FORALL_CUDA_SYMBOLS(_) \ _(ctxGetCurrent, cuCtxGetCurrent, (CUcontext*)) \ _(ctxSetCurrent, cuCtxSetCurrent, (CUcontext)) \ _(deviceGet, cuDeviceGet, (CUdevice*, int)) \ _(deviceGetCount, cuDeviceGetCount, (int*)) \ _(deviceGetUuid, cuDeviceGetUuid, (CUuuid*, CUdevice)) \ _(getErrorName, cuGetErrorName, (CUresult, const char**)) \ _(getErrorString, cuGetErrorString, (CUresult, const char**)) \ _(init, cuInit, (unsigned int)) \ _(memGetAddressRange_v2, \ cuMemGetAddressRange_v2, \ (CUdeviceptr*, size_t*, CUdeviceptr)) \ _(pointerGetAttribute, \ cuPointerGetAttribute, \ (void*, CUpointer_attribute, CUdeviceptr)) // Wrapper for libcuda. 
class CudaLib { private: explicit CudaLib(DynamicLibraryHandle dlhandle) : dlhandle_(std::move(dlhandle)) {} DynamicLibraryHandle dlhandle_; #define TP_DECLARE_FIELD(method_name, function_name, args_types) \ CUresult(*function_name##_ptr_) args_types = nullptr; TP_FORALL_CUDA_SYMBOLS(TP_DECLARE_FIELD) #undef TP_DECLARE_FIELD public: CudaLib() = default; #define TP_FORWARD_CALL(method_name, function_name, args_types) \ template \ auto method_name(Args&&... args) const { \ return (*function_name##_ptr_)(std::forward(args)...); \ } TP_FORALL_CUDA_SYMBOLS(TP_FORWARD_CALL) #undef TP_FORWARD_CALL static std::tuple create() { Error error; DynamicLibraryHandle dlhandle; // To keep things "neat" and contained, we open in "local" mode (as // opposed to global) so that the cuda symbols can only be resolved // through this handle and are not exposed (a.k.a., "leaked") to other // shared objects. std::tie(error, dlhandle) = DynamicLibraryHandle::create("libcuda.so.1", RTLD_LOCAL | RTLD_LAZY); if (error) { return std::make_tuple(std::move(error), CudaLib()); } // Log at level 9 as we can't know whether this will be used in a transport // or channel, thus err on the side of this being as low-level as possible // because we don't expect this to be of interest that often. 
TP_VLOG(9) << [&]() -> std::string { std::string filename; std::tie(error, filename) = dlhandle.getFilename(); if (error) { return "Couldn't determine location of shared library libcuda.so.1: " + error.what(); } return "Found shared library libcuda.so.1 at " + filename; }(); CudaLib lib(std::move(dlhandle)); #define TP_LOAD_SYMBOL(method_name, function_name, args_types) \ { \ void* ptr; \ std::tie(error, ptr) = lib.dlhandle_.loadSymbol(#function_name); \ if (error) { \ return std::make_tuple(std::move(error), CudaLib()); \ } \ TP_THROW_ASSERT_IF(ptr == nullptr); \ lib.function_name##_ptr_ = \ reinterpret_cast(ptr); \ } TP_FORALL_CUDA_SYMBOLS(TP_LOAD_SYMBOL) #undef TP_LOAD_SYMBOL CUresult result = lib.init(0); // If the driver doesn't find any devices it fails to init (beats me why) // but we must support this case, by disabling the channels, rather than // throwing. Hence we treat it as if we couldn't find the driver. if (result == CUDA_ERROR_NO_DEVICE) { return std::make_tuple(TP_CREATE_ERROR(NoDevicesError), CudaLib()); } TP_CUDA_DRIVER_CHECK(lib, result); return std::make_tuple(Error::kSuccess, std::move(lib)); } CUresult memGetAddressRange( CUdeviceptr* pbase, size_t* psize, CUdeviceptr dptr) const { // NOTE: We are forwarding to cuMemGetAddressRange_v2() directly, because // the name cuMemGetAddressRange is #defined to its _v2 variant in cuda.h. // Calling the actual cuMemGetAddressRange() function here would lead to a // CUDA_ERROR_INVALID_CONTEXT. return memGetAddressRange_v2(pbase, psize, dptr); } }; #undef TP_FORALL_CUDA_SYMBOLS } // namespace tensorpipe ================================================ FILE: tensorpipe/common/cuda_loop.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include namespace tensorpipe { namespace { struct CudaCallback { CudaLoop& loop; std::function callback; CudaCallback(CudaLoop& loop, std::function callback) : loop(loop), callback(std::move(callback)) {} }; class CudaLoopClosedError final : public BaseError { std::string what() const override { return "CUDA loop already closed"; } }; } // namespace CudaLoop::CudaLoop() { thread_ = std::thread([this]() { setThreadName("TP_CUDA_callback_loop"); processCallbacks(); }); } CudaLoop::~CudaLoop() { join(); } void CudaLoop::join() { close(); if (!joined_.exchange(true)) { thread_.join(); } } void CudaLoop::close() { std::unique_lock lock(mutex_); if (closed_) { return; } closed_ = true; cv_.notify_all(); } void CudaLoop::processCallbacks() { for (;;) { std::deque operations; { std::unique_lock lock(mutex_); if (operations_.empty()) { if (closed_ && pendingOperations_ == 0) { break; } else { cv_.wait(lock); } } std::swap(operations, operations_); pendingOperations_ -= operations.size(); } for (auto& op : operations) { op.callback(op.error); } } } void CudaLoop::addCallback( int device, cudaStream_t stream, std::function callback) { { std::unique_lock lock(mutex_); if (closed_) { callback(TP_CREATE_ERROR(CudaLoopClosedError)); return; } ++pendingOperations_; } auto cudaCallback = std::make_unique(*this, std::move(callback)); CudaDeviceGuard guard(device); TP_CUDA_CHECK(cudaStreamAddCallback( stream, runCudaCallback, cudaCallback.release(), 0)); } void CUDART_CB CudaLoop::runCudaCallback( cudaStream_t /* unused */, cudaError_t cudaError, void* callbackPtr) { std::unique_ptr cudaCallback( reinterpret_cast(callbackPtr)); CudaLoop& loop = cudaCallback->loop; { std::unique_lock lock(loop.mutex_); auto error = Error::kSuccess; if (cudaError != cudaSuccess) { error = TP_CREATE_ERROR(CudaError, cudaError); } loop.operations_.push_back( {std::move(cudaCallback->callback), std::move(error)}); loop.cv_.notify_all(); } cudaCallback.reset(); } } // namespace 
tensorpipe ================================================ FILE: tensorpipe/common/cuda_loop.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include #include #include namespace tensorpipe { class CudaLoop { struct Operation { std::function callback; Error error; }; public: CudaLoop(); ~CudaLoop(); void join(); void close(); void addCallback( int device, cudaStream_t stream, std::function callback); private: std::thread thread_; std::deque operations_; std::mutex mutex_; std::condition_variable cv_; uint64_t pendingOperations_{0}; bool closed_{false}; std::atomic joined_{false}; void processCallbacks(); // Proxy static method for cudaStreamAddCallback(), which does not accept // lambdas. static void CUDART_CB runCudaCallback( cudaStream_t stream, cudaError_t cudaError, void* callbackPtr); }; } // namespace tensorpipe ================================================ FILE: tensorpipe/common/deferred_executor.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { // Dealing with thread-safety using per-object mutexes is prone to deadlocks // because of reentrant calls (both "upward", when invoking a callback that // calls back into a method of the object, and "downward", when passing a // callback to an operation of another object that calls it inline) and lock // inversions (object A calling a method of object B and attempting to acquire // its lock, with the reverse happening at the same time). Using a "loop" model, // where operations aren't called inlined and piled up on the stack but instead // deferred to a later iteration of the loop, solves many of these issues. This // abstract interface defines the essential methods we need such event loops to // provide. class DeferredExecutor { public: using TTask = std::function; virtual void deferToLoop(TTask fn) = 0; virtual bool inLoop() const = 0; // Prefer using deferToLoop over runInLoop when you don't need to wait for the // result. template void runInLoop(F&& fn) { // When called from the event loop thread itself (e.g., from a callback), // deferring would cause a deadlock because the given callable can only be // run when the loop is allowed to proceed. On the other hand, it means it // is thread-safe to run it immediately. The danger here however is that it // can lead to an inconsistent order between operations run from the event // loop, from outside of it, and deferred. if (inLoop()) { fn(); } else { // Must use a copyable wrapper around std::promise because // we use it from a std::function which must be copyable. auto promise = std::make_shared>(); auto future = promise->get_future(); // Marked as mutable because the fn might hold some state (e.g., the // closure of a lambda) which it might want to modify. deferToLoop([promise, fn{std::forward(fn)}]() mutable { try { fn(); promise->set_value(); } catch (...) 
{ promise->set_exception(std::current_exception()); } }); future.get(); } } virtual ~DeferredExecutor() = default; }; // Transports typically have their own thread they can use as deferred executors // but many objects (like pipes) don't naturally own threads and introducing // them would also mean introducing latency costs due to context switching. // In order to give these objects a loop they can use to defer their operations // to, we can have them temporarily hijack the calling thread and repurpose it // to run an ephemeral loop on which to run the original task and all the ones // that a task running on the loop chooses to defer to a later iteration of the // loop, recursively. Once all these tasks have been completed, the makeshift // loop is dismantled and control of the thread is returned to the caller. // FIXME Rename this to OnDemandDeferredExecutor? class OnDemandDeferredExecutor : public DeferredExecutor { public: bool inLoop() const override { // If the current thread is already holding the lock (i.e., it's already in // this function somewhere higher up in the stack) then this check won't // race and we will detect it correctly. If this is not the case, then this // check may race with another thread, but that's nothing to worry about // because in either case the outcome will be negative. 
return currentLoop_ == std::this_thread::get_id(); } void deferToLoop(TTask fn) override { { std::unique_lock lock(mutex_); pendingTasks_.push_back(std::move(fn)); if (currentLoop_ != std::thread::id()) { return; } currentLoop_ = std::this_thread::get_id(); } while (true) { TTask task; { std::unique_lock lock(mutex_); if (pendingTasks_.empty()) { currentLoop_ = std::thread::id(); return; } task = std::move(pendingTasks_.front()); pendingTasks_.pop_front(); } task(); } } private: std::mutex mutex_; std::atomic currentLoop_{std::thread::id()}; std::deque pendingTasks_; }; class EventLoopDeferredExecutor : public virtual DeferredExecutor { public: void deferToLoop(TTask fn) override { { std::unique_lock lock(mutex_); if (likely(isThreadConsumingDeferredFunctions_)) { fns_.push_back(std::move(fn)); wakeupEventLoopToDeferFunction(); return; } } // Must call it without holding the lock, as it could cause a reentrant // call. onDemandLoop_.deferToLoop(std::move(fn)); } inline bool inLoop() const override { { std::unique_lock lock(mutex_); if (likely(isThreadConsumingDeferredFunctions_)) { return std::this_thread::get_id() == thread_.get_id(); } } return onDemandLoop_.inLoop(); } protected: // This is the actual long-running event loop, which is implemented by // subclasses and called inside the thread owned by this parent class. virtual void eventLoop() = 0; // This is called after the event loop terminated, still within the thread // that used to run that event loop. It will be called after this class has // transitioned control to the on-demand deferred executor. It thus allows to // clean up any resources without worrying about new work coming in. virtual void cleanUpLoop() {} // This function is called by the parent class when a function is deferred to // it, and must be implemented by subclasses, which are required to have their // event loop call runDeferredFunctionsFromEventLoop as soon as possible. 
This // function is guaranteed to be called once per function deferral (in case // subclasses want to keep count). virtual void wakeupEventLoopToDeferFunction() = 0; // Called by subclasses to have the parent class start the thread. We cannot // implicitly call this in the parent class's constructor because it could // lead to a race condition between the event loop (run by the thread) and the // subclass's constructor (which is executed after the parent class's one). // Hence this method should be invoked at the end of the subclass constructor. void startThread(std::string threadName) { // FIXME Once we've fixed the viability (by having a factory function return // a nullptr, instead of having a method on the context), remove this, and // instead add a safety check in deferToLoop that ensures that within the // isThreadConsumingDeferredFunctions_ branch the thread is joinable, i.e., // up and still running. { std::unique_lock lock(mutex_); TP_DCHECK(!isThreadConsumingDeferredFunctions_); TP_DCHECK(!thread_.joinable()); TP_DCHECK(fns_.empty()); isThreadConsumingDeferredFunctions_ = true; } thread_ = std::thread( &EventLoopDeferredExecutor::loop, this, std::move(threadName)); } // This is basically the reverse operation of the above, and is needed for the // same (reversed) reason. Note that this only waits for the thread to finish: // the subclass must have its own way of telling its event loop to stop and // return control. void joinThread() { thread_.join(); } // Must be called by the subclass after it was woken up. Even if multiple // functions were deferred, this method only needs to be called once. However, // care must be taken to avoid races between this call and new wakeups. This // method also returns the number of functions it executed, in case the // subclass is keeping count. 
size_t runDeferredFunctionsFromEventLoop() { decltype(fns_) fns; { std::unique_lock lock(mutex_); std::swap(fns, fns_); } for (auto& fn : fns) { fn(); } return fns.size(); } private: void loop(std::string threadName) { setThreadName(std::move(threadName)); eventLoop(); // The loop is winding down and "handing over" control to the on demand // loop. But it can only do so safely once there are no pending deferred // functions, as otherwise those may risk never being executed. while (true) { decltype(fns_) fns; { std::unique_lock lock(mutex_); if (fns_.empty()) { isThreadConsumingDeferredFunctions_ = false; break; } std::swap(fns, fns_); } for (auto& fn : fns) { fn(); } } cleanUpLoop(); } std::thread thread_; // Whether the thread is taking care of running the deferred functions // // This is part of what can only be described as a hack. Sometimes, even when // using the API as intended, objects try to defer tasks to the loop after // that loop has been closed and joined. Since those tasks may be lambdas that // captured shared_ptrs to the objects in their closures, this may lead to a // reference cycle and thus a leak. Our hack is to have this flag to record // when we can no longer defer tasks to the loop and in that case we just run // those tasks inline. In order to keep ensuring the single-threadedness // assumption of our model (which is what we rely on to be safe from race // conditions) we use an on-demand loop. This flag starts as false as in some // cases (like non-viable transports) the thread may never be started and thus // we want the on-demand loop to be engaged from the beginning. bool isThreadConsumingDeferredFunctions_{false}; OnDemandDeferredExecutor onDemandLoop_; // Mutex to guard the deferring and the running of functions. mutable std::mutex mutex_; // List of deferred functions to run when the loop is ready. 
std::vector> fns_; }; } // namespace tensorpipe ================================================ FILE: tensorpipe/common/defs.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include #include #include #include #include #include // Branch hint macros. C++20 will include them as part of language. #define likely(x) __builtin_expect((x) ? 1 : 0, 1) #define unlikely(x) __builtin_expect((x) ? 1 : 0, 0) /// Auxiliar class to build exception, fill up it's what message and throw /// in a single line. Usually uses as r-value so that destructor is called /// at end of line that created it, throwing the desired exception. /// (See TP_THROW). namespace tensorpipe { template class ExceptionThrower final { public: template ExceptionThrower(TArgs&&... nonWhat) { exBuilder_ = [&](const std::string& what) { return TException(std::move(nonWhat)..., what); }; } // Throw exception on destructor, when l-value instance goes of scope // and stream has been written. Use noexcept(false) to inform the compiler // that it's ok to throw in destructor. ~ExceptionThrower() noexcept(false) { throw exBuilder_(oss_.str() + "\""); } std::ostream& getStream() { return oss_; } protected: std::function exBuilder_; std::ostringstream oss_; }; } // namespace tensorpipe // // Macros to throw commonly used exceptions. // #define TP_STRINGIFY(s) #s #define TP_EXPAND_TO_STR(s) TP_STRINGIFY(s) // Strip all leading components up to the *last* occurrence of "tensorpipe/". // This removes all the system-specific prefixes added by the compiler. 
#define TP_TRIM_FILENAME(s) \ [](const char* filename) -> const char* { \ while (true) { \ const char* match = std::strstr(filename + 1, "tensorpipe/"); \ if (match == nullptr) { \ break; \ } \ filename = match; \ } \ return filename; \ }(s) #define TP_LOG_LOC \ TP_TRIM_FILENAME(__FILE__) << ":" << TP_EXPAND_TO_STR(__LINE__) #define TP_LOG_PREFFIX "In " << __func__ << " at " << TP_LOG_LOC #define TP_THROW(ex_type, ...) \ ::tensorpipe::ExceptionThrower(__VA_ARGS__).getStream() \ << TP_LOG_PREFFIX << " \"" #define TP_THROW_EINVAL() TP_THROW(std::invalid_argument) #define TP_THROW_SYSTEM(err) \ TP_THROW(std::system_error, err, std::system_category()) #define TP_THROW_SYSTEM_IF(cond, err) \ if (unlikely(cond)) \ TP_THROW_SYSTEM(err) #define TP_THROW_SYSTEM_CODE(err) TP_THROW(std::system_error, err) #define TP_THROW_SYSTEM_CODE_IF(cond, err) \ if (unlikely(cond)) \ TP_THROW_SYSTEM_CODE(err) << TP_STRINGIFY(cond) #define TP_THROW_ASSERT() TP_THROW(std::runtime_error) #define TP_THROW_ASSERT_IF(cond) \ if (unlikely(cond)) \ TP_THROW_ASSERT() << TP_STRINGIFY(cond) // Conditional throwing exception #define TP_THROW_IF_NULLPTR(ptr) \ if (unlikely(ptr == nullptr)) \ TP_THROW_EINVAL() << TP_STRINGIFY(ptr) << " has nullptr value" // Safe-cast to std::error_code namespace tensorpipe { inline std::error_code toErrorCode(ssize_t e) { if (unlikely(e <= 0)) { TP_THROW_EINVAL() << "Error not a positive number. " << "Is this value really an error?"; } else if (unlikely(e > std::numeric_limits::max())) { TP_THROW_EINVAL() << "Error out of range. Is this really an error?"; } return {static_cast(e), std::system_category()}; } } // namespace tensorpipe // // Simple logging to stderr. This macros can be replaced if a more // sophisticated logging is used in the future. // Currently, tensorpipe is meant be used as shared library and to use // exceptions for error handling, so the need for logging in // the library is reduced. 
namespace tensorpipe {

// RAII helper that builds up one log line in an in-memory buffer and emits it
// to stderr on destruction, so that a whole statement's worth of << operators
// produces a single write (see the comment in the destructor).
class LogEntry final {
 public:
  // `type` is the glog-style severity letter (V/I/W/E) that prefixes the line.
  explicit LogEntry(char type) {
    oss_ << type;
    // In C++17 use std::timespec.
    struct timeval tv;
    // In C++17 use std::timespec_get.
    gettimeofday(&tv, nullptr);
    struct std::tm tm;
    // Need to use localtime_r as std::localtime may not be thread-safe.
    localtime_r(&tv.tv_sec, &tm);
    oss_ << std::setfill('0') << std::setw(2) << 1 + tm.tm_mon << std::setw(2)
         << tm.tm_mday << ' ' << std::setw(2) << tm.tm_hour << ':'
         << std::setw(2) << tm.tm_min << ':' << std::setw(2) << tm.tm_sec
         << '.' << std::setw(6) << tv.tv_usec;
    // The glog format uses the thread ID but it's painful to get (there is a
    // gettid syscall, but it's not exposed in glibc) so we use the PID instead.
    oss_ << ' ' << std::setfill(' ') << std::setw(5) << getpid();
  }

  ~LogEntry() noexcept {
    // Multiple threads or processes writing to the same log (e.g., stderr)
    // might lead to interleaved text and thus garbled output. It seems that a
    // single write syscall is "rather" atomic so instead of issuing a separate
    // write for the trailing newline we append it to the message and write
    // them together.
    oss_ << std::endl;
    std::cerr << oss_.str();
  }

  // Stream to which the user's message is appended.
  std::ostream& getStream() {
    return oss_;
  }

 protected:
  std::ostringstream oss_;
};

} // namespace tensorpipe

#define TP_LOG_DEBUG() \
  ::tensorpipe::LogEntry('V').getStream() << ' ' << TP_LOG_LOC << "] "
#define TP_LOG_INFO() \
  ::tensorpipe::LogEntry('I').getStream() << ' ' << TP_LOG_LOC << "] "
#define TP_LOG_WARNING() \
  ::tensorpipe::LogEntry('W').getStream() << ' ' << TP_LOG_LOC << "] "
#define TP_LOG_ERROR() \
  ::tensorpipe::LogEntry('E').getStream() << ' ' << TP_LOG_LOC << "] "

#define TP_LOG_DEBUG_IF(cond) \
  if (unlikely(cond))         \
  TP_LOG_DEBUG()
#define TP_LOG_INFO_IF(cond) \
  if (unlikely(cond))        \
  TP_LOG_INFO()
#define TP_LOG_WARNING_IF(cond) \
  if (unlikely(cond))           \
  TP_LOG_WARNING()
#define TP_LOG_ERROR_IF(cond) \
  if (unlikely(cond))         \
  TP_LOG_ERROR()

// Renders an operand as `name(value)`, e.g. `x(42)`.
#define __TP_EXPAND_OPD(opd) TP_STRINGIFY(opd) << "(" << (opd) << ")"

//
// Debug checks.
// Note that non-debug checks are not provided because developers
// must handle all errors explicitly.
//

#define __TP_DCHECK(a) \
  if (unlikely(!((a)))) \
  TP_THROW_ASSERT() << "Expected true for " << __TP_EXPAND_OPD(a)

#define __TP_DCHECK_CMP(a, b, op)                        \
  if (unlikely(!((a)op(b))))                             \
  TP_THROW_ASSERT() << "Expected " << __TP_EXPAND_OPD(a) \
                    << " " TP_STRINGIFY(op) << " " << __TP_EXPAND_OPD(b)

// Expand macro only in debug mode.
#ifdef NDEBUG

#define _TP_DLOG() \
  while (false)    \
  TP_LOG_DEBUG()

#define _TP_DCHECK(a) \
  while (false)       \
  __TP_DCHECK(a)

#define _TP_DCHECK_CMP(a, b, op) \
  while (false)                  \
  __TP_DCHECK_CMP(a, b, op)

#else

#define _TP_DLOG() TP_LOG_DEBUG()

#define _TP_DCHECK(a) __TP_DCHECK(a)

#define _TP_DCHECK_CMP(a, b, op) __TP_DCHECK_CMP(a, b, op)

#endif

// Public API for debug logging.
#define TP_DLOG() _TP_DLOG()

// Public API for debug checks.
#define TP_DCHECK(a) _TP_DCHECK(a)
#define TP_DCHECK_EQ(a, b) _TP_DCHECK_CMP(a, b, ==)
#define TP_DCHECK_NE(a, b) _TP_DCHECK_CMP(a, b, !=)
#define TP_DCHECK_LT(a, b) _TP_DCHECK_CMP(a, b, <)
#define TP_DCHECK_LE(a, b) _TP_DCHECK_CMP(a, b, <=)
#define TP_DCHECK_GT(a, b) _TP_DCHECK_CMP(a, b, >)
#define TP_DCHECK_GE(a, b) _TP_DCHECK_CMP(a, b, >=)

//
// Verbose logging.
// Some logging is helpful to diagnose tricky production issues but is too
// verbose to keep on all the time. It also should not be controlled by the
// debug flags, as we want to allow it to be enabled in production builds.
//
// The level of each TP_VLOG call should reflect where the object issuing it is
// located in the stack, and whether it's a call that involves handling
// requests from objects higher up, or issuing requests to objects lower down.
// This brings us to the following classification:
// - level 1 is for requests that core classes receive from the user
// - level 2 is for generic core classes stuff
// - level 3 is for requests that core classes issue to channels/transports
// - level 4 is for requests that channels receive from core classes
// - level 5 is for generic channels stuff
// - level 6 is for requests that channels issue to transports
// - level 7 is for requests that transports receive from core classes/channels
// - level 8 is for generic transports stuff
// - level 9 is for how transports deal with system resources

namespace tensorpipe {

// Parses the TP_VERBOSE_LOGGING environment variable (0 when unset).
inline unsigned long getVerbosityLevelInternal() {
  char* levelStr = std::getenv("TP_VERBOSE_LOGGING");
  if (levelStr == nullptr) {
    return 0;
  }
  return std::strtoul(levelStr, /*str_end=*/nullptr, /*base=*/10);
}

// The environment variable is read once and cached for the process lifetime.
inline unsigned long getVerbosityLevel() {
  static unsigned long level = getVerbosityLevelInternal();
  return level;
}

} // namespace tensorpipe

#define TP_VLOG(level) TP_LOG_DEBUG_IF(level <= getVerbosityLevel())

//
// Argument checks
//

#define TP_ARG_CHECK(a) \
  if (unlikely(!((a)))) \
  TP_THROW_EINVAL() << "Expected argument to be true: " << __TP_EXPAND_OPD(a)

// FIX: this used to stringify the literal token `_op_` instead of the macro
// parameter `op`, so error messages printed "_op_" rather than the actual
// comparison operator (cf. the correct __TP_DCHECK_CMP above).
#define _TP_ARG_CMP(a, b, op)                             \
  if (unlikely(!((a)op(b))))                              \
  TP_THROW_EINVAL() << "Expected argument " << __TP_EXPAND_OPD(a) \
                    << " " TP_STRINGIFY(op) << " " << __TP_EXPAND_OPD(b)

#define TP_ARG_CHECK_EQ(a, b) _TP_ARG_CMP(a, b, ==)
#define TP_ARG_CHECK_NE(a, b) _TP_ARG_CMP(a, b, !=)
#define TP_ARG_CHECK_LT(a, b) _TP_ARG_CMP(a, b, <)
#define TP_ARG_CHECK_LE(a, b) _TP_ARG_CMP(a, b, <=)
#define TP_ARG_CHECK_GT(a, b) _TP_ARG_CMP(a, b, >)
#define TP_ARG_CHECK_GE(a, b) _TP_ARG_CMP(a, b, >=)

// Define DEXCEPT macro that is noexcept only in debug mode.
#ifdef NDEBUG
#define DEXCEPT noexcept(true)
#else
#define DEXCEPT noexcept(false)
#endif

#define TP_LOG_EXCEPTION(e)                          \
  TP_LOG_ERROR() << "Exception in " << __FUNCTION__ \
                 << " . Message: " << e.what()

// ============================================================
// FILE: tensorpipe/common/device.h
// ============================================================

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

// NOTE(review): header names were stripped by extraction; restored to the set
// this file actually needs.
#include <cstddef>
#include <functional>
#include <sstream>
#include <string>
#include <utility>

namespace tensorpipe {

const std::string kCpuDeviceType{"cpu"};
const std::string kCudaDeviceType{"cuda"};

// Identifies a device as a (type, index) pair, e.g. ("cuda", 0).
struct Device {
  std::string type;
  int index;

  // This pointless constructor is needed to work around a bug in GCC 5.5 (and
  // possibly other versions). It appears to be needed in the nop types that
  // are used inside nop::Optional.
  Device() {}

  Device(std::string type, int index) : type(std::move(type)), index(index) {}

  // Renders the device as "<type>:<index>", e.g. "cuda:0".
  std::string toString() const {
    std::stringstream ss;
    ss << type << ":" << index;
    return ss.str();
  }

  bool operator==(const Device& other) const {
    return type == other.type && index == other.index;
  }
};

} // namespace tensorpipe

namespace std {

template <>
struct hash<::tensorpipe::Device> {
  size_t operator()(const ::tensorpipe::Device& device) const noexcept {
    // Delegate to the string hash of the canonical "<type>:<index>" form.
    return std::hash<std::string>{}(device.toString());
  }
};

template <>
struct hash<std::pair<::tensorpipe::Device, ::tensorpipe::Device>> {
  size_t operator()(const std::pair<::tensorpipe::Device, ::tensorpipe::Device>&
                        p) const noexcept {
    size_t h1 = std::hash<::tensorpipe::Device>{}(p.first);
    size_t h2 = std::hash<::tensorpipe::Device>{}(p.second);
    // Shifting one hash to avoid collisions between (a, b) and (b, a).
    return h1 ^ (h2 << 1);
  }
};

} // namespace std

// ============================================================
// FILE: tensorpipe/common/dl.h
// ============================================================

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */
/* End of the dl.h file header (BSD-style license, continued above). */

#pragma once

// NOTE(review): the header names of the following #include directives were
// lost when this extract was generated; the directives are kept verbatim.
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

namespace tensorpipe {

// Error subtype carrying the message returned by dlerror(3).
class DlError final : public BaseError {
 public:
  explicit DlError(char* error) : error_(error) {}

  std::string what() const override {
    return error_;
  }

 private:
  std::string error_;
};

// RAII wrapper around a dlopen(3) handle. A default-constructed instance
// holds no handle; an owned handle is released with dlclose(3) when the last
// copy goes away (ownership is shared through the internal smart pointer).
class DynamicLibraryHandle {
 public:
  DynamicLibraryHandle() = default;

  // Opens the given shared object with dlopen(3). On failure the returned
  // handle is empty and the error wraps the dlerror(3) message.
  static std::tuple create(
      const char* filename,
      int flags) {
    void* ptr = ::dlopen(filename, flags);
    if (ptr == nullptr) {
      return std::make_tuple(
          TP_CREATE_ERROR(DlError, ::dlerror()), DynamicLibraryHandle());
    }
    return std::make_tuple(Error::kSuccess, DynamicLibraryHandle(ptr));
  }

  // Whether this object currently wraps an open library handle.
  bool hasValue() const {
    return ptr_ != nullptr;
  }

  // Looks up a symbol in the library with dlsym(3).
  std::tuple loadSymbol(const char* name) {
    // Since dlsym doesn't return a specific value to signal errors (because
    // NULL is a valid return value), we need to detect errors by calling
    // dlerror and checking whether it returns a string or not (i.e., NULL).
    // But in order to do so, we must first reset the error, in case one was
    // already recorded.
    ::dlerror();
    void* ptr = ::dlsym(ptr_.get(), name);
    char* err = ::dlerror();
    if (err != nullptr) {
      return std::make_tuple(TP_CREATE_ERROR(DlError, err), nullptr);
    }
    return std::make_tuple(Error::kSuccess, ptr);
  }

  // Returns the canonicalized filesystem path of the loaded library, obtained
  // through dlinfo(3)'s RTLD_DI_LINKMAP request plus realpath(3).
  std::tuple getFilename() {
    struct link_map* linkMap;
    int rv = ::dlinfo(ptr_.get(), RTLD_DI_LINKMAP, &linkMap);
    if (rv < 0) {
      return std::make_tuple(
          TP_CREATE_ERROR(DlError, ::dlerror()), std::string());
    }
    std::array path;
    char* resolvedPath = ::realpath(linkMap->l_name, path.data());
    if (resolvedPath == nullptr) {
      return std::make_tuple(
          TP_CREATE_ERROR(SystemError, "realpath", errno), std::string());
    }
    TP_DCHECK(resolvedPath == path.data());
    return std::make_tuple(Error::kSuccess, std::string(path.data()));
  }

 private:
  // Deleter for the smart pointer: closes the library, asserting on failure.
  struct Deleter {
    void operator()(void* ptr) {
      int res = ::dlclose(ptr);
      TP_THROW_ASSERT_IF(res != 0) << "dlclose() failed: " << ::dlerror();
    }
  };

  DynamicLibraryHandle(void* ptr) : ptr_(ptr, Deleter{}) {}

  std::unique_ptr ptr_;
};

} // namespace tensorpipe

// ============================================================
// FILE: tensorpipe/common/epoll_loop.cc
// ============================================================

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

// NOTE(review): #include header names stripped by extraction.
#include
#include
#include

namespace tensorpipe {

// Creates the epoll fd and the wakeup eventfd, registers the eventfd with
// epoll under the reserved record 0, and starts the dedicated epoll thread.
EpollLoop::EpollLoop(DeferredExecutor& deferredExecutor)
    : deferredExecutor_(deferredExecutor) {
  {
    auto rv = ::epoll_create(1);
    TP_THROW_SYSTEM_IF(rv == -1, errno);
    epollFd_ = Fd(rv);
  }
  {
    auto rv = ::eventfd(0, EFD_NONBLOCK);
    TP_THROW_SYSTEM_IF(rv == -1, errno);
    eventFd_ = Fd(rv);
  }

  // Register the eventfd with epoll.
  {
    struct epoll_event ev;
    ev.events = EPOLLIN;
    ev.data.u64 = 0;
    auto rv = ::epoll_ctl(epollFd_.fd(), EPOLL_CTL_ADD, eventFd_.fd(), &ev);
    TP_THROW_SYSTEM_IF(rv == -1, errno);
  }

  // Start epoll(2) thread.
  thread_ = std::thread(&EpollLoop::loop, this);
}

// Idempotent: only the first call flips closed_ and wakes the loop.
void EpollLoop::close() {
  if (!closed_.exchange(true)) {
    wakeup();
  }
}

// Idempotent: closes the loop and joins the epoll thread exactly once.
void EpollLoop::join() {
  close();

  if (!joined_.exchange(true)) {
    thread_.join();
  }
}

EpollLoop::~EpollLoop() {
  join();

  // Unregister the eventfd with epoll.
  {
    auto rv = ::epoll_ctl(epollFd_.fd(), EPOLL_CTL_DEL, eventFd_.fd(), nullptr);
    TP_THROW_SYSTEM_IF(rv == -1, errno);
  }
}

void EpollLoop::registerDescriptor(
    int fd,
    int events,
    std::shared_ptr h) {
  TP_DCHECK(deferredExecutor_.inLoop());
  std::lock_guard lock(handlersMutex_);

  // Each (re-)registration gets a fresh record so that stale events reported
  // for an earlier registration of the same fd can be recognized and dropped;
  // see the long comment in epoll_loop.h.
  uint64_t record = nextRecord_++;

  struct epoll_event ev;
  ev.events = events;
  ev.data.u64 = record;

  auto fdIter = fdToRecord_.find(fd);
  if (fdIter == fdToRecord_.end()) {
    // First registration of this fd.
    fdToRecord_.emplace(fd, record);
    recordToHandler_.emplace(record, h);
    auto rv = ::epoll_ctl(epollFd_.fd(), EPOLL_CTL_ADD, fd, &ev);
    TP_THROW_SYSTEM_IF(rv == -1, errno);
  } else {
    // Re-registration: swap in the new record and handler, modify in place.
    uint64_t oldRecord = fdIter->second;
    fdIter->second = record;
    recordToHandler_.erase(oldRecord);
    recordToHandler_.emplace(record, h);
    auto rv = ::epoll_ctl(epollFd_.fd(), EPOLL_CTL_MOD, fd, &ev);
    TP_THROW_SYSTEM_IF(rv == -1, errno);
  }
}

void EpollLoop::unregisterDescriptor(int fd) {
  TP_DCHECK(deferredExecutor_.inLoop());
  std::lock_guard lock(handlersMutex_);

  auto fdIter = fdToRecord_.find(fd);
  TP_DCHECK(fdIter != fdToRecord_.end());
  uint64_t oldRecord = fdIter->second;
  fdToRecord_.erase(fdIter);
  recordToHandler_.erase(oldRecord);
  auto rv = ::epoll_ctl(epollFd_.fd(), EPOLL_CTL_DEL, fd, nullptr);
  TP_THROW_SYSTEM_IF(rv == -1, errno);

  // Maybe we're done and the event loop is waiting for the last handlers to
  // be unregistered before terminating, so just in case we wake it up.
  if (fdToRecord_.empty()) {
    wakeup();
  }
}

void EpollLoop::wakeup() {
  // Perform a write to eventfd to wake up epoll_wait(2).
  eventFd_.writeOrThrow(1);
}

bool EpollLoop::hasRegisteredHandlers() {
  std::lock_guard lock(handlersMutex_);
  // The two maps are kept in lockstep; their sizes must always agree.
  TP_DCHECK_EQ(fdToRecord_.size(), recordToHandler_.size());
  return !fdToRecord_.empty();
}

// Body of the dedicated epoll thread.
void EpollLoop::loop() {
  setThreadName("TP_IBV_loop");

  // Stop when another thread has asked the loop the close and when all
  // handlers have been unregistered except for the wakeup eventfd one.
  while (!closed_ || hasRegisteredHandlers()) {
    // Use fixed epoll_event capacity for every call.
    std::vector epollEvents(kCapacity);

    // Block waiting for something to happen...
    auto nfds =
        ::epoll_wait(epollFd_.fd(), epollEvents.data(), epollEvents.size(), -1);
    if (nfds == -1) {
      if (errno == EINTR) {
        continue;
      }
      TP_THROW_SYSTEM(errno);
    }

    // Always immediately read from the eventfd so that it is no longer
    // readable on the next call to epoll_wait(2). As it's opened in
    // non-blocking mode, reading from it if its value is zero just return
    // EAGAIN. Reset it before invoking any of the callbacks, so that if they
    // perform a wakeup they will wake up the next iteration of epoll_wait(2).
    {
      uint64_t val;
      auto rv = eventFd_.read(reinterpret_cast(&val), sizeof(val));
      TP_DCHECK(
          (rv == -1 && errno == EAGAIN) || (rv == sizeof(val) && val > 0));
    }

    // Resize based on actual number of events.
    epollEvents.resize(nfds);

    // Defer handling to reactor and wait for it to process these events.
    deferredExecutor_.runInLoop(
        [this, epollEvents{std::move(epollEvents)}]() mutable {
          handleEpollEventsFromLoop(std::move(epollEvents));
        });
  }
}

// Runs on the reactor thread; dispatches each event to its handler, skipping
// events whose record has become stale in the meantime.
void EpollLoop::handleEpollEventsFromLoop(
    std::vector epollEvents) {
  TP_DCHECK(deferredExecutor_.inLoop());

  // Process events returned by epoll_wait(2).
  for (const auto& event : epollEvents) {
    const uint64_t record = event.data.u64;
    // Make a copy so that if the handler unregisters itself as it runs it
    // will still be kept alive by our copy of the shared_ptr.
    std::shared_ptr handler;
    {
      std::unique_lock handlersLock(handlersMutex_);
      const auto recordIter = recordToHandler_.find(record);
      if (recordIter == recordToHandler_.end()) {
        // Stale record: the fd was unregistered or re-registered after
        // epoll_wait returned; drop the event.
        continue;
      }
      handler = recordIter->second;
    }
    handler->handleEventsFromLoop(event.events);
  }
}

// Renders an epoll event mask as a human-readable string, e.g. "IN | OUT";
// unrecognized leftover bits are appended as a decimal number.
std::string EpollLoop::formatEpollEvents(uint32_t events) {
  std::string res;
  if (events & EPOLLIN) {
    res = res.empty() ? "IN" : res + " | IN";
    events &= ~EPOLLIN;
  }
  if (events & EPOLLOUT) {
    res = res.empty() ? "OUT" : res + " | OUT";
    events &= ~EPOLLOUT;
  }
  if (events & EPOLLERR) {
    res = res.empty() ? "ERR" : res + " | ERR";
    events &= ~EPOLLERR;
  }
  if (events & EPOLLHUP) {
    res = res.empty() ? "HUP" : res + " | HUP";
    events &= ~EPOLLHUP;
  }
  if (events > 0) {
    std::string eventsStr = std::to_string(events);
    res = res.empty() ? eventsStr : res + " | " + eventsStr;
  }
  return res;
}

} // namespace tensorpipe

// ============================================================
// FILE: tensorpipe/common/epoll_loop.h
// ============================================================

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

// NOTE(review): #include header names stripped by extraction.
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

namespace tensorpipe {

// An epoll(7)-based event loop: a dedicated thread blocks in epoll_wait(2)
// and defers the actual event handling to the given DeferredExecutor.
class EpollLoop final {
 public:
  // Abstract base class called by the epoll(2) event loop.
  //
  // Dispatch to multiple types is needed because we must deal with a
  // few listening sockets and an eventfd(2) per connection.
  //
  class EventHandler {
   public:
    virtual ~EventHandler() = default;

    virtual void handleEventsFromLoop(int events) = 0;
  };

  explicit EpollLoop(DeferredExecutor& deferredExecutor);

  // Register file descriptor with event loop.
  //
  // Trigger the handler if any of the epoll events in the `events`
  // mask occurs. If an event is triggered, the loop first acquires a
  // copy of the shared_ptr to the handler before calling into its
  // handler function. This ensures that the handler is alive for the
  // duration of this function.
  //
  void registerDescriptor(int fd, int events, std::shared_ptr h);

  // Unregister file descriptor from event loop.
  //
  // This resets the shared_ptr to the event handler that was registered
  // in `registerDescriptor`. Upon returning, the handler can no
  // longer be called, even if there were pending events for the file
  // descriptor. Only if the loop had acquired a shared_ptr to the
  // handler prior to this function being called, can the handler
  // function still be called.
  //
  void unregisterDescriptor(int fd);

  void close();

  // Tell loop to terminate when no more handlers remain.
  void join();

  ~EpollLoop();

  static std::string formatEpollEvents(uint32_t events);

 private:
  // Number of epoll_event slots passed to each epoll_wait(2) call.
  static constexpr auto kCapacity = 64;

  // The reactor is used to process events for this loop.
  DeferredExecutor& deferredExecutor_;

  // Wake up the event loop.
  void wakeup();

  // Main loop function.
  void loop();

  // Check whether some handlers are currently registered.
  bool hasRegisteredHandlers();

  Fd epollFd_;
  Fd eventFd_;
  std::atomic closed_{false};
  std::atomic joined_{false};
  std::thread thread_;

  // Interaction with epoll(7).
  //
  // A dedicated thread runs epoll_wait(2) in a loop and, every time it
  // returns, it defers a function to the reactor which is responsible for
  // processing the epoll events and executing the handlers, and then notify
  // the epoll thread that it is done, for it to start another iteration. This
  // back-and-forth between these threads is done to ensure that all epoll
  // handlers are run from the reactor thread, just like everything else.
  // Doing so makes it easier to reason about how certain events are
  // sequenced. For example, if another processes first makes a write to a
  // connection and then closes the accompanying Unix domain socket, we know
  // for a fact that the reactor will first react to the write, and then react
  // to the epoll event caused by closing the socket. If we didn't force
  // serialization onto the reactor, we would not have this guarantee.
  //
  // It's safe to call epoll_ctl from one thread while another thread is
  // blocked on an epoll_wait call. This means that the kernel internally
  // serializes the operations on a single epoll fd. However, we have no way
  // to control whether a modification of the set of file descriptors
  // monitored by epoll occurred just before or just after the return from the
  // epoll_wait. This means that when we start processing the result of
  // epoll_wait we can't know what set of file descriptors it operated on.
  // This becomes a problem if, for example, in between the moment epoll_wait
  // returns and the moment we process the results a file descriptor is
  // unregistered and closed and another one with the same value is opened and
  // registered: we'd end up calling the handler of the new fd for the events
  // of the old one (which probably include errors).
  //
  // However, epoll offers a way to address this: epoll_wait returns, for each
  // event, the piece of extra data that was provided by the *last* call on
  // epoll_ctl for that fd. This allows us to detect whether epoll_wait had
  // taken into account an update to the set of fds or not. We do so by giving
  // each update a unique identifier, called "record". Each update to a fd
  // will associate a new record to it. The handlers are associated to records
  // (and not to fds), and for each fd we know which handler is the one
  // currently installed. This way when processing an event we can detect
  // whether the record for that event is still valid or whether it is stale,
  // in which case we disregard the event, and wait for it to fire again at
  // the next epoll iteration, with the up-to-date handler.
  std::unordered_map fdToRecord_;
  std::unordered_map> recordToHandler_;
  uint64_t nextRecord_{1}; // Reserve record 0 for the eventfd
  std::mutex handlersMutex_;

  // Deferred to the reactor to handle the events received by epoll_wait(2).
  void handleEpollEventsFromLoop(std::vector epollEvents);
};

} // namespace tensorpipe

// ============================================================
// FILE: tensorpipe/common/error.cc
// ============================================================

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

// NOTE(review): #include header names stripped by extraction.
#include
#include
#include
#include

namespace tensorpipe {

const Error Error::kSuccess = Error();

// Appends the origin (file:line) to the wrapped error's message.
std::string Error::what() const {
  TP_DCHECK(error_);
  std::ostringstream ss;
  ss << error_->what() << " (this error originated at " << file_ << ":"
     << line_ << ")";
  return ss.str();
}

// Formats as "<syscall>: <strerror message>".
std::string SystemError::what() const {
  std::ostringstream ss;
  ss << syscall_ << ": " << strerror(error_);
  return ss.str();
}

int SystemError::errorCode() const {
  return error_;
}

std::string ShortReadError::what() const {
  std::ostringstream ss;
  ss << "short read: got " << actual_ << " bytes while expecting to read "
     << expected_ << " bytes";
  return ss.str();
}

std::string ShortWriteError::what() const {
  std::ostringstream ss;
  ss << "short write: wrote " << actual_ << " bytes while expecting to write "
     << expected_ << " bytes";
  return ss.str();
}

std::string EOFError::what() const {
  return "eof";
}

} // namespace tensorpipe

// ============================================================
// FILE: tensorpipe/common/error.h
// ============================================================

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */
Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include namespace tensorpipe { // Base class for actual errors. class BaseError { public: virtual ~BaseError() = default; // Returns an explanatory string. // Like `std::exception` but returns a `std::string`. virtual std::string what() const = 0; }; // Wrapper class for errors. // // Background: we wish to not use exceptions yet need an error // representation that can propagate across function and thread // boundaries. This representation must be copyable (so we can store // and return it at a later point in time) and retain downstream type // information. This implies a heap allocation because it's the // easiest way to deal with variable size objects (barring a union of // all downstream error classes and a lot of custom code). Instead of // passing a shared_ptr around directly, we use this wrapper class to // keep implementation details hidden from calling code. // class Error final { public: // Constant instance that indicates success. static const Error kSuccess; // Default constructor for error that is not an error. Error() {} Error(std::shared_ptr error, std::string file, int line) : error_(std::move(error)), file_(std::move(file)), line_(line) {} ~Error() = default; // Converting to boolean means checking if there is an error. This // means we don't need to use an `std::optional` and allows for a // snippet like the following: // // if (error) { // // Deal with it. // } // operator bool() const { return static_cast(error_); } template std::shared_ptr castToType() const { return std::dynamic_pointer_cast(error_); } template bool isOfType() const { return castToType() != nullptr; } // Like `std::exception` but returns a `std::string`. 
std::string what() const; private: std::shared_ptr error_; std::string file_; int line_; }; class SystemError final : public BaseError { public: explicit SystemError(const char* syscall, int error) : syscall_(syscall), error_(error) {} std::string what() const override; int errorCode() const; private: const char* syscall_; const int error_; }; class ShortReadError final : public BaseError { public: ShortReadError(ssize_t expected, ssize_t actual) : expected_(expected), actual_(actual) {} std::string what() const override; private: const ssize_t expected_; const ssize_t actual_; }; class ShortWriteError final : public BaseError { public: ShortWriteError(ssize_t expected, ssize_t actual) : expected_(expected), actual_(actual) {} std::string what() const override; private: const ssize_t expected_; const ssize_t actual_; }; class EOFError final : public BaseError { public: EOFError() {} std::string what() const override; }; } // namespace tensorpipe ================================================ FILE: tensorpipe/common/error_macros.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #define TP_CREATE_ERROR(typ, ...) \ (Error( \ std::make_shared(__VA_ARGS__), \ TP_TRIM_FILENAME(__FILE__), \ __LINE__)) ================================================ FILE: tensorpipe/common/fd.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #include #include namespace tensorpipe { ssize_t Fd::read(void* buf, size_t count) { ssize_t rv = -1; for (;;) { rv = ::read(fd_, buf, count); if (rv == -1 && errno == EINTR) { continue; } break; } return rv; } // Proxy to write(2) with EINTR retry. ssize_t Fd::write(const void* buf, size_t count) { ssize_t rv = -1; for (;;) { rv = ::write(fd_, buf, count); if (rv == -1 && errno == EINTR) { continue; } break; } return rv; } // Call read and throw if it doesn't complete. Error Fd::readFull(void* buf, size_t count) { auto rv = read(buf, count); if (rv == -1) { return TP_CREATE_ERROR(SystemError, "read", errno); } if (rv != count) { return TP_CREATE_ERROR(ShortReadError, count, rv); } return Error::kSuccess; } // Call write and throw if it doesn't complete. Error Fd::writeFull(const void* buf, size_t count) { auto rv = write(buf, count); if (rv == -1) { return TP_CREATE_ERROR(SystemError, "write", errno); } if (rv != count) { return TP_CREATE_ERROR(ShortWriteError, count, rv); } return Error::kSuccess; } } // namespace tensorpipe ================================================ FILE: tensorpipe/common/fd.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include namespace tensorpipe { class Fd { public: Fd() = default; explicit Fd(int fd) : fd_(fd) {} virtual ~Fd() { reset(); } // Disable copy constructor. Fd(const Fd&) = delete; // Disable copy assignment. Fd& operator=(const Fd&) = delete; // Custom move constructor. Fd(Fd&& other) noexcept { std::swap(fd_, other.fd_); } // Custom move assignment. Fd& operator=(Fd&& other) noexcept { std::swap(fd_, other.fd_); return *this; } // Return underlying file descriptor. 
int fd() const { return fd_; } bool hasValue() const { return fd_ >= 0; } void reset() { if (hasValue()) { ::close(fd_); fd_ = -1; } } // Proxy to read(2) with EINTR retry. ssize_t read(void* buf, size_t count); // Proxy to write(2) with EINTR retry. ssize_t write(const void* buf, size_t count); // Call read and return error if it doesn't exactly read `count` bytes. Error readFull(void* buf, size_t count); // Call write and return error if it doesn't exactly write `count` bytes. Error writeFull(const void* buf, size_t count); // Call `readFull` with trivially copyable type. Throws on errors. template T readOrThrow() { T tmp; static_assert(std::is_trivially_copyable::value, "!"); auto err = readFull(&tmp, sizeof(T)); if (err) { throw std::runtime_error(err.what()); } return tmp; } // Call `writeFull` with trivially copyable type. Throws on errors. template void writeOrThrow(const T& t) { static_assert(std::is_trivially_copyable::value, "!"); auto err = writeFull(&t, sizeof(T)); if (err) { throw std::runtime_error(err.what()); } } // Call `readFull` with trivially copyable type. template Error read(T* t) { static_assert(std::is_trivially_copyable::value, "!"); return readFull(t, sizeof(T)); } // Call `writeFull` with trivially copyable type. template Error write(const T& t) { static_assert(std::is_trivially_copyable::value, "!"); return writeFull(&t, sizeof(T)); } protected: int fd_{-1}; }; } // namespace tensorpipe ================================================ FILE: tensorpipe/common/ibv.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
/* End of the ibv.cc file header (BSD-style license, continued above). */

// NOTE(review): #include header names stripped by extraction.
#include
#include
#include

namespace tensorpipe {

// Maps an ibverbs work-completion opcode to its name for logging; unknown
// values are rendered as "UNKNOWN (<number>)".
std::string ibvWorkCompletionOpcodeToStr(IbvLib::wc_opcode opcode) {
  switch (opcode) {
    case IbvLib::WC_SEND:
      return "SEND";
    case IbvLib::WC_RDMA_WRITE:
      return "RDMA_WRITE";
    case IbvLib::WC_RDMA_READ:
      return "RDMA_READ";
    case IbvLib::WC_COMP_SWAP:
      return "COMP_SWAP";
    case IbvLib::WC_FETCH_ADD:
      return "FETCH_ADD";
    case IbvLib::WC_BIND_MW:
      return "BIND_MW";
    case IbvLib::WC_RECV:
      return "RECV";
    case IbvLib::WC_RECV_RDMA_WITH_IMM:
      return "RECV_RDMA_WITH_IMM";
    default:
      return "UNKNOWN (" + std::to_string(opcode) + ")";
  }
}

// Builds an IbvAddress for the given device context/port/GID index by
// querying the port attributes (LID, MTU, max message size) and the GID.
struct IbvAddress makeIbvAddress(
    const IbvLib& ibvLib,
    const IbvContext& context,
    uint8_t portNum,
    uint8_t globalIdentifierIndex) {
  struct IbvAddress addr;
  std::memset(&addr, 0, sizeof(addr));

  addr.portNum = portNum;
  addr.globalIdentifierIndex = globalIdentifierIndex;

  IbvLib::port_attr portAttr;
  std::memset(&portAttr, 0, sizeof(portAttr));
  TP_CHECK_IBV_INT(ibvLib.query_port(context.get(), portNum, &portAttr));
  addr.localIdentifier = portAttr.lid;
  addr.maximumTransmissionUnit = portAttr.active_mtu;
  addr.maximumMessageSize = portAttr.max_msg_sz;

  TP_CHECK_IBV_INT(ibvLib.query_gid(
      context.get(), portNum, globalIdentifierIndex, &addr.globalIdentifier));

  return addr;
}

// Gathers the local address/queue-pair information that must be exchanged
// with the remote side to connect the two queue pairs.
struct IbvSetupInformation makeIbvSetupInformation(
    const IbvAddress& addr,
    const IbvQueuePair& qp) {
  struct IbvSetupInformation info;
  std::memset(&info, 0, sizeof(info));

  info.localIdentifier = addr.localIdentifier;
  info.globalIdentifier = addr.globalIdentifier;
  info.queuePairNumber = qp->qp_num;
  info.maximumTransmissionUnit = addr.maximumTransmissionUnit;
  info.maximumMessageSize = addr.maximumMessageSize;

  return info;
}

// Moves the queue pair from RESET to the INIT state.
void transitionIbvQueuePairToInit(
    const IbvLib& ibvLib,
    IbvQueuePair& qp,
    const IbvAddress& selfAddr) {
  IbvLib::qp_attr attr;
  std::memset(&attr, 0, sizeof(attr));
  int attrMask = 0;

  attrMask |= IbvLib::QP_STATE;
  attr.qp_state = IbvLib::QPS_INIT;

  // Hardcode the use of the first entry of the partition key table, as it
  // will always be valid.
  // FIXME: Make this configurable similarly to the port number.
  attrMask |= IbvLib::QP_PKEY_INDEX;
  attr.pkey_index = 0;

  attrMask |= IbvLib::QP_PORT;
  attr.port_num = selfAddr.portNum;

  attrMask |= IbvLib::QP_ACCESS_FLAGS;
  attr.qp_access_flags =
      IbvLib::ACCESS_LOCAL_WRITE | IbvLib::ACCESS_REMOTE_WRITE;

  TP_CHECK_IBV_INT(ibvLib.modify_qp(qp.get(), &attr, attrMask));
}

// Moves the queue pair from INIT to the ready-to-receive (RTR) state, wiring
// in the remote side's address and queue pair number.
void transitionIbvQueuePairToReadyToReceive(
    const IbvLib& ibvLib,
    IbvQueuePair& qp,
    const IbvAddress& selfAddr,
    const IbvSetupInformation& destinationInfo) {
  IbvLib::qp_attr attr;
  std::memset(&attr, 0, sizeof(attr));
  int attrMask = 0;

  attrMask |= IbvLib::QP_STATE;
  attr.qp_state = IbvLib::QPS_RTR;

  // Global routing is only set up as far as needed to support RoCE.
  // A zero local identifier indicates RoCE, where routing must go through
  // the global (GID-based) path instead of the LID.
  attrMask |= IbvLib::QP_AV;
  if (destinationInfo.localIdentifier != 0) {
    attr.ah_attr.is_global = 0;
    attr.ah_attr.dlid = destinationInfo.localIdentifier;
  } else {
    attr.ah_attr.is_global = 1;
    attr.ah_attr.grh.dgid = destinationInfo.globalIdentifier;
    attr.ah_attr.grh.sgid_index = selfAddr.globalIdentifierIndex;
    attr.ah_attr.grh.hop_limit = 1;
  }
  attr.ah_attr.port_num = selfAddr.portNum;

  // Use the smaller of the two sides' MTUs.
  attrMask |= IbvLib::QP_PATH_MTU;
  attr.path_mtu = std::min(
      selfAddr.maximumTransmissionUnit,
      destinationInfo.maximumTransmissionUnit);

  attrMask |= IbvLib::QP_DEST_QPN;
  attr.dest_qp_num = destinationInfo.queuePairNumber;

  // The packet sequence numbers of the local send and of the remote receive
  // queues (and vice versa) only need to match. Thus we set them all to zero.
  attrMask |= IbvLib::QP_RQ_PSN;
  attr.rq_psn = 0;

  attrMask |= IbvLib::QP_MAX_DEST_RD_ATOMIC;
  attr.max_dest_rd_atomic = 1;

  attrMask |= IbvLib::QP_MIN_RNR_TIMER;
  attr.min_rnr_timer = 20; // 10.24 milliseconds

  TP_CHECK_IBV_INT(ibvLib.modify_qp(qp.get(), &attr, attrMask));
}

// Moves the queue pair from RTR to the ready-to-send (RTS) state.
void transitionIbvQueuePairToReadyToSend(
    const IbvLib& ibvLib,
    IbvQueuePair& qp) {
  IbvLib::qp_attr attr;
  std::memset(&attr, 0, sizeof(attr));
  int attrMask = 0;

  attrMask |= IbvLib::QP_STATE;
  attr.qp_state = IbvLib::QPS_RTS;

  // The packet sequence numbers of the local send and of the remote receive
  // queues (and vice versa) only need to match. Thus we set them all to zero.
  attrMask |= IbvLib::QP_SQ_PSN;
  attr.sq_psn = 0;

  attrMask |= IbvLib::QP_TIMEOUT;
  attr.timeout = 14; // 67.1 milliseconds

  attrMask |= IbvLib::QP_RETRY_CNT;
  attr.retry_cnt = 7;

  attrMask |= IbvLib::QP_RNR_RETRY;
  attr.rnr_retry = 7; // infinite

  attrMask |= IbvLib::QP_MAX_QP_RD_ATOMIC;
  attr.max_rd_atomic = 1;

  TP_CHECK_IBV_INT(ibvLib.modify_qp(qp.get(), &attr, attrMask));
}

// Forces the queue pair into the error state (e.g. to flush outstanding
// work requests during shutdown).
void transitionIbvQueuePairToError(const IbvLib& ibvLib, IbvQueuePair& qp) {
  IbvLib::qp_attr attr;
  std::memset(&attr, 0, sizeof(attr));
  int attrMask = 0;

  attrMask |= IbvLib::QP_STATE;
  attr.qp_state = IbvLib::QPS_ERR;

  TP_CHECK_IBV_INT(ibvLib.modify_qp(qp.get(), &attr, attrMask));
}

} // namespace tensorpipe

// ============================================================
// FILE: tensorpipe/common/ibv.h
// ============================================================

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
*/ #pragma once #include #include #include namespace tensorpipe { // Error checking macros #define TP_CHECK_IBV_PTR(op) \ [&]() { \ auto ptr = op; \ TP_THROW_SYSTEM_IF(ptr == nullptr, errno); \ return ptr; \ }() #define TP_CHECK_IBV_INT(op) \ { \ int rv = op; \ TP_THROW_SYSTEM_IF(rv < 0, errno); \ } #define TP_CHECK_IBV_VOID(op) op; // Logging helpers std::string ibvWorkCompletionOpcodeToStr(IbvLib::wc_opcode opcode); // RAII wrappers class IbvDeviceList { private: IbvDeviceList(const IbvLib& ibvLib, IbvLib::device** ptr, int size) : deviceList_(ptr, Deleter{&ibvLib}), size_(size) {} public: IbvDeviceList() = default; static std::tuple create(const IbvLib& ibvLib) { int size; IbvLib::device** ptr = ibvLib.get_device_list(&size); if (ptr == nullptr) { // Earlier versions of libibverbs had a bug where errno would be set to // *negative* ENOSYS when the module wasn't found. This got fixed in // https://github.com/linux-rdma/rdma-core/commit/062bf1a72badaf6ad2d51ebe4c8c8bdccfc376e2 // However, to support those versions, we manually flip it in case. return std::make_tuple( TP_CREATE_ERROR( SystemError, "ibv_get_device_list", errno == -ENOSYS ? ENOSYS : errno), IbvDeviceList()); } return std::make_tuple(Error::kSuccess, IbvDeviceList(ibvLib, ptr, size)); } int size() { return size_; } IbvLib::device& operator[](int i) { return *deviceList_.get()[i]; } void reset() { deviceList_.reset(); } // FIXME Can we support a "range" API (i.e., a begin() and end() method) so // that this can be used in a for (auto& dev : deviceList) expression? 
private: struct Deleter { void operator()(IbvLib::device** ptr) { TP_CHECK_IBV_VOID(ibvLib->free_device_list(ptr)); } const IbvLib* ibvLib; }; std::unique_ptr deviceList_; int size_; }; struct IbvContextDeleter { void operator()(IbvLib::context* ptr) { TP_CHECK_IBV_INT(ibvLib->close_device(ptr)); } const IbvLib* ibvLib; }; using IbvContext = std::unique_ptr; inline IbvContext createIbvContext( const IbvLib& ibvLib, IbvLib::device& device) { return IbvContext( TP_CHECK_IBV_PTR(ibvLib.open_device(&device)), IbvContextDeleter{&ibvLib}); } struct IbvProtectionDomainDeleter { void operator()(IbvLib::pd* ptr) { TP_CHECK_IBV_INT(ibvLib->dealloc_pd(ptr)); } const IbvLib* ibvLib; }; using IbvProtectionDomain = std::unique_ptr; inline IbvProtectionDomain createIbvProtectionDomain( const IbvLib& ibvLib, IbvContext& context) { return IbvProtectionDomain( TP_CHECK_IBV_PTR(ibvLib.alloc_pd(context.get())), IbvProtectionDomainDeleter{&ibvLib}); } struct IbvCompletionQueueDeleter { void operator()(IbvLib::cq* ptr) { TP_CHECK_IBV_INT(ibvLib->destroy_cq(ptr)); } const IbvLib* ibvLib; }; using IbvCompletionQueue = std::unique_ptr; inline IbvCompletionQueue createIbvCompletionQueue( const IbvLib& ibvLib, IbvContext& context, int cqe, void* cq_context, IbvLib::comp_channel* channel, int comp_vector) { return IbvCompletionQueue( TP_CHECK_IBV_PTR(ibvLib.create_cq( context.get(), cqe, cq_context, channel, comp_vector)), IbvCompletionQueueDeleter{&ibvLib}); } struct IbvSharedReceiveQueueDeleter { void operator()(IbvLib::srq* ptr) { TP_CHECK_IBV_INT(ibvLib->destroy_srq(ptr)); } const IbvLib* ibvLib; }; using IbvSharedReceiveQueue = std::unique_ptr; inline IbvSharedReceiveQueue createIbvSharedReceiveQueue( const IbvLib& ibvLib, IbvProtectionDomain& pd, IbvLib::srq_init_attr& initAttr) { return IbvSharedReceiveQueue( TP_CHECK_IBV_PTR(ibvLib.create_srq(pd.get(), &initAttr)), IbvSharedReceiveQueueDeleter{&ibvLib}); } struct IbvMemoryRegionDeleter { void operator()(IbvLib::mr* ptr) { 
TP_CHECK_IBV_INT(ibvLib->dereg_mr(ptr)); } const IbvLib* ibvLib; }; using IbvMemoryRegion = std::unique_ptr; inline IbvMemoryRegion createIbvMemoryRegion( const IbvLib& ibvLib, IbvProtectionDomain& pd, void* addr, size_t length, int accessFlags) { return IbvMemoryRegion( TP_CHECK_IBV_PTR(ibvLib.reg_mr(pd.get(), addr, length, accessFlags)), IbvMemoryRegionDeleter{&ibvLib}); } struct IbvQueuePairDeleter { void operator()(IbvLib::qp* ptr) { TP_CHECK_IBV_INT(ibvLib->destroy_qp(ptr)); } const IbvLib* ibvLib; }; using IbvQueuePair = std::unique_ptr; inline IbvQueuePair createIbvQueuePair( const IbvLib& ibvLib, IbvProtectionDomain& pd, IbvLib::qp_init_attr& initAttr) { return IbvQueuePair( TP_CHECK_IBV_PTR(ibvLib.create_qp(pd.get(), &initAttr)), IbvQueuePairDeleter{&ibvLib}); } // Helpers struct IbvAddress { uint8_t portNum; uint8_t globalIdentifierIndex; // The already-resolved LID of the above device+port pair. uint32_t localIdentifier; // The already-resolved GID of the above device+port+index combination. 
IbvLib::gid globalIdentifier; IbvLib::mtu maximumTransmissionUnit; uint32_t maximumMessageSize; }; struct IbvSetupInformation { uint32_t localIdentifier; IbvLib::gid globalIdentifier; uint32_t queuePairNumber; IbvLib::mtu maximumTransmissionUnit; uint32_t maximumMessageSize; }; struct IbvAddress makeIbvAddress( const IbvLib& ibvLib, const IbvContext& context, uint8_t portNum, uint8_t globalIdentifierIndex); struct IbvSetupInformation makeIbvSetupInformation( const IbvAddress& addr, const IbvQueuePair& qp); void transitionIbvQueuePairToInit( const IbvLib& ibvLib, IbvQueuePair& qp, const IbvAddress& selfAddr); void transitionIbvQueuePairToReadyToReceive( const IbvLib& ibvLib, IbvQueuePair& qp, const IbvAddress& selfAddr, const IbvSetupInformation& destinationInfo); void transitionIbvQueuePairToReadyToSend( const IbvLib& ibvLib, IbvQueuePair& qp); void transitionIbvQueuePairToError(const IbvLib& ibvLib, IbvQueuePair& qp); } // namespace tensorpipe ================================================ FILE: tensorpipe/common/ibv_lib.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include namespace tensorpipe { // Master list of all symbols we care about from libibverbs. 
// X-macro over every function that is dlsym'd from libibverbs.so.1: each
// entry is _(symbol name without the "ibv_" prefix, return type, argument
// types). It is expanded below to declare the function-pointer fields, the
// forwarding call wrappers, and the symbol-loading code in a single place.
#define TP_FORALL_IBV_SYMBOLS(_)                                      \
  _(ack_async_event, void, (IbvLib::async_event*))                    \
  _(alloc_pd, IbvLib::pd*, (IbvLib::context*))                        \
  _(close_device, int, (IbvLib::context*))                            \
  _(create_cq,                                                        \
    IbvLib::cq*,                                                      \
    (IbvLib::context*, int, void*, IbvLib::comp_channel*, int))       \
  _(create_qp, IbvLib::qp*, (IbvLib::pd*, IbvLib::qp_init_attr*))     \
  _(create_srq, IbvLib::srq*, (IbvLib::pd*, IbvLib::srq_init_attr*))  \
  _(dealloc_pd, int, (IbvLib::pd*))                                   \
  _(dereg_mr, int, (IbvLib::mr*))                                     \
  _(destroy_cq, int, (IbvLib::cq*))                                   \
  _(destroy_qp, int, (IbvLib::qp*))                                   \
  _(destroy_srq, int, (IbvLib::srq*))                                 \
  _(event_type_str, const char*, (IbvLib::event_type))                \
  _(free_device_list, void, (IbvLib::device**))                       \
  _(get_async_event, int, (IbvLib::context*, IbvLib::async_event*))   \
  _(get_device_list, IbvLib::device**, (int*))                        \
  _(get_device_name, const char*, (IbvLib::device*))                  \
  _(modify_qp, int, (IbvLib::qp*, IbvLib::qp_attr*, int))             \
  _(open_device, IbvLib::context*, (IbvLib::device*))                 \
  _(query_gid, int, (IbvLib::context*, uint8_t, int, IbvLib::gid*))   \
  _(query_port, int, (IbvLib::context*, uint8_t, IbvLib::port_attr*)) \
  _(reg_mr, IbvLib::mr*, (IbvLib::pd*, void*, size_t, int))           \
  _(wc_status_str, const char*, (IbvLib::wc_status))

// Wrapper for libibverbs.
class IbvLib { public: // Constants enum { SYSFS_NAME_MAX = 64, SYSFS_PATH_MAX = 256 }; enum { WC_IP_CSUM_OK_SHIFT = 2 }; // Enums enum access_flags { ACCESS_LOCAL_WRITE = 1, ACCESS_REMOTE_WRITE = (1 << 1), ACCESS_REMOTE_READ = (1 << 2), ACCESS_REMOTE_ATOMIC = (1 << 3), ACCESS_MW_BIND = (1 << 4), ACCESS_ZERO_BASED = (1 << 5), ACCESS_ON_DEMAND = (1 << 6), ACCESS_HUGETLB = (1 << 7), ACCESS_RELAXED_ORDERING = (1 << 20), }; enum event_type { EVENT_CQ_ERR, EVENT_QP_FATAL, EVENT_QP_REQ_ERR, EVENT_QP_ACCESS_ERR, EVENT_COMM_EST, EVENT_SQ_DRAINED, EVENT_PATH_MIG, EVENT_PATH_MIG_ERR, EVENT_DEVICE_FATAL, EVENT_PORT_ACTIVE, EVENT_PORT_ERR, EVENT_LID_CHANGE, EVENT_PKEY_CHANGE, EVENT_SM_CHANGE, EVENT_SRQ_ERR, EVENT_SRQ_LIMIT_REACHED, EVENT_QP_LAST_WQE_REACHED, EVENT_CLIENT_REREGISTER, EVENT_GID_CHANGE, EVENT_WQ_FATAL, }; enum mig_state { MIG_MIGRATED, MIG_REARM, MIG_ARMED }; enum mtu { MTU_256 = 1, MTU_512 = 2, MTU_1024 = 3, MTU_2048 = 4, MTU_4096 = 5 }; enum mw_type { MW_TYPE_1 = 1, MW_TYPE_2 = 2 }; enum node_type { NODE_UNKNOWN = -1, NODE_CA = 1, NODE_SWITCH, NODE_ROUTER, NODE_RNIC, NODE_USNIC, NODE_USNIC_UDP, NODE_UNSPECIFIED, }; enum port_state { PORT_NOP = 0, PORT_DOWN = 1, PORT_INIT = 2, PORT_ARMED = 3, PORT_ACTIVE = 4, PORT_ACTIVE_DEFER = 5 }; enum qp_attr_mask { QP_STATE = 1 << 0, QP_CUR_STATE = 1 << 1, QP_EN_SQD_ASYNC_NOTIFY = 1 << 2, QP_ACCESS_FLAGS = 1 << 3, QP_PKEY_INDEX = 1 << 4, QP_PORT = 1 << 5, QP_QKEY = 1 << 6, QP_AV = 1 << 7, QP_PATH_MTU = 1 << 8, QP_TIMEOUT = 1 << 9, QP_RETRY_CNT = 1 << 10, QP_RNR_RETRY = 1 << 11, QP_RQ_PSN = 1 << 12, QP_MAX_QP_RD_ATOMIC = 1 << 13, QP_ALT_PATH = 1 << 14, QP_MIN_RNR_TIMER = 1 << 15, QP_SQ_PSN = 1 << 16, QP_MAX_DEST_RD_ATOMIC = 1 << 17, QP_PATH_MIG_STATE = 1 << 18, QP_CAP = 1 << 19, QP_DEST_QPN = 1 << 20, QP_RATE_LIMIT = 1 << 25, }; enum qp_state { QPS_RESET, QPS_INIT, QPS_RTR, QPS_RTS, QPS_SQD, QPS_SQE, QPS_ERR, QPS_UNKNOWN }; enum qp_type { QPT_RC = 2, QPT_UC, QPT_UD, QPT_RAW_PACKET = 8, QPT_XRC_SEND = 9, QPT_XRC_RECV, 
QPT_DRIVER = 0xff, }; enum transport_type { TRANSPORT_UNKNOWN = -1, TRANSPORT_IB = 0, TRANSPORT_IWARP, TRANSPORT_USNIC, TRANSPORT_USNIC_UDP, TRANSPORT_UNSPECIFIED, }; enum wc_flags { WC_GRH = 1 << 0, WC_WITH_IMM = 1 << 1, WC_IP_CSUM_OK = 1 << WC_IP_CSUM_OK_SHIFT, WC_WITH_INV = 1 << 3, WC_TM_SYNC_REQ = 1 << 4, WC_TM_MATCH = 1 << 5, WC_TM_DATA_VALID = 1 << 6, }; enum wc_opcode { WC_SEND, WC_RDMA_WRITE, WC_RDMA_READ, WC_COMP_SWAP, WC_FETCH_ADD, WC_BIND_MW, WC_LOCAL_INV, WC_TSO, WC_RECV = 1 << 7, WC_RECV_RDMA_WITH_IMM, WC_TM_ADD, WC_TM_DEL, WC_TM_SYNC, WC_TM_RECV, WC_TM_NO_TAG, WC_DRIVER1, }; enum wc_status { WC_SUCCESS, WC_LOC_LEN_ERR, WC_LOC_QP_OP_ERR, WC_LOC_EEC_OP_ERR, WC_LOC_PROT_ERR, WC_WR_FLUSH_ERR, WC_MW_BIND_ERR, WC_BAD_RESP_ERR, WC_LOC_ACCESS_ERR, WC_REM_INV_REQ_ERR, WC_REM_ACCESS_ERR, WC_REM_OP_ERR, WC_RETRY_EXC_ERR, WC_RNR_RETRY_EXC_ERR, WC_LOC_RDD_VIOL_ERR, WC_REM_INV_RD_REQ_ERR, WC_REM_ABORT_ERR, WC_INV_EECN_ERR, WC_INV_EEC_STATE_ERR, WC_FATAL_ERR, WC_RESP_TIMEOUT_ERR, WC_GENERAL_ERR, WC_TM_ERR, WC_TM_RNDV_INCOMPLETE, }; enum wr_opcode { WR_RDMA_WRITE, WR_RDMA_WRITE_WITH_IMM, WR_SEND, WR_SEND_WITH_IMM, WR_RDMA_READ, WR_ATOMIC_CMP_AND_SWP, WR_ATOMIC_FETCH_AND_ADD, WR_LOCAL_INV, WR_BIND_MW, WR_SEND_WITH_INV, WR_TSO, WR_DRIVER1, }; // Structs and unions // Forward declarations struct _compat_port_attr; struct ah; struct context; struct cq; struct device; struct mr; struct mw_bind; struct mw; struct pd; struct qp; struct srq; struct wq; // Attributes struct port_attr { IbvLib::port_state state; IbvLib::mtu max_mtu; IbvLib::mtu active_mtu; int gid_tbl_len; uint32_t port_cap_flags; uint32_t max_msg_sz; uint32_t bad_pkey_cntr; uint32_t qkey_viol_cntr; uint16_t pkey_tbl_len; uint16_t lid; uint16_t sm_lid; uint8_t lmc; uint8_t max_vl_num; uint8_t sm_sl; uint8_t subnet_timeout; uint8_t init_type_reply; uint8_t active_width; uint8_t active_speed; uint8_t phys_state; uint8_t link_layer; uint8_t flags; uint16_t port_cap_flags2; }; struct qp_cap { uint32_t max_send_wr; 
uint32_t max_recv_wr; uint32_t max_send_sge; uint32_t max_recv_sge; uint32_t max_inline_data; }; union gid { uint8_t raw[16]; struct { uint64_t subnet_prefix; uint64_t interface_id; } global; }; struct global_route { IbvLib::gid dgid; uint32_t flow_label; uint8_t sgid_index; uint8_t hop_limit; uint8_t traffic_class; }; struct ah_attr { IbvLib::global_route grh; uint16_t dlid; uint8_t sl; uint8_t src_path_bits; uint8_t static_rate; uint8_t is_global; uint8_t port_num; }; struct qp_attr { IbvLib::qp_state qp_state; IbvLib::qp_state cur_qp_state; IbvLib::mtu path_mtu; IbvLib::mig_state path_mig_state; uint32_t qkey; uint32_t rq_psn; uint32_t sq_psn; uint32_t dest_qp_num; unsigned int qp_access_flags; IbvLib::qp_cap cap; IbvLib::ah_attr ah_attr; IbvLib::ah_attr alt_ah_attr; uint16_t pkey_index; uint16_t alt_pkey_index; uint8_t en_sqd_async_notify; uint8_t sq_draining; uint8_t max_rd_atomic; uint8_t max_dest_rd_atomic; uint8_t min_rnr_timer; uint8_t port_num; uint8_t timeout; uint8_t retry_cnt; uint8_t rnr_retry; uint8_t alt_port_num; uint8_t alt_timeout; uint32_t rate_limit; }; struct qp_init_attr { void* qp_context; IbvLib::cq* send_cq; IbvLib::cq* recv_cq; IbvLib::srq* srq; IbvLib::qp_cap cap; IbvLib::qp_type qp_type; int sq_sig_all; }; struct srq_attr { uint32_t max_wr; uint32_t max_sge; uint32_t srq_limit; }; struct srq_init_attr { void* srq_context; IbvLib::srq_attr attr; }; // Work requests and completions struct sge { uint64_t addr; uint32_t length; uint32_t lkey; }; struct recv_wr { uint64_t wr_id; IbvLib::recv_wr* next; IbvLib::sge* sg_list; int num_sge; }; struct mw_bind_info { IbvLib::mr* mr; uint64_t addr; uint64_t length; unsigned int mw_access_flags; }; struct send_wr { uint64_t wr_id; IbvLib::send_wr* next; IbvLib::sge* sg_list; int num_sge; IbvLib::wr_opcode opcode; unsigned int send_flags; union { uint32_t imm_data; uint32_t invalidate_rkey; }; union { struct { uint64_t remote_addr; uint32_t rkey; } rdma; struct { uint64_t remote_addr; uint64_t 
compare_add; uint64_t swap; uint32_t rkey; } atomic; struct { IbvLib::ah* ah; uint32_t remote_qpn; uint32_t remote_qkey; } ud; } wr; union { struct { uint32_t remote_srqn; } xrc; } qp_type; union { struct { IbvLib::mw* mw; uint32_t rkey; IbvLib::mw_bind_info bind_info; } bind_mw; struct { void* hdr; uint16_t hdr_sz; uint16_t mss; } tso; }; }; struct wc { uint64_t wr_id; IbvLib::wc_status status; IbvLib::wc_opcode opcode; uint32_t vendor_err; uint32_t byte_len; union { uint32_t imm_data; uint32_t invalidated_rkey; }; uint32_t qp_num; uint32_t src_qp; unsigned int wc_flags; uint16_t pkey_index; uint16_t slid; uint8_t sl; uint8_t dlid_path_bits; }; // Main structs struct async_event { union { IbvLib::cq* cq; IbvLib::qp* qp; IbvLib::srq* srq; IbvLib::wq* wq; int port_num; } element; IbvLib::event_type event_type; }; struct comp_channel { IbvLib::context* context; int fd; int refcnt; }; struct context_ops { void* (*_compat_query_device)(void); int (*_compat_query_port)( IbvLib::context* context, uint8_t port_num, struct IbvLib::_compat_port_attr* port_attr); void* (*_compat_alloc_pd)(void); void* (*_compat_dealloc_pd)(void); void* (*_compat_reg_mr)(void); void* (*_compat_rereg_mr)(void); void* (*_compat_dereg_mr)(void); IbvLib::mw* (*alloc_mw)(IbvLib::pd* pd, IbvLib::mw_type type); int (*bind_mw)(IbvLib::qp* qp, IbvLib::mw* mw, IbvLib::mw_bind* mw_bind); int (*dealloc_mw)(IbvLib::mw* mw); void* (*_compat_create_cq)(void); int (*poll_cq)(IbvLib::cq* cq, int num_entries, IbvLib::wc* wc); int (*req_notify_cq)(IbvLib::cq* cq, int solicited_only); void* (*_compat_cq_event)(void); void* (*_compat_resize_cq)(void); void* (*_compat_destroy_cq)(void); void* (*_compat_create_srq)(void); void* (*_compat_modify_srq)(void); void* (*_compat_query_srq)(void); void* (*_compat_destroy_srq)(void); int (*post_srq_recv)( IbvLib::srq* srq, IbvLib::recv_wr* recv_wr, IbvLib::recv_wr** bad_recv_wr); void* (*_compat_create_qp)(void); void* (*_compat_query_qp)(void); void* 
(*_compat_modify_qp)(void); void* (*_compat_destroy_qp)(void); int (*post_send)( IbvLib::qp* qp, IbvLib::send_wr* wr, IbvLib::send_wr** bad_wr); int (*post_recv)( IbvLib::qp* qp, IbvLib::recv_wr* wr, IbvLib::recv_wr** bad_wr); void* (*_compat_create_ah)(void); void* (*_compat_destroy_ah)(void); void* (*_compat_attach_mcast)(void); void* (*_compat_detach_mcast)(void); void* (*_compat_async_event)(void); }; struct context { IbvLib::device* device; IbvLib::context_ops ops; int cmd_fd; int async_fd; int num_comp_vectors; pthread_mutex_t mutex; void* abi_compat; }; struct cq { IbvLib::context* context; IbvLib::comp_channel* channel; void* cq_context; uint32_t handle; int cqe; pthread_mutex_t mutex; pthread_cond_t cond; uint32_t comp_events_completed; uint32_t async_events_completed; }; struct _device_ops { IbvLib::context* (*_dummy1)(IbvLib::device* device, int cmd_fd); void (*_dummy2)(IbvLib::context* context); }; struct device { IbvLib::_device_ops _ops; IbvLib::node_type node_type; IbvLib::transport_type transport_type; char name[IbvLib::SYSFS_NAME_MAX]; char dev_name[IbvLib::SYSFS_NAME_MAX]; char dev_path[IbvLib::SYSFS_PATH_MAX]; char ibdev_path[IbvLib::SYSFS_PATH_MAX]; }; struct mr { IbvLib::context* context; IbvLib::pd* pd; void* addr; size_t length; uint32_t handle; uint32_t lkey; uint32_t rkey; }; struct pd { IbvLib::context* context; uint32_t handle; }; struct qp { IbvLib::context* context; void* qp_context; IbvLib::pd* pd; IbvLib::cq* send_cq; IbvLib::cq* recv_cq; IbvLib::srq* srq; uint32_t handle; uint32_t qp_num; IbvLib::qp_state state; IbvLib::qp_type qp_type; pthread_mutex_t mutex; pthread_cond_t cond; uint32_t events_completed; }; struct srq { IbvLib::context* context; void* srq_context; IbvLib::pd* pd; uint32_t handle; pthread_mutex_t mutex; pthread_cond_t cond; uint32_t events_completed; }; private: explicit IbvLib(DynamicLibraryHandle dlhandle) : dlhandle_(std::move(dlhandle)) {} DynamicLibraryHandle dlhandle_; #define TP_DECLARE_FIELD(function_name, 
return_type, args_types) \ return_type(*function_name##_ptr_) args_types = nullptr; TP_FORALL_IBV_SYMBOLS(TP_DECLARE_FIELD) #undef TP_DECLARE_FIELD public: IbvLib() = default; #define TP_FORWARD_CALL(function_name, return_type, args_types) \ template \ auto function_name(Args&&... args) const { \ return (*function_name##_ptr_)(std::forward(args)...); \ } TP_FORALL_IBV_SYMBOLS(TP_FORWARD_CALL) #undef TP_FORWARD_CALL static std::tuple create() { Error error; DynamicLibraryHandle dlhandle; // To keep things "neat" and contained, we open in "local" mode (as opposed // to global) so that the ibverbs symbols can only be resolved through this // handle and are not exposed (a.k.a., "leaded") to other shared objects. std::tie(error, dlhandle) = DynamicLibraryHandle::create("libibverbs.so.1", RTLD_LOCAL | RTLD_LAZY); if (error) { return std::make_tuple(std::move(error), IbvLib()); } // Log at level 9 as we can't know whether this will be used in a transport // or channel, thus err on the side of this being as low-level as possible // because we don't expect this to be of interest that often. 
TP_VLOG(9) << [&]() -> std::string { std::string filename; std::tie(error, filename) = dlhandle.getFilename(); if (error) { return "Couldn't determine location of shared library libibverbs.so.1: " + error.what(); } return "Found shared library libibverbs.so.1 at " + filename; }(); IbvLib lib(std::move(dlhandle)); #define TP_LOAD_SYMBOL(function_name, return_type, args_types) \ { \ void* ptr; \ std::tie(error, ptr) = lib.dlhandle_.loadSymbol("ibv_" #function_name); \ if (error) { \ return std::make_tuple(std::move(error), IbvLib()); \ } \ TP_THROW_ASSERT_IF(ptr == nullptr); \ lib.function_name##_ptr_ = \ reinterpret_cast(ptr); \ } TP_FORALL_IBV_SYMBOLS(TP_LOAD_SYMBOL) #undef TP_LOAD_SYMBOL return std::make_tuple(Error::kSuccess, std::move(lib)); } // These functions (which, it would seem, are the ones that are used in the // critical control path, and which thus must have the lowest latency and // avoid any syscall/kernel overhead) are not exposed as symbols of // libibverbs.so: they are defined inline in the header and, in fact, they // access a function pointer stored on the ibv_context and execute it. int poll_cq(IbvLib::cq* cq, int num_entries, IbvLib::wc* wc) const { return cq->context->ops.poll_cq(cq, num_entries, wc); } int post_send(IbvLib::qp* qp, IbvLib::send_wr* wr, IbvLib::send_wr** bad_wr) const { return qp->context->ops.post_send(qp, wr, bad_wr); } int post_recv(IbvLib::qp* qp, IbvLib::recv_wr* wr, IbvLib::recv_wr** bad_wr) const { return qp->context->ops.post_recv(qp, wr, bad_wr); } int post_srq_recv( IbvLib::srq* srq, IbvLib::recv_wr* recv_wr, IbvLib::recv_wr** bad_recv_wr) const { return srq->context->ops.post_srq_recv(srq, recv_wr, bad_recv_wr); } }; #undef TP_FORALL_IBV_SYMBOLS } // namespace tensorpipe ================================================ FILE: tensorpipe/common/memory.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include namespace tensorpipe { class MmappedPtr { MmappedPtr(uint8_t* ptr, size_t length) { ptr_ = decltype(ptr_)(ptr, Deleter{length}); } public: MmappedPtr() = default; static std::tuple create( size_t length, int prot, int flags, int fd) { void* ptr; ptr = ::mmap(nullptr, length, prot, flags, fd, 0); if (ptr == MAP_FAILED) { return std::make_tuple( TP_CREATE_ERROR(SystemError, "mmap", errno), MmappedPtr()); } return std::make_tuple( Error::kSuccess, MmappedPtr(reinterpret_cast(ptr), length)); } uint8_t* ptr() { return ptr_.get(); } const uint8_t* ptr() const { return ptr_.get(); } size_t getLength() const { return ptr_.get_deleter().length; } void reset() { ptr_.reset(); } private: struct Deleter { size_t length; void operator()(void* ptr) { int ret = ::munmap(ptr, length); TP_THROW_SYSTEM_IF(ret != 0, errno); } }; std::unique_ptr ptr_{nullptr, Deleter{}}; }; } // namespace tensorpipe ================================================ FILE: tensorpipe/common/nop.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include namespace tensorpipe { // Libnop makes heavy use of templates, whereas TensorPipe is designed around // polymorphism (abstract interfaces and concrete derived classes). The two // don't mix well: for example, one can't have virtual method templates. One // technique to get around this is type erasure, which is however tricky to get // right because the "fundamental" operation(s) of libnop, (de)serialization, // are simultaneously templated on two types: the reader/writer and the object. 
// Ideally we'd like for both these sets of types to be dynamically extensible,
// as we want to allow transports to provide their own specialized readers and
// writers, and channels could have their own custom objects that they want to
// (de)serialize. New transports and channels could be implemented by third
// parties and plugged in at runtime, so the sets of reader/writers and of
// objects that we must support can't be known in advance.
// We had originally found a solution to this pickle by doing two type erasures
// one after the other, first on the reader/writer, which deals with bytes and
// not objects and is thus not templated, and then on objects, leveraging the
// fact that there is one libnop (de)serializer that takes a *pointer* to a
// reader/writer giving us a "hook" on which to do polymorphism, by hardcoding a
// pointer to the base reader/writer class as template parameter, but then
// passing in an instance of a concrete subclass at runtime.
// However it turned out that this performed poorly, apparently due to the
// (de)serialization process consisting of many small calls to the reader/writer
// which each had to perform a vtable lookup. So, instead, we decided to not
// allow transports to utilize custom specialized readers/writers and to provide
// a single global reader/writer class that is able to cover the two main usage
// patterns we think are most likely to come up: reading/writing to a temporary
// contiguous buffer, and reading/writing to a ringbuffer.
// This reader and writer can operate either on one single buffer (ptr + len) or
// on two buffers: in the latter case, they first consume the first one and,
// when that fills up, they "spill over" into the second one. This is needed in
// order to support the "wrap around" point in ringbuffers.
class NopReader final { public: NopReader(const uint8_t* ptr, size_t len) : ptr1_(ptr), len1_(len) {} NopReader(const uint8_t* ptr1, size_t len1, const uint8_t* ptr2, size_t len2) : ptr1_(ptr1), len1_(len1), ptr2_(ptr2), len2_(len2) {} // NOLINTNEXTLINE(readability-identifier-naming) nop::Status Ensure(size_t size) { if (likely(size <= len1_ + len2_)) { return nop::ErrorStatus::None; } else { return nop::ErrorStatus::ReadLimitReached; } } // NOLINTNEXTLINE(readability-identifier-naming) nop::Status Read(uint8_t* byte) { if (unlikely(len1_ == 0)) { ptr1_ = ptr2_; len1_ = len2_; ptr2_ = nullptr; len2_ = 0; } *byte = *ptr1_; ptr1_++; len1_--; return nop::ErrorStatus::None; } // NOLINTNEXTLINE(readability-identifier-naming) nop::Status Read(void* begin, void* end) { size_t size = reinterpret_cast(end) - reinterpret_cast(begin); if (unlikely(len1_ < size)) { std::memcpy(begin, ptr1_, len1_); begin = reinterpret_cast(begin) + len1_; size -= len1_; ptr1_ = ptr2_; len1_ = len2_; ptr2_ = nullptr; len2_ = 0; } std::memcpy(begin, ptr1_, size); ptr1_ += size; len1_ -= size; return nop::ErrorStatus::None; } // NOLINTNEXTLINE(readability-identifier-naming) nop::Status Skip(size_t paddingBytes) { if (unlikely(len1_ < paddingBytes)) { paddingBytes -= len1_; ptr1_ = ptr2_; len1_ = len2_; ptr2_ = nullptr; len2_ = 0; } ptr1_ += paddingBytes; len1_ -= paddingBytes; return nop::ErrorStatus::None; } private: const uint8_t* ptr1_ = nullptr; size_t len1_ = 0; const uint8_t* ptr2_ = nullptr; size_t len2_ = 0; }; class NopWriter final { public: NopWriter(uint8_t* ptr, size_t len) : ptr1_(ptr), len1_(len) {} NopWriter(uint8_t* ptr1, size_t len1, uint8_t* ptr2, size_t len2) : ptr1_(ptr1), len1_(len1), ptr2_(ptr2), len2_(len2) {} // NOLINTNEXTLINE(readability-identifier-naming) nop::Status Prepare(size_t size) { if (likely(size <= len1_ + len2_)) { return nop::ErrorStatus::None; } else { return nop::ErrorStatus::WriteLimitReached; } } // NOLINTNEXTLINE(readability-identifier-naming) 
nop::Status Write(uint8_t byte) { if (unlikely(len1_ == 0)) { ptr1_ = ptr2_; len1_ = len2_; ptr2_ = nullptr; len2_ = 0; } *ptr1_ = byte; ptr1_++; len1_--; return nop::ErrorStatus::None; } // NOLINTNEXTLINE(readability-identifier-naming) nop::Status Write(const void* begin, const void* end) { size_t size = reinterpret_cast(end) - reinterpret_cast(begin); if (unlikely(len1_ < size)) { std::memcpy(ptr1_, begin, len1_); begin = reinterpret_cast(begin) + len1_; size -= len1_; ptr1_ = ptr2_; len1_ = len2_; ptr2_ = nullptr; len2_ = 0; } std::memcpy(ptr1_, begin, size); ptr1_ += size; len1_ -= size; return nop::ErrorStatus::None; } // NOLINTNEXTLINE(readability-identifier-naming) nop::Status Skip(size_t paddingBytes, uint8_t paddingValue) { if (unlikely(len1_ < paddingBytes)) { std::memset(ptr1_, paddingValue, paddingBytes); paddingBytes -= len1_; ptr1_ = ptr2_; len1_ = len2_; ptr2_ = nullptr; len2_ = 0; } std::memset(ptr1_, paddingValue, paddingBytes); ptr1_ += paddingBytes; len1_ -= paddingBytes; return nop::ErrorStatus::None; } private: uint8_t* ptr1_ = nullptr; size_t len1_ = 0; uint8_t* ptr2_ = nullptr; size_t len2_ = 0; }; // The helpers to perform type erasure of the object type: a untemplated base // class exposing the methods we need for (de)serialization, and then templated // subclasses allowing to create a holder for each concrete libnop type. 
class AbstractNopHolder { public: virtual size_t getSize() const = 0; virtual nop::Status write(NopWriter& writer) const = 0; virtual nop::Status read(NopReader& reader) = 0; virtual ~AbstractNopHolder() = default; }; template class NopHolder : public AbstractNopHolder { public: T& getObject() { return object_; } const T& getObject() const { return object_; } size_t getSize() const override { return nop::Encoding::Size(object_); } nop::Status write(NopWriter& writer) const override { return nop::Encoding::Write(object_, &writer); } nop::Status read(NopReader& reader) override { return nop::Encoding::Read(&object_, &reader); } private: T object_; }; } // namespace tensorpipe namespace nop { // The `nop::Encoding` specialization for `tensorpipe::optional` was inspired // by that of `nop::Optional`, available here: // https://github.com/google/libnop/blob/master/include/nop/base/optional.h template struct Encoding> : EncodingIO> { using Type = tensorpipe::optional; // NOLINTNEXTLINE(readability-identifier-naming) static constexpr EncodingByte Prefix(const Type& value) { return value ? Encoding::Prefix(value.value()) : EncodingByte::Nil; } // NOLINTNEXTLINE(readability-identifier-naming) static constexpr std::size_t Size(const Type& value) { return value ? 
Encoding::Size(value.value()) : BaseEncodingSize(EncodingByte::Nil); } // NOLINTNEXTLINE(readability-identifier-naming) static constexpr bool Match(EncodingByte prefix) { return prefix == EncodingByte::Nil || Encoding::Match(prefix); } template // NOLINTNEXTLINE(readability-identifier-naming) static constexpr Status WritePayload( EncodingByte prefix, const Type& value, Writer* writer) { if (value) { return Encoding::WritePayload(prefix, value.value(), writer); } else { return {}; } } template // NOLINTNEXTLINE(readability-identifier-naming) static constexpr Status ReadPayload( EncodingByte prefix, Type* value, Reader* reader) { if (prefix == EncodingByte::Nil) { value->reset(); } else { T temp; auto status = Encoding::ReadPayload(prefix, &temp, reader); if (!status) { return status; } *value = std::move(temp); } return {}; } }; } // namespace nop ================================================ FILE: tensorpipe/common/nvml_lib.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #define TP_NVML_CHECK(nvml_lib, a) \ do { \ nvmlReturn_t error = (a); \ if (error != NVML_SUCCESS) { \ const char* errorStr; \ errorStr = (nvml_lib).errorString(error); \ TP_THROW_ASSERT() << __TP_EXPAND_OPD(a) << " " << errorStr; \ } \ } while (false) namespace tensorpipe { // Master list of all symbols we care about from libnvidia-ml. 
#define TP_FORALL_NVML_SYMBOLS(_) \ _(deviceGetComputeRunningProcesses, \ nvmlDeviceGetComputeRunningProcesses, \ nvmlReturn_t, \ (nvmlDevice_t, unsigned int*, nvmlProcessInfo_t*)) \ _(deviceGetCount_v2, nvmlDeviceGetCount_v2, nvmlReturn_t, (unsigned int*)) \ _(deviceGetHandleByIndex_v2, \ nvmlDeviceGetHandleByIndex_v2, \ nvmlReturn_t, \ (unsigned int, nvmlDevice_t*)) \ _(deviceGetHandleByUUID, \ nvmlDeviceGetHandleByUUID, \ nvmlReturn_t, \ (const char*, nvmlDevice_t*)) \ _(deviceGetP2PStatus, \ nvmlDeviceGetP2PStatus, \ nvmlReturn_t, \ (nvmlDevice_t, nvmlDevice_t, nvmlGpuP2PCapsIndex_t, nvmlGpuP2PStatus_t*)) \ _(deviceGetUUID, \ nvmlDeviceGetUUID, \ nvmlReturn_t, \ (nvmlDevice_t, char*, unsigned int)) \ _(errorString, nvmlErrorString, const char*, (nvmlReturn_t)) \ _(init_v2, nvmlInit_v2, nvmlReturn_t, ()) \ _(shutdown, nvmlShutdown, nvmlReturn_t, ()) // Wrapper for libnvidia-ml. class NvmlLib { private: explicit NvmlLib(DynamicLibraryHandle dlhandle) : dlhandle_(std::move(dlhandle)) {} DynamicLibraryHandle dlhandle_; bool inited_ = false; #define TP_DECLARE_FIELD(method_name, function_name, return_type, args_types) \ return_type(*function_name##_ptr_) args_types = nullptr; TP_FORALL_NVML_SYMBOLS(TP_DECLARE_FIELD) #undef TP_DECLARE_FIELD public: NvmlLib() = default; // Implement another RAII layer (on top of the one of DynamicLibraryHandle) to // deal with nvmlInit_v2 and nvmlShutdown. The default move assignment would // fail to shutdown NVML when another instance is moved into it, and it would // cause the destructor to shutdown a moved-out instance. 
NvmlLib(const NvmlLib&) = delete; NvmlLib& operator=(const NvmlLib&) = delete; NvmlLib(NvmlLib&& other) { *this = std::move(other); } NvmlLib& operator=(NvmlLib&& other) { std::swap(dlhandle_, other.dlhandle_); std::swap(inited_, other.inited_); #define TP_SWAP_FIELD(method_name, function_name, return_type, args_types) \ std::swap(function_name##_ptr_, other.function_name##_ptr_); TP_FORALL_NVML_SYMBOLS(TP_SWAP_FIELD) #undef TP_SWAP_FIELD return *this; } #define TP_FORWARD_CALL(method_name, function_name, return_type, args_types) \ template \ auto method_name(Args&&... args) const { \ return (*function_name##_ptr_)(std::forward(args)...); \ } TP_FORALL_NVML_SYMBOLS(TP_FORWARD_CALL) #undef TP_FORWARD_CALL static std::tuple create() { Error error; DynamicLibraryHandle dlhandle; // To keep things "neat" and contained, we open in "local" mode (as // opposed to global) so that the cuda symbols can only be resolved // through this handle and are not exposed (a.k.a., "leaked") to other // shared objects. std::tie(error, dlhandle) = DynamicLibraryHandle::create( "libnvidia-ml.so.1", RTLD_LOCAL | RTLD_LAZY); if (error) { return std::make_tuple(std::move(error), NvmlLib()); } // Log at level 9 as we can't know whether this will be used in a transport // or channel, thus err on the side of this being as low-level as possible // because we don't expect this to be of interest that often. 
TP_VLOG(9) << [&]() -> std::string { std::string filename; std::tie(error, filename) = dlhandle.getFilename(); if (error) { return "Couldn't determine location of shared library libnvidia-ml.so.1: " + error.what(); } return "Found shared library libnvidia-ml.so.1 at " + filename; }(); NvmlLib lib(std::move(dlhandle)); #define TP_LOAD_SYMBOL(method_name, function_name, return_type, args_types) \ { \ void* ptr; \ std::tie(error, ptr) = lib.dlhandle_.loadSymbol(#function_name); \ if (error) { \ return std::make_tuple(std::move(error), NvmlLib()); \ } \ TP_THROW_ASSERT_IF(ptr == nullptr); \ lib.function_name##_ptr_ = \ reinterpret_cast(ptr); \ } TP_FORALL_NVML_SYMBOLS(TP_LOAD_SYMBOL) #undef TP_LOAD_SYMBOL TP_NVML_CHECK(lib, lib.init_v2()); lib.inited_ = true; return std::make_tuple(Error::kSuccess, std::move(lib)); } ~NvmlLib() { if (inited_) { TP_DCHECK(dlhandle_.hasValue()); TP_NVML_CHECK(*this, shutdown()); } } }; #undef TP_FORALL_NVML_SYMBOLS } // namespace tensorpipe ================================================ FILE: tensorpipe/common/optional.h ================================================ #pragma once #include namespace tensorpipe { using std::optional; using std::nullopt; } // namespace tensorpipe ================================================ FILE: tensorpipe/common/queue.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include #include namespace tensorpipe { template class Queue { public: explicit Queue(int capacity = 1) : capacity_(capacity) {} void push(T t) { std::unique_lock lock(mutex_); while (items_.size() >= capacity_) { cv_.wait(lock); } items_.push_back(std::move(t)); cv_.notify_all(); } T pop() { std::unique_lock lock(mutex_); while (items_.size() == 0) { cv_.wait(lock); } T t(std::move(items_.front())); items_.pop_front(); cv_.notify_all(); return t; } private: std::mutex mutex_; std::condition_variable cv_; const int capacity_; std::deque items_; }; } // namespace tensorpipe ================================================ FILE: tensorpipe/common/ringbuffer.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include /// /// C++17 implementation of shared-memory friendly perf_event style ringbuffer. /// It's designed to avoid parallel access and provide (almost) zero-copy /// /// /// A ringbuffer has a header and a data members that can be allocated /// independently from the ringbuffer object, allowing the ringbuffer object /// to be stored in process' exclusive memory while header and data /// could be in shared memory. /// /// Multiple ringbuffers can reference the same header + data. /// /// Multiple producers (or consumers) can reference the same ringbuffer. /// /// Synchronization between all producers/consumers of all ringbuffers that /// reference the same header + pair pairs is done using atomic operations /// care is taken to guarantee lock-free implementations, reduce the usage /// of LOCK prefixes and the access to non-exclusive cache lines by CPUs. 
/// /// Producers write data atomically at ringbuffer's head, while Consumers /// write data atomically at ringbuffer's tail. /// namespace tensorpipe { /// /// RingBufferHeader contains the head, tail and other control information /// of the RingBuffer. /// /// is the minimum byte size of the circular buffer. The actual /// size is the smallest power of 2 larger than kMinByteSize_. Enforcing the /// size to be a power of two avoids costly division/modulo operations. /// template class RingBufferHeader { public: static_assert(NumRoles > 0, ""); const uint64_t kDataPoolByteSize; const uint64_t kDataModMask; RingBufferHeader(const RingBufferHeader&) = delete; RingBufferHeader(RingBufferHeader&&) = delete; // Implementation uses power of 2 arithmetic to avoid costly modulo. // So build the largest RingBuffer with size of the smallest power of 2 >= // . explicit RingBufferHeader(uint64_t minDataByteSize) : kDataPoolByteSize{nextPow2(minDataByteSize)}, kDataModMask{kDataPoolByteSize - 1} { // Minimum size where implementation of bit shift arithmetic works. TP_DCHECK_GE(kDataPoolByteSize, 2) << "Minimum supported ringbuffer data size is 2 bytes"; TP_DCHECK(isPow2(kDataPoolByteSize)) << kDataPoolByteSize << " is not a power of 2"; TP_DCHECK_LE(kDataPoolByteSize, std::numeric_limits::max()) << "Logic piggy-backs read/write size on ints, to be safe forbid" " buffer to ever be larger than what an int can hold"; for (int roleIdx = 0; roleIdx < NumRoles; ++roleIdx) { inTx_[roleIdx].clear(); markers_[roleIdx] = 0; } } // Being in a transaction (either a read or a write one) gives a user of the // ringbuffer (either a consumer or a producer, respectively) the right to // read the head and tail and to modify the one they are responsible for (the // tail and the head, respectively). Accessing the head or tail outside of a // transaction could lead to races. 
This also means we need memory barriers // around a transaction, to make sure side-effects of other users are visible // upon entering and our side effects become visible to others upon exiting. // We also must prevent the compiler from reordering memory accesses. Failure // to do so may result in our reads of head/tail to look like they occurred // before we entered the transaction, and writes to them to look like they // occurred after we exited it. In order to get the desired behavior, we use // the acquire memory order when starting a transaction (which means no later // memory access can be moved before it) and the release memory order when // ending it (no earlier memory access can be moved after it). template [[nodiscard]] bool beginTransaction() { static_assert(0 <= RoleIdx && RoleIdx < NumRoles, ""); return inTx_[RoleIdx].test_and_set(std::memory_order_acquire); } template void endTransaction() { static_assert(0 <= RoleIdx && RoleIdx < NumRoles, ""); inTx_[RoleIdx].clear(std::memory_order_release); } // Reading the head and tail is what gives a user of the ringbuffer (either a // consumer or a producer) the right to access the buffer's contents: the // producer can write on [head, tail) (modulo the size), the consumer can read // from [tail, head). And, when the producer increases the head, or when the // consumer increases the tail, they give users of the opposite type the right // to access some of the memory that was previously under their control. Thus, // just like we do for the transactions, we need memory barriers around reads // and writes to the head and tail, with the same reasoning for memory orders. 
template uint64_t readMarker() const { static_assert(0 <= RoleIdx && RoleIdx < NumRoles, ""); return markers_[RoleIdx].load(std::memory_order_acquire); } template void incMarker(uint64_t inc) { static_assert(0 <= RoleIdx && RoleIdx < NumRoles, ""); markers_[RoleIdx].fetch_add(inc, std::memory_order_release); } protected: std::array inTx_; std::array, NumRoles> markers_; // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2007/n2427.html#atomics.lockfree // static_assert( // decltype(markers_)::value_type::is_always_lock_free, // "Only lock-free atomics are supported"); }; /// /// Process' view of a ring buffer. /// This cannot reside in shared memory since it has pointers. /// template class RingBuffer final { public: RingBuffer() = default; RingBuffer(RingBufferHeader* header, uint8_t* data) : header_(header), data_(data) { TP_THROW_IF_NULLPTR(header_) << "Header cannot be nullptr"; TP_THROW_IF_NULLPTR(data_) << "Data cannot be nullptr"; } const RingBufferHeader& getHeader() const { return *header_; } RingBufferHeader& getHeader() { return *header_; } const uint8_t* getData() const { return data_; } uint8_t* getData() { return data_; } protected: RingBufferHeader* header_ = nullptr; uint8_t* data_ = nullptr; }; } // namespace tensorpipe ================================================ FILE: tensorpipe/common/ringbuffer_read_write_ops.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include #include #include namespace tensorpipe { // Reads happen only if the user supplied a callback (and optionally // a destination buffer). The callback is run from the event loop // thread upon receiving a notification from our peer. 
// // The memory pointer argument to the callback is valid only for the // duration of the callback. If the memory contents must be // preserved for longer, it must be copied elsewhere. // class RingbufferReadOperation { enum Mode { READ_LENGTH, READ_PAYLOAD, }; public: using read_callback_fn = std::function; // Read into a user-provided buffer of known length. inline RingbufferReadOperation(void* ptr, size_t len, read_callback_fn fn); // Read into an auto-allocated buffer, whose length is read from the wire. explicit inline RingbufferReadOperation(read_callback_fn fn); // Read into a user-provided libnop object, read length from the wire. inline RingbufferReadOperation( AbstractNopHolder* nopObject, read_callback_fn fn); // Processes a pending read. template inline size_t handleRead(RingBufferRole& inbox); bool completed() const { return (mode_ == READ_PAYLOAD && bytesRead_ == len_); } inline void handleError(const Error& error); private: Mode mode_{READ_LENGTH}; void* ptr_{nullptr}; AbstractNopHolder* nopObject_{nullptr}; std::unique_ptr buf_; size_t len_{0}; size_t bytesRead_{0}; read_callback_fn fn_; // Use a separare flag, rather than checking if ptr_ == nullptr, to catch the // case of a user explicitly passing in a nullptr with length zero, in which // case we must check that the length matches the header we see on the wire. const bool ptrProvided_; template inline ssize_t readNopObject(RingBufferRole& inbox); }; // Writes happen only if the user supplied a memory pointer, the // number of bytes to write, and a callback to execute upon // completion of the write. // // The memory pointed to by the pointer may only be reused or freed // after the callback has been called. // class RingbufferWriteOperation { enum Mode { WRITE_LENGTH, WRITE_PAYLOAD, }; public: using write_callback_fn = std::function; // Write from a user-provided buffer of known length. 
inline RingbufferWriteOperation( const void* ptr, size_t len, write_callback_fn fn); // Write from a user-provided libnop object. inline RingbufferWriteOperation( const AbstractNopHolder* nopObject, write_callback_fn fn); template inline size_t handleWrite(RingBufferRole& outbox); bool completed() const { return (mode_ == WRITE_PAYLOAD && bytesWritten_ == len_); } inline void handleError(const Error& error); private: Mode mode_{WRITE_LENGTH}; const void* ptr_{nullptr}; const AbstractNopHolder* nopObject_{nullptr}; size_t len_{0}; size_t bytesWritten_{0}; write_callback_fn fn_; template inline ssize_t writeNopObject(RingBufferRole& outbox); }; RingbufferReadOperation::RingbufferReadOperation( void* ptr, size_t len, read_callback_fn fn) : ptr_(ptr), len_(len), fn_(std::move(fn)), ptrProvided_(true) {} RingbufferReadOperation::RingbufferReadOperation(read_callback_fn fn) : fn_(std::move(fn)), ptrProvided_(false) {} RingbufferReadOperation::RingbufferReadOperation( AbstractNopHolder* nopObject, read_callback_fn fn) : nopObject_(nopObject), fn_(std::move(fn)), ptrProvided_(false) {} template size_t RingbufferReadOperation::handleRead( RingBufferRole& inbox) { ssize_t ret; size_t bytesReadNow = 0; // Start read transaction. This end of the connection is the only consumer for // this ringbuffer, and all reads are done from the reactor thread, so there // cannot be another transaction already going on. Fail hard in case. 
ret = inbox.startTx(); TP_THROW_SYSTEM_IF(ret < 0, -ret); if (mode_ == READ_LENGTH) { uint32_t length; ret = inbox.template readInTx( &length, sizeof(length)); if (likely(ret >= 0)) { mode_ = READ_PAYLOAD; bytesReadNow += ret; if (nopObject_ != nullptr) { len_ = length; } else if (ptrProvided_) { TP_DCHECK_EQ(length, len_); } else { len_ = length; buf_ = std::make_unique(len_); ptr_ = buf_.get(); } } else if (unlikely(ret != -ENODATA)) { TP_THROW_SYSTEM(-ret); } } if (mode_ == READ_PAYLOAD) { if (nopObject_ != nullptr) { ret = readNopObject(inbox); } else { ret = inbox.template readInTx( reinterpret_cast(ptr_) + bytesRead_, len_ - bytesRead_); } if (likely(ret >= 0)) { bytesRead_ += ret; bytesReadNow += ret; } else if (unlikely(ret != -ENODATA)) { TP_THROW_SYSTEM(-ret); } } ret = inbox.commitTx(); TP_THROW_SYSTEM_IF(ret < 0, -ret); if (completed()) { fn_(Error::kSuccess, ptr_, len_); } return bytesReadNow; } template ssize_t RingbufferReadOperation::readNopObject( RingBufferRole& inbox) { TP_THROW_ASSERT_IF(len_ > inbox.getSize()); ssize_t numBuffers; std::array::Buffer, 2> buffers; std::tie(numBuffers, buffers) = inbox.template accessContiguousInTx(len_); if (unlikely(numBuffers < 0)) { return numBuffers; } NopReader reader( buffers[0].ptr, buffers[0].len, buffers[1].ptr, buffers[1].len); nop::Status status = nopObject_->read(reader); if (status.error() == nop::ErrorStatus::ReadLimitReached) { return -ENODATA; } else if (status.has_error()) { return -EINVAL; } return len_; } void RingbufferReadOperation::handleError(const Error& error) { fn_(error, nullptr, 0); } RingbufferWriteOperation::RingbufferWriteOperation( const void* ptr, size_t len, write_callback_fn fn) : ptr_(ptr), len_(len), fn_(std::move(fn)) {} RingbufferWriteOperation::RingbufferWriteOperation( const AbstractNopHolder* nopObject, write_callback_fn fn) : nopObject_(nopObject), len_(nopObject_->getSize()), fn_(std::move(fn)) {} template size_t RingbufferWriteOperation::handleWrite( RingBufferRole& 
outbox) { ssize_t ret; size_t bytesWrittenNow = 0; // Start write transaction. This end of the connection is the only producer // for this ringbuffer, and all writes are done from the reactor thread, so // there cannot be another transaction already going on. Fail hard in case. ret = outbox.startTx(); TP_THROW_SYSTEM_IF(ret < 0, -ret); if (mode_ == WRITE_LENGTH) { uint32_t length = len_; ret = outbox.template writeInTx( &length, sizeof(length)); if (likely(ret >= 0)) { mode_ = WRITE_PAYLOAD; bytesWrittenNow += ret; } else if (unlikely(ret != -ENODATA)) { TP_THROW_SYSTEM(-ret); } } if (mode_ == WRITE_PAYLOAD) { if (nopObject_ != nullptr) { ret = writeNopObject(outbox); } else { ret = outbox.template writeInTx( reinterpret_cast(ptr_) + bytesWritten_, len_ - bytesWritten_); } if (likely(ret >= 0)) { bytesWritten_ += ret; bytesWrittenNow += ret; } else if (unlikely(ret != -ENODATA)) { TP_THROW_SYSTEM(-ret); } } ret = outbox.commitTx(); TP_THROW_SYSTEM_IF(ret < 0, -ret); if (completed()) { fn_(Error::kSuccess); } return bytesWrittenNow; } template ssize_t RingbufferWriteOperation::writeNopObject( RingBufferRole& outbox) { TP_THROW_ASSERT_IF(len_ > outbox.getSize()); ssize_t numBuffers; std::array::Buffer, 2> buffers; std::tie(numBuffers, buffers) = outbox.template accessContiguousInTx(len_); if (unlikely(numBuffers < 0)) { return numBuffers; } NopWriter writer( buffers[0].ptr, buffers[0].len, buffers[1].ptr, buffers[1].len); nop::Status status = nopObject_->write(writer); if (status.error() == nop::ErrorStatus::WriteLimitReached) { return -ENODATA; } else if (status.has_error()) { return -EINVAL; } return len_; } void RingbufferWriteOperation::handleError(const Error& error) { fn_(error); } } // namespace tensorpipe ================================================ FILE: tensorpipe/common/ringbuffer_role.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include namespace tensorpipe { /// /// Role of a RingBuffer. /// /// Provides methods to read and write data into a ringbuffer. /// template class RingBufferRole { public: static_assert(0 <= RoleIdx && RoleIdx < NumRoles, ""); RingBufferRole() = delete; explicit RingBufferRole(RingBuffer& rb) : header_{rb.getHeader()}, data_{rb.getData()} { TP_THROW_IF_NULLPTR(data_); } RingBufferRole(const RingBufferRole&) = delete; RingBufferRole(RingBufferRole&&) = delete; RingBufferRole& operator=(const RingBufferRole&) = delete; RingBufferRole& operator=(RingBufferRole&&) = delete; ~RingBufferRole() noexcept { TP_THROW_ASSERT_IF(inTx()); } size_t getSize() const { return header_.kDataPoolByteSize; } // // Transaction based API. // // Only one instance of a role can have an active transaction at any time. // *InTx* operations that fail do not cancel transaction. // bool inTx() const noexcept { return inTx_; } [[nodiscard]] ssize_t startTx() noexcept { if (unlikely(inTx())) { return -EBUSY; } if (header_.template beginTransaction()) { return -EAGAIN; } inTx_ = true; TP_DCHECK_EQ(txSize_, 0); return 0; } [[nodiscard]] ssize_t commitTx() noexcept { if (unlikely(!inTx())) { return -EINVAL; } header_.template incMarker(txSize_); txSize_ = 0; inTx_ = false; header_.template endTransaction(); return 0; } [[nodiscard]] ssize_t cancelTx() noexcept { if (unlikely(!inTx())) { return -EINVAL; } txSize_ = 0; inTx_ = false; header_.template endTransaction(); return 0; } struct Buffer { uint8_t* ptr{nullptr}; size_t len{0}; }; // The first item is negative in case of error, otherwise it contains how many // elements of the array are valid (0, 1 or 2). 
The elements are ptr+len pairs // of contiguous areas of the ringbuffer that, chained together, represent a // slice of the requested size (or less if not enough data is available, and // AllowPartial is set to true). template [[nodiscard]] std::pair> accessContiguousInTx( size_t size) noexcept { std::array result; if (unlikely(!inTx())) { return {-EINVAL, result}; } if (unlikely(size == 0)) { return {0, result}; } const uint64_t tail = header_.template readMarker(); const uint64_t head = header_.template readMarker<(RoleIdx + 1) % NumRoles>() + (RoleIdx + 1 == NumRoles ? header_.kDataPoolByteSize : 0); TP_DCHECK_LE(head - tail, header_.kDataPoolByteSize); const size_t avail = head - tail - txSize_; TP_DCHECK_GE(avail, 0); if (!AllowPartial && avail < size) { return {-ENODATA, result}; } if (avail == 0) { return {0, result}; } size = std::min(size, avail); const uint64_t start = (tail + txSize_) & header_.kDataModMask; const uint64_t end = (start + size) & header_.kDataModMask; txSize_ += size; // end == 0 is the same as end == bufferSize, in which case it doesn't wrap. const bool wrap = (start >= end && end > 0); if (likely(!wrap)) { result[0] = {.ptr = data_ + start, .len = size}; return {1, result}; } else { result[0] = { .ptr = data_ + start, .len = header_.kDataPoolByteSize - start}; result[1] = {.ptr = data_, .len = end}; return {2, result}; } } // Increment our marker without doing anything, i.e., "skip" over the data. [[nodiscard]] ssize_t incMarkerInTx(size_t size) { // We could implement this from scratch but we'd rather re-use the logic // from accessContiguous as it's easy to get it wrong. ssize_t ret; std::array buffers; std::tie(ret, buffers) = accessContiguousInTx(size); return ret; } // Copy data from the ringbuffer into the provided buffer, up to the given // size (only copy less data if AllowPartial is set to true). 
template [[nodiscard]] ssize_t readInTx(void* buffer, const size_t size) noexcept { ssize_t numBuffers; std::array buffers; std::tie(numBuffers, buffers) = accessContiguousInTx(size); if (unlikely(numBuffers < 0)) { return numBuffers; } if (unlikely(numBuffers == 0)) { // Nothing to do. return 0; } else if (likely(numBuffers == 1)) { std::memcpy(buffer, buffers[0].ptr, buffers[0].len); return buffers[0].len; } else if (likely(numBuffers == 2)) { std::memcpy(buffer, buffers[0].ptr, buffers[0].len); std::memcpy( reinterpret_cast(buffer) + buffers[0].len, buffers[1].ptr, buffers[1].len); return buffers[0].len + buffers[1].len; } else { TP_THROW_ASSERT() << "Bad number of buffers: " << numBuffers; // Dummy return to make the compiler happy. return -EINVAL; } } // Copy data from the provided buffer into the ringbuffer, up to the given // size (only copy less data if AllowPartial is set to true). template [[nodiscard]] ssize_t writeInTx( const void* buffer, const size_t size) noexcept { ssize_t numBuffers; std::array buffers; std::tie(numBuffers, buffers) = accessContiguousInTx(size); if (unlikely(numBuffers < 0)) { return numBuffers; } if (unlikely(numBuffers == 0)) { // Nothing to do. return 0; } else if (likely(numBuffers == 1)) { std::memcpy(buffers[0].ptr, buffer, buffers[0].len); return buffers[0].len; } else if (likely(numBuffers == 2)) { std::memcpy(buffers[0].ptr, buffer, buffers[0].len); std::memcpy( buffers[1].ptr, reinterpret_cast(buffer) + buffers[0].len, buffers[1].len); return buffers[0].len + buffers[1].len; } else { TP_THROW_ASSERT() << "Bad number of buffers: " << numBuffers; // Dummy return to make the compiler happy. return -EINVAL; } } // // High-level atomic operations. // // Copy data from the ringbuffer into the provided buffer, exactly the given // size. Take care of opening and closing the transaction. 
[[nodiscard]] ssize_t read(void* buffer, const size_t size) noexcept { auto ret = startTx(); if (0 > ret) { return ret; } ret = readInTx(buffer, size); if (0 > ret) { auto r = cancelTx(); TP_DCHECK_EQ(r, 0); return ret; } TP_DCHECK_EQ(ret, size); ret = commitTx(); TP_DCHECK_EQ(ret, 0); return size; } // Copy data from the provided buffer into the ringbuffer, exactly the given // size. Take care of opening and closing the transaction. [[nodiscard]] ssize_t write(const void* buffer, size_t size) noexcept { auto ret = startTx(); if (0 > ret) { return ret; } ret = writeInTx(buffer, size); if (0 > ret) { auto r = cancelTx(); TP_DCHECK_EQ(r, 0); return ret; } TP_DCHECK_EQ(ret, size); ret = commitTx(); TP_DCHECK_EQ(ret, 0); return size; } private: RingBufferHeader& header_; uint8_t* const data_; unsigned txSize_ = 0; bool inTx_{false}; }; } // namespace tensorpipe ================================================ FILE: tensorpipe/common/shm_ringbuffer.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include namespace tensorpipe { /// Creates ringbuffer on shared memory. /// /// is the minimum size of the data section of the RingBuffer. 
/// template std::tuple> createShmRingBuffer(size_t minRbByteSize) { Error error; ShmSegment headerSegment; RingBufferHeader* header; std::tie(error, headerSegment, header) = ShmSegment::create>(minRbByteSize); if (error) { return std::make_tuple( std::move(error), ShmSegment(), ShmSegment(), RingBuffer()); } ShmSegment dataSegment; uint8_t* data; std::tie(error, dataSegment, data) = ShmSegment::create(header->kDataPoolByteSize); if (error) { return std::make_tuple( std::move(error), ShmSegment(), ShmSegment(), RingBuffer()); } // Note: cannot use implicit construction from initializer list on GCC 5.5: // "converting to XYZ from initializer list would use explicit constructor". return std::make_tuple( Error::kSuccess, std::move(headerSegment), std::move(dataSegment), RingBuffer(header, data)); } template std::tuple> loadShmRingBuffer(Fd headerFd, Fd dataFd) { Error error; ShmSegment headerSegment; RingBufferHeader* header; std::tie(error, headerSegment, header) = ShmSegment::load>(std::move(headerFd)); if (error) { return std::make_tuple( std::move(error), ShmSegment(), ShmSegment(), RingBuffer()); } constexpr auto kHeaderSize = sizeof(RingBufferHeader); if (unlikely(kHeaderSize != headerSegment.getSize())) { TP_THROW_SYSTEM(EPERM) << "Header segment of unexpected size"; } ShmSegment dataSegment; uint8_t* data; std::tie(error, dataSegment, data) = ShmSegment::load(std::move(dataFd)); if (error) { return std::make_tuple( std::move(error), ShmSegment(), ShmSegment(), RingBuffer()); } if (unlikely(header->kDataPoolByteSize != dataSegment.getSize())) { TP_THROW_SYSTEM(EPERM) << "Data segment of unexpected size"; } return std::make_tuple( Error::kSuccess, std::move(headerSegment), std::move(dataSegment), RingBuffer(header, data)); } } // namespace tensorpipe ================================================ FILE: tensorpipe/common/shm_segment.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace { // Our goal is to obtain a file descriptor that is backed by a region of memory. // (We need an fd so we can pass it over a UNIX domain socket). We support two // ways of doing so: // - The memfd_create syscall, which does exactly what we need. Unfortunately // it was added in a recent-ish kernel and an even more recent glibc version. // - As a fallback for older systems, we open a file in the /dev/shm directory, // which we expect to be a mountpoint of tmpfs type. We open it with O_TMPFILE // so it remains unnamed, which won't appear in the directory and can't thus // be opened by other processes and will be automatically cleaned up when we // exit. This method has some issues, as it depends on the availability of // /dev/shm and is capped to the size of that mountpoint (rather than the // total memory of the system), which are especially problematic in Docker. // FIXME O_TMPFILE is also not that old, and some users have reported issues due // to it. We could add a third method as a further fallback. // Name to give to the memfds. This is just displayed when inspecting the file // descriptor in /proc/self/fd to aid debugging, and doesn't have to be unique. constexpr const char* kMemfdName = "tensorpipe_shm"; std::tuple createMemfd() { // We don't want to use the ::memfd_create function directly as it's harder to // detect its availability (we'd need to perform a feature check in CMake and // inject the result as a preprocessor flag) and because it would cause us to // link against glibc 2.27. PyTorch aims to support the manylinux2014 platform // (one of the standard platforms defined by Python for PyPI/pip), which has // glibc 2.17. 
Thus instead we issue the syscall directly, skipping the glibc // wrapper. #ifdef SYS_memfd_create // We want to pass the MFD_CLOEXEC flag, but we can't rely on glibc exposing // it, thus we redefine its value if needed. #ifndef MFD_CLOEXEC // https://github.com/torvalds/linux/blob/master/include/uapi/linux/memfd.h #define MFD_CLOEXEC 0x0001U #endif int fd = static_cast(::syscall( SYS_memfd_create, static_cast(kMemfdName), static_cast(MFD_CLOEXEC))); if (fd < 0) { return std::make_tuple( TP_CREATE_ERROR(SystemError, "memfd_create", errno), Fd()); } return std::make_tuple(Error::kSuccess, Fd(fd)); #else // SYS_memfd_create return std::make_tuple( TP_CREATE_ERROR(SystemError, "memfd_create", ENOSYS), Fd()); #endif // SYS_memfd_create } // Default base path for all segments created. constexpr const char* kBasePath = "/dev/shm"; std::tuple openTmpfileInDevShm() { // Some users are compiling on old pre-3.11 kernels. We'd like our backends to // only depend on runtime capabilities, and not on compile-time ones, hence we // "polyfill" the flag so the build will pass and we'll get a runtime error. 
#ifndef O_TMPFILE // https://github.com/torvalds/linux/blob/master/include/uapi/asm-generic/fcntl.h #define O_TMPFILE (020000000 | 00200000) #endif int flags = O_TMPFILE | O_EXCL | O_RDWR | O_CLOEXEC; int fd = ::open(kBasePath, flags, 0); if (fd < 0) { return std::make_tuple(TP_CREATE_ERROR(SystemError, "open", errno), Fd()); } return std::make_tuple(Error::kSuccess, Fd(fd)); } std::tuple createShmFd() { Error error; Fd fd; std::tie(error, fd) = createMemfd(); if (error && error.isOfType() && error.castToType()->errorCode() == ENOSYS) { std::tie(error, fd) = openTmpfileInDevShm(); } return std::make_tuple(std::move(error), std::move(fd)); } std::tuple mmapShmFd(int fd, size_t byteSize) { int flags = MAP_SHARED; int prot = PROT_READ | PROT_WRITE; return MmappedPtr::create(byteSize, prot, flags, fd); } } // namespace ShmSegment::ShmSegment(Fd fd, MmappedPtr ptr) : fd_(std::move(fd)), ptr_(std::move(ptr)) {} std::tuple ShmSegment::alloc(size_t byteSize) { Error error; Fd fd; std::tie(error, fd) = createShmFd(); if (error) { return std::make_tuple(std::move(error), ShmSegment()); } // grow size to contain byte_size bytes. off_t len = static_cast(byteSize); int ret = ::fallocate(fd.fd(), 0, 0, len); if (ret < 0) { return std::make_tuple( TP_CREATE_ERROR(SystemError, "fallocate", errno), ShmSegment()); } MmappedPtr ptr; std::tie(error, ptr) = mmapShmFd(fd.fd(), byteSize); if (error) { return std::make_tuple(std::move(error), ShmSegment()); } return std::make_tuple( Error::kSuccess, ShmSegment(std::move(fd), std::move(ptr))); } std::tuple ShmSegment::access(Fd fd) { // Load whole file. Use fstat to obtain size. 
// ShmSegment::access() (continued): fstat(2) supplies the size of an
// already-created segment, which is then mapped in full via mmapShmFd().
// --- Begin header tensorpipe/common/shm_segment.h (its #include targets were
// stripped by the extraction) ---
// ShmSegment owns an Fd plus an MmappedPtr (both released by their own
// destructors). create<T>() allocates a segment of sizeof(T) bytes and
// placement-news a T into it; T must be trivially copyable since the bytes
// are shared across processes.
struct stat sb; int ret = ::fstat(fd.fd(), &sb); if (ret < 0) { return std::make_tuple( TP_CREATE_ERROR(SystemError, "fstat", errno), ShmSegment()); } size_t byteSize = static_cast(sb.st_size); Error error; MmappedPtr ptr; std::tie(error, ptr) = mmapShmFd(fd.fd(), byteSize); if (error) { return std::make_tuple(std::move(error), ShmSegment()); } return std::make_tuple( Error::kSuccess, ShmSegment(std::move(fd), std::move(ptr))); } } // namespace tensorpipe ================================================ FILE: tensorpipe/common/shm_segment.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include #include // // A C++17 version of shared memory segments handler inspired on boost // interprocess. // namespace tensorpipe { class ShmSegment { ShmSegment(Fd fd, MmappedPtr ptr); public: ShmSegment() = default; static std::tuple alloc(size_t byteSize); static std::tuple access(Fd fd); /// Allocate shared memory to contain an object of type T and construct it. /// /// The Segment object owns the memory and frees it when destructed. /// The raw pointer to the object provides a view into the Segment but doesn't /// own it and may thus become invalid if the Segment isn't kept alive. template < typename T, typename... Args, std::enable_if_t::value, int> = 0> static std::tuple create(Args&&... args) { static_assert( std::is_trivially_copyable::value, "Shared memory segments are restricted to only store objects that " "are trivially copyable (i.e.
// create<T>() (continued): placement-new cannot legally return an address
// other than the one it was given, so the TP_THROW_SYSTEM_IF is a hard
// sanity check rather than an expected failure path.
// NOTE(review): the static_assert message literal is missing its closing ")"
// and the throw message misspells "aligment" — cosmetic; cannot be touched in
// a comments-only pass since both are runtime strings.
// The array overload value-initializes TScalar[numElements] in place; only
// one-dimensional unbounded arrays (T = U[]) are accepted, enforced by the
// std::is_same check on T vs TScalar[].
no pointers and no heap allocation"); const auto byteSize = sizeof(T); Error error; ShmSegment segment; std::tie(error, segment) = ShmSegment::alloc(byteSize); if (error) { return std::make_tuple(std::move(error), ShmSegment(), nullptr); } TP_DCHECK_EQ(segment.getSize(), byteSize); // Initialize in place. Forward T's constructor arguments. T* ptr = new (segment.getPtr()) T(std::forward(args)...); TP_THROW_SYSTEM_IF(ptr != segment.getPtr(), EPERM) << "new's address cannot be different from segment.getPtr() " << "address. Some aligment assumption was incorrect"; return std::make_tuple(Error::kSuccess, std::move(segment), ptr); } /// One-dimensional array version of create. // XXX: Fuse all versions of create. template < typename T, std::enable_if_t::value, int> = 0, typename TScalar = typename std::remove_all_extents::type> static std::tuple create(size_t numElements) { static_assert( std::is_same::value, "Only one-dimensional unbounded arrays are supported"); static_assert( std::is_trivially_copyable::value, "Shared memory segments are restricted to only store objects that " "are trivially copyable (i.e. no pointers and no heap allocation"); size_t byteSize = sizeof(TScalar) * numElements; Error error; ShmSegment segment; std::tie(error, segment) = ShmSegment::alloc(byteSize); if (error) { return std::make_tuple(std::move(error), ShmSegment(), nullptr); } TP_DCHECK_EQ(segment.getSize(), byteSize); // Initialize in place. TScalar* ptr = new (segment.getPtr()) TScalar[numElements](); TP_THROW_SYSTEM_IF(ptr != segment.getPtr(), EPERM) << "new's address cannot be different from segment.getPtr() " << "address. Some aligment assumption was incorrect"; return std::make_tuple(Error::kSuccess, std::move(segment), ptr); } /// Load an existing shared memory region that already holds an object of type /// T, where T is NOT an array type.
// load<T>() (non-array): maps an existing segment via access() and validates
// only that the file size equals sizeof(T); the error message itself warns
// that a size mismatch can come from a create/load race, i.e. loading before
// the creator finished sizing the file.
template ::value, int> = 0> static std::tuple load(Fd fd) { static_assert( std::is_trivially_copyable::value, "Shared memory segments are restricted to only store objects that " "are trivially copyable (i.e. no pointers and no heap allocation"); Error error; ShmSegment segment; std::tie(error, segment) = ShmSegment::access(std::move(fd)); if (error) { return std::make_tuple(std::move(error), ShmSegment(), nullptr); } const size_t size = segment.getSize(); // XXX: Do some checking other than the size that we are loading // the right type. TP_THROW_SYSTEM_IF(size != sizeof(T), EPERM) << "Shared memory file has unexpected size. " << "Got: " << size << " bytes, expected: " << sizeof(T) << ". " << "If there is a race between creation and loading of segments, " << "consider linking segment after it has been fully initialized."; auto ptr = static_cast(segment.getPtr()); return std::make_tuple(Error::kSuccess, std::move(segment), ptr); } /// Load an existing shared memory region that already holds an object of type /// T, where T is an array type. template < typename T, std::enable_if_t::value, int> = 0, typename TScalar = typename std::remove_all_extents::type> static std::tuple load(Fd fd) { static_assert( std::is_same::value, "Only one-dimensional unbounded arrays are supported"); static_assert( std::is_trivially_copyable::value, "Shared memory segments are restricted to only store objects that " "are trivially copyable (i.e.
// load<T[]>() (continued). NOTE(review): unlike the scalar overload, this
// array overload performs no size validation at all (not even that the mapped
// length is a multiple of sizeof(TScalar)) — verify this is intentional.
// Accessors: getFd() exposes the raw descriptor, getPtr() the mapped base,
// getSize() the mapped length (from MmappedPtr). The "mmmap'ed" typo is in
// the original member comment.
// --- Begin tensorpipe/common/socket.cc (includes stripped by extraction) ---
// SOCK_NONBLOCK is #define'd to 0 on platforms lacking it (per the comment
// below, macOS), turning it into a no-op flag in the socket(2) call.
no pointers and no heap allocation"); Error error; ShmSegment segment; std::tie(error, segment) = ShmSegment::access(std::move(fd)); if (error) { return std::make_tuple(std::move(error), ShmSegment(), nullptr); } auto ptr = static_cast(segment.getPtr()); return std::make_tuple(Error::kSuccess, std::move(segment), ptr); } int getFd() const { return fd_.fd(); } void* getPtr() { return ptr_.ptr(); } const void* getPtr() const { return ptr_.ptr(); } size_t getSize() const { return ptr_.getLength(); } private: // The file descriptor of the shared memory file. Fd fd_; // Base pointer of mmmap'ed shared memory segment. MmappedPtr ptr_; }; } // namespace tensorpipe ================================================ FILE: tensorpipe/common/socket.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include #include #ifndef SOCK_NONBLOCK #define SOCK_NONBLOCK 0 #endif // SOCK_NONBLOCK namespace tensorpipe { std::tuple Socket::createForFamily(sa_family_t aiFamily) { auto rv = socket(aiFamily, SOCK_STREAM | SOCK_NONBLOCK, 0); if (rv == -1) { return std::make_tuple( TP_CREATE_ERROR(SystemError, "socket", errno), Socket()); } Socket sock(rv); #ifndef SOCK_NONBLOCK // The SOCK_NONBLOCK option of socket() is Linux-only. On OSX, we need to // manually set the socket to non-blocking after its creation.
// createForFamily() (continued): the manual non-blocking fallback is guarded
// by "#ifndef SOCK_NONBLOCK". NOTE(review): SOCK_NONBLOCK was unconditionally
// #define'd (possibly to 0) just above, so as written this branch can never
// be compiled — and "sock->block" on a non-pointer Socket also looks wrong
// (likely an extraction artifact or latent bug); verify against upstream.
// block(on): read-modify-write of the file status flags via fcntl(2)
// F_GETFL/F_SETFL. Note the inversion: block(false) SETS O_NONBLOCK,
// block(true) clears it.
// reuseAddr()/bind()/listen(): thin errno-to-Error wrappers over the
// corresponding syscalls.
// accept(): retries on EINTR; any other failure is surfaced as an Error.
// connect(): retries on EINTR and treats EINPROGRESS as success — the
// expected outcome when connecting a non-blocking socket.
auto err = sock->block(false); if (err) { return std::make_tuple(err, Socket()); } #endif // SOCK_NONBLOCK return std::make_tuple(Error::kSuccess, std::move(sock)); } Error Socket::block(bool on) { int rv; rv = fcntl(fd_, F_GETFL); if (rv == -1) { return TP_CREATE_ERROR(SystemError, "fcntl", errno); } if (!on) { // Set O_NONBLOCK rv |= O_NONBLOCK; } else { // Clear O_NONBLOCK rv &= ~O_NONBLOCK; } rv = fcntl(fd_, F_SETFL, rv); if (rv == -1) { return TP_CREATE_ERROR(SystemError, "fcntl", errno); } return Error::kSuccess; } Error Socket::reuseAddr(bool on) { int onInt = on ? 1 : 0; auto rv = setsockopt(fd_, SOL_SOCKET, SO_REUSEADDR, &onInt, sizeof(onInt)); if (rv == -1) { return TP_CREATE_ERROR(SystemError, "setsockopt", errno); } return Error::kSuccess; } Error Socket::bind(const Sockaddr& addr) { auto rv = ::bind(fd_, addr.addr(), addr.addrlen()); if (rv == -1) { return TP_CREATE_ERROR(SystemError, "bind", errno); } return Error::kSuccess; } Error Socket::listen(int backlog) { auto rv = ::listen(fd_, backlog); if (rv == -1) { return TP_CREATE_ERROR(SystemError, "listen", errno); } return Error::kSuccess; } std::tuple Socket::accept() { struct sockaddr_storage addr; socklen_t addrlen = sizeof(addr); int rv = -1; for (;;) { rv = ::accept(fd_, (struct sockaddr*)&addr, &addrlen); if (rv == -1) { if (errno == EINTR) { continue; } return std::make_tuple( TP_CREATE_ERROR(SystemError, "accept", errno), Socket()); } break; } return std::make_tuple(Error::kSuccess, Socket(rv)); } Error Socket::connect(const Sockaddr& addr) { for (;;) { auto rv = ::connect(fd_, addr.addr(), addr.addrlen()); if (rv == -1) { if (errno == EINTR) { continue; } if (errno != EINPROGRESS) { return TP_CREATE_ERROR(SystemError, "connect", errno); } } break; } return Error::kSuccess; } std::tuple Socket::getSockName() const { struct sockaddr_storage addr; socklen_t addrlen = sizeof(addr); int rv = ::getsockname(fd_, reinterpret_cast(&addr), &addrlen); if (rv < 0) { return std::make_tuple(
// getSockName() (continued): returns the local address as a sockaddr_storage
// plus its actual length.
// --- Begin tensorpipe/common/socket.h (includes stripped by extraction) ---
// NOTE(review): the saveOneFdToArray/loadFdsFromArray helpers live in an
// anonymous namespace inside a header, giving every translation unit its own
// copy — ODR-safe but against common guidance (consider `inline` instead).
// These helpers marshal a parameter pack of int/Fd descriptors into/out of a
// raw int array, using the brace-init-list "dummy" trick for pack expansion.
// sendToSocket(): sends two fixed-size payload values plus any number of file
// descriptors over a Unix socket as an SCM_RIGHTS ancillary message.
TP_CREATE_ERROR(SystemError, "getsockname", errno), addr, addrlen); } return std::make_tuple(Error::kSuccess, addr, addrlen); } } // namespace tensorpipe ================================================ FILE: tensorpipe/common/socket.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace { void saveOneFdToArray(int& dst, const int& src) { dst = src; } void saveOneFdToArray(int& dst, const Fd& src) { dst = src.fd(); } template void saveFdsToArray( int* array, std::index_sequence /*unused*/, const Fds&... fds) { // This is a trick to do pack expansion of the function call. auto dummy = {(saveOneFdToArray(array[Idxs], fds), 0)...}; } void loadOneFdFromArray(int& src, int& dst) { dst = src; } void loadOneFdFromArray(int& src, Fd& dst) { dst = Fd(src); } template void loadFdsFromArray( int* array, std::index_sequence /*unused*/, Fds&... fds) { // This is a trick to do pack expansion of the function call. auto dummy = {(loadOneFdFromArray(array[Idxs], fds), 0)...}; } } // namespace template [[nodiscard]] Error sendToSocket( int socketFd, const T& t1, const T& t2, const Fds&... fds) { using TPayload = int; // Build message. struct msghdr msg; msg.msg_name = nullptr; msg.msg_namelen = 0; msg.msg_flags = 0; // Build iov to write Ts. std::array tbuf = {t1, t2}; struct iovec iov; iov.iov_base = tbuf.data(); iov.iov_len = sizeof(tbuf); msg.msg_iov = &iov; msg.msg_iovlen = sizeof(iov) / sizeof(iovec); // Build control message.
// sendToSocket() (continued): the descriptors are packed as ints into
// CMSG_DATA of a single SOL_SOCKET/SCM_RIGHTS control message. sendmsg(2) is
// retried on EINTR; a partial write is surfaced as ShortWriteError rather
// than resumed (the payload is two small fixed-size values, so a short write
// is treated as exceptional).
// NOTE(review): "rv != iov.iov_len" compares ssize_t against size_t — benign
// here because rv >= 0 past the -1 check, but it will trip -Wsign-compare.
// recvFromSocket(): mirror image — recvmsg(2) with EINTR retry, short reads
// surfaced as ShortReadError, then the two payload values are copied out of
// the stack buffer.
std::array buf; msg.msg_control = buf.data(); msg.msg_controllen = buf.size(); struct cmsghdr* cmsg; cmsg = CMSG_FIRSTHDR(&msg); cmsg->cmsg_level = SOL_SOCKET; cmsg->cmsg_type = SCM_RIGHTS; cmsg->cmsg_len = CMSG_LEN(sizeof(TPayload) * sizeof...(Fds)); auto payload = reinterpret_cast(CMSG_DATA(cmsg)); saveFdsToArray(payload, std::index_sequence_for{}, fds...); // Send message. for (;;) { auto rv = ::sendmsg(socketFd, &msg, 0); if (rv == -1) { if (errno == EINTR) { continue; } return TP_CREATE_ERROR(SystemError, "sendmsg", errno); } if (rv != iov.iov_len) { return TP_CREATE_ERROR(ShortWriteError, iov.iov_len, rv); } break; } return Error::kSuccess; } template [[nodiscard]] Error sendFdsToSocket(int socketFd, const Fds&... fds) { char dummy = 0; return sendToSocket(socketFd, dummy, dummy, fds...); } template [[nodiscard]] Error recvFromSocket(int socketFd, T& t1, T& t2, Fds&... fds) { using TPayload = int; // Build message. struct msghdr msg; msg.msg_name = nullptr; msg.msg_namelen = 0; msg.msg_flags = 0; // Build iov to read Ts. std::array tbuf; struct iovec iov; iov.iov_base = tbuf.data(); iov.iov_len = sizeof(tbuf); msg.msg_iov = &iov; msg.msg_iovlen = sizeof(iov) / sizeof(iovec); // Build control message. std::array buf; msg.msg_control = buf.data(); msg.msg_controllen = buf.size(); // Receive message. for (;;) { auto rv = ::recvmsg(socketFd, &msg, 0); if (rv == -1) { if (errno == EINTR) { continue; } return TP_CREATE_ERROR(SystemError, "recvmsg", errno); } if (rv != iov.iov_len) { return TP_CREATE_ERROR(ShortReadError, iov.iov_len, rv); } break; } t1 = tbuf[0]; t2 = tbuf[1]; // Read control message.
// recvFromSocket() (continued): the received SCM_RIGHTS control message is
// validated with debug-only TP_DCHECK_* assertions (level, type, length)
// before the descriptors are unpacked into the caller's int/Fd out-params.
// Sockaddr: abstract interface over concrete sockaddr wrappers (addr/addrlen).
// Socket: RAII socket handle (fd ownership inherited from Fd); every
// operation returns an Error value instead of throwing, and [[nodiscard]]
// forces callers to inspect it. sendPayloadAndFds/recvPayloadAndFds are
// SFINAE-restricted on T (the enable_if condition's angle-bracket contents
// were stripped by the extraction).
struct cmsghdr* cmsg; cmsg = CMSG_FIRSTHDR(&msg); TP_DCHECK_NE(cmsg, static_cast(nullptr)); TP_DCHECK_EQ(cmsg->cmsg_level, SOL_SOCKET); TP_DCHECK_EQ(cmsg->cmsg_type, SCM_RIGHTS); TP_DCHECK_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(TPayload) * sizeof...(Fds))); auto payload = reinterpret_cast(CMSG_DATA(cmsg)); loadFdsFromArray(payload, std::index_sequence_for{}, fds...); return Error::kSuccess; } template [[nodiscard]] Error recvFdsFromSocket(int socketFd, Fds&... fds) { char dummy = 0; return recvFromSocket(socketFd, dummy, dummy, fds...); } class Sockaddr { public: virtual const struct sockaddr* addr() const = 0; virtual socklen_t addrlen() const = 0; virtual ~Sockaddr() = default; }; class Socket final : public Fd { public: [[nodiscard]] static std::tuple createForFamily( sa_family_t aiFamily); Socket() = default; explicit Socket(int fd) : Fd(fd) {} // Configure if the socket is blocking or not. [[nodiscard]] Error block(bool on); // Set (or unset) the SO_REUSEADDR option on the socket. [[nodiscard]] Error reuseAddr(bool on); // Bind socket to address. [[nodiscard]] Error bind(const Sockaddr& addr); // Listen on socket. [[nodiscard]] Error listen(int backlog); // Accept new socket connecting to listening socket. [[nodiscard]] std::tuple accept(); // Connect to address. [[nodiscard]] Error connect(const Sockaddr& addr); [[nodiscard]] std::tuple getSockName() const; // Send file descriptor. template [[nodiscard]] Error sendFds(const Fds&... fds) { return sendFdsToSocket(fd_, fds...); } // Receive file descriptor. template [[nodiscard]] Error recvFds(Fds&... fds) { return recvFdsFromSocket(fd_, fds...); } // Send object and file descriptor. template < typename T, typename... Fds, typename std::enable_if::value, bool>:: type = false> [[nodiscard]] Error sendPayloadAndFds( const T& t1, const T& t2, const Fds&... fds) { return sendToSocket(fd_, t1, t2, fds...); } // Receive object and file descriptor. template < typename T, typename...
// Socket::recvPayloadAndFds (continued) and end of socket.h.
// --- Begin tensorpipe/common/state_machine.h ---
// OpsStateMachine tracks in-flight operations (TOp) for an owning subject
// (TSubject) in a deque ordered by monotonically increasing sequence numbers.
// Iter is an opaque handle wrapping a raw TOp*; its constructor is private so
// only the state machine can mint one.
// emplaceBack(): appends a new op and stamps its sequence number.
// advanceOperation(): after one op advances, keeps trying the following
// sequence numbers, because completing an op can unblock successors that were
// previously forbidden from overtaking it (see advanceOneOperation below).
Fds, typename std::enable_if::value, bool>:: type = false> [[nodiscard]] Error recvPayloadAndFds(T& t1, T& t2, Fds&... fds) { return recvFromSocket(fd_, t1, t2, fds...); } }; } // namespace tensorpipe ================================================ FILE: tensorpipe/common/state_machine.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include namespace tensorpipe { template class OpsStateMachine { public: class Iter { public: TOp& operator*() const { return *opPtr_; } TOp* operator->() const { return opPtr_; } private: explicit Iter(TOp* opPtr) : opPtr_(opPtr) {} TOp* opPtr_{nullptr}; friend OpsStateMachine; }; using Transitioner = void (TSubject::*)(Iter, typename TOp::State); OpsStateMachine(TSubject& subject, Transitioner transitioner) : subject_(subject), transitioner_(transitioner) {} template Iter emplaceBack(uint64_t sequenceNumber, TArgs&&... args) { ops_.emplace_back(std::forward(args)...); TOp& op = ops_.back(); op.sequenceNumber = sequenceNumber; return Iter(&op); } void advanceOperation(Iter initialOpIter) { // Advancing one operation may unblock later ones that could have progressed // but were prevented from overtaking. Thus each time an operation manages // to advance we'll try to also advance the one after. for (int64_t sequenceNumber = initialOpIter->sequenceNumber;; ++sequenceNumber) { TOp* opPtr = findOperation(sequenceNumber); if (opPtr == nullptr || opPtr->state == TOp::FINISHED || !advanceOneOperation(*opPtr)) { break; } } } void advanceAllOperations() { // We cannot just iterate over the operations here as advanceOneOperation // could potentially erase some of them, thus invalidating references and/or // iterators.
// advanceAllOperations() (continued): walks ops by sequence number rather
// than by iterator, since advanceOneOperation() may pop elements from the
// deque and invalidate iterators (per the comment above).
// attemptTransition(): guarded state change — runs the given member-function
// actions on the subject and moves the op from 'from' to 'to' only when the
// op is currently in 'from' and cond holds.
// findOperation(): O(1) lookup exploiting that ops_ is contiguous and ordered
// by consecutive sequence numbers: index = seqNum - front().seqNum, verified
// by the TP_DCHECK_EQ. NOTE(review): "offset >= ops_.size()" mixes int64_t
// with size_t — safe given the preceding offset < 0 check, but will warn.
// advanceOneOperation(): an op may never reach a more advanced state than its
// predecessor (no overtaking); the predecessor's state — or FINISHED when
// there is none — is handed to the subject's transitioner, which performs the
// actual attemptTransition() calls.
if (ops_.empty()) { return; } for (int64_t sequenceNumber = ops_.front().sequenceNumber;; ++sequenceNumber) { TOp* opPtr = findOperation(sequenceNumber); if (opPtr == nullptr) { break; } advanceOneOperation(*opPtr); } } void attemptTransition( Iter opIter, typename TOp::State from, typename TOp::State to, bool cond, std::initializer_list actions) { if (opIter->state == from && cond) { for (const auto& action : actions) { (subject_.*action)(opIter); } opIter->state = to; } } private: TOp* findOperation(int64_t sequenceNumber) { if (ops_.empty()) { return nullptr; } int64_t offset = sequenceNumber - ops_.front().sequenceNumber; if (offset < 0 || offset >= ops_.size()) { return nullptr; } TOp& op = ops_[offset]; TP_DCHECK_EQ(op.sequenceNumber, sequenceNumber); return &op; } bool advanceOneOperation(TOp& op) { // Due to the check in attemptTransition, each time that an operation // advances its state we must check whether this unblocks some later // operations that could progress but weren't allowed to overtake. In order // to detect whether this operation is advancing we store its state at the // beginning and then compare it with the state at the end. typename TOp::State initialState = op.state; // The operations must advance in order: later operations cannot "overtake" // earlier ones. Thus if this operation would reach a more advanced state // than previous operation we won't perform the transition. TOp* prevOpPtr = findOperation(op.sequenceNumber - 1); typename TOp::State prevOpState = prevOpPtr != nullptr ? prevOpPtr->state : TOp::FINISHED; (subject_.*transitioner_)(Iter(&op), prevOpState); // Compute return value now in case we next delete the operation. bool hasAdvanced = op.state != initialState; if (op.state == TOp::FINISHED) { // We can't remove the op if it's "in the middle". And, therefore, once we // remove the op at the front, we must check if other ops now also get // "unblocked". In other words, we always remove as much as we can from // the front.
// advanceOneOperation() (continued): FINISHED ops are reclaimed only from the
// front of the deque, preserving the "consecutive sequence numbers" invariant
// that findOperation()'s index arithmetic relies on.
// --- Begin tensorpipe/common/stream_read_write_ops.h ---
// StreamReadOperation: state for reading one length-prefixed chunk from a
// stream; per the original comment, the caller either pre-allocates (length
// known up front) or lets the op heap-allocate once the word-sized length
// header has been read.
while (!ops_.empty() && ops_.front().state == TOp::FINISHED) { ops_.pop_front(); } } return hasAdvanced; } TSubject& subject_; const Transitioner transitioner_; std::deque ops_; }; } // namespace tensorpipe ================================================ FILE: tensorpipe/common/stream_read_write_ops.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include #include namespace tensorpipe { // The read operation captures all state associated with reading a // fixed length chunk of data from the underlying connection. All // reads are required to include a word-sized header containing the // number of bytes in the operation. This makes it possible for the // read side of the connection to either 1) not know how many bytes // to expected, and dynamically allocate, or 2) know how many bytes // to expect, and preallocate the destination memory. class StreamReadOperation { enum Mode { READ_LENGTH, READ_PAYLOAD, COMPLETE, }; public: using read_callback_fn = std::function; explicit inline StreamReadOperation(read_callback_fn fn); inline StreamReadOperation(void* ptr, size_t length, read_callback_fn fn); // Called when a buffer is needed to read data from stream. inline void allocFromLoop(char** base, size_t* len); // Called when data has been read from stream. inline void readFromLoop(size_t nread); // Returns if this read operation is complete. inline bool completeFromLoop() const; // Invoke user callback. inline void callbackFromLoop(const Error& error); private: Mode mode_{READ_LENGTH}; char* ptr_{nullptr}; // Number of bytes as specified by the user (if applicable). optional givenLength_; // Number of bytes to expect as read from the connection.
// Inline definitions: allocFromLoop() hands the event loop the next
// (base, len) window to fill — first the bytes of readLength_ itself (the
// header is read directly into the member), then the payload buffer.
// readFromLoop() drives READ_LENGTH -> READ_PAYLOAD -> COMPLETE: once the
// header is fully read, a pre-supplied length is cross-checked against it
// (DCHECK), or a temporary buffer of readLength_ bytes is allocated and owned
// by the op; a zero-length payload jumps straight to COMPLETE; bytesRead_ is
// reset to 0 when entering READ_PAYLOAD.
// callbackFromLoop() reports (error, ptr, length) to the user callback.
size_t readLength_{0}; // Number of bytes read from the connection. // This is reset to 0 when we advance from READ_LENGTH to READ_PAYLOAD. size_t bytesRead_{0}; // Holds temporary allocation if no length was specified. std::unique_ptr buffer_{nullptr}; // User callback. read_callback_fn fn_; }; StreamReadOperation::StreamReadOperation(read_callback_fn fn) : fn_(std::move(fn)) {} StreamReadOperation::StreamReadOperation( void* ptr, size_t length, read_callback_fn fn) : ptr_(static_cast(ptr)), givenLength_(length), fn_(std::move(fn)) {} void StreamReadOperation::allocFromLoop(char** base, size_t* len) { if (mode_ == READ_LENGTH) { TP_DCHECK_LT(bytesRead_, sizeof(readLength_)); *base = reinterpret_cast(&readLength_) + bytesRead_; *len = sizeof(readLength_) - bytesRead_; } else if (mode_ == READ_PAYLOAD) { TP_DCHECK_LT(bytesRead_, readLength_); TP_DCHECK(ptr_ != nullptr); *base = ptr_ + bytesRead_; *len = readLength_ - bytesRead_; } else { TP_THROW_ASSERT() << "invalid mode " << mode_; } } void StreamReadOperation::readFromLoop(size_t nread) { bytesRead_ += nread; if (mode_ == READ_LENGTH) { TP_DCHECK_LE(bytesRead_, sizeof(readLength_)); if (bytesRead_ == sizeof(readLength_)) { if (givenLength_.has_value()) { TP_DCHECK(ptr_ != nullptr || givenLength_.value() == 0); TP_DCHECK_EQ(readLength_, givenLength_.value()); } else { TP_DCHECK(ptr_ == nullptr); buffer_ = std::make_unique(readLength_); ptr_ = buffer_.get(); } if (readLength_ == 0) { mode_ = COMPLETE; } else { mode_ = READ_PAYLOAD; } bytesRead_ = 0; } } else if (mode_ == READ_PAYLOAD) { TP_DCHECK_LE(bytesRead_, readLength_); if (bytesRead_ == readLength_) { mode_ = COMPLETE; } } else { TP_THROW_ASSERT() << "invalid mode " << mode_; } } bool StreamReadOperation::completeFromLoop() const { return mode_ == COMPLETE; } void StreamReadOperation::callbackFromLoop(const Error& error) { fn_(error, ptr_, readLength_); } // The write operation captures all state associated with writing a // fixed length chunk of data from
// StreamWriteOperation: writes a word-sized length header followed by the
// payload. The header buffer points at the length_ member itself, so (per the
// original comment) the instance must stay alive until the write callback
// fires. getBufs() returns one buffer when length_ == 0 (header only), else
// two (header + payload). The const_casts only strip const for the iovec-like
// Buf struct; the data is never modified through them.
the underlying connection. The // write includes a word-sized header containing the length of the // write. This header is a member field on this class and therefore // the instance must be kept alive and the reference to the instance // must remain valid until the write callback has been called. class StreamWriteOperation { public: using write_callback_fn = std::function; inline StreamWriteOperation( const void* ptr, size_t length, write_callback_fn fn); struct Buf { char* base; size_t len; }; inline std::tuple getBufs(); // Invoke user callback. inline void callbackFromLoop(const Error& error); private: const char* ptr_; const size_t length_; // Buffers (structs with pointers and lengths) to write to stream. std::array bufs_; // User callback. write_callback_fn fn_; }; StreamWriteOperation::StreamWriteOperation( const void* ptr, size_t length, write_callback_fn fn) : ptr_(static_cast(ptr)), length_(length), fn_(std::move(fn)) { bufs_[0].base = const_cast(reinterpret_cast(&length_)); bufs_[0].len = sizeof(length_); bufs_[1].base = const_cast(ptr_); bufs_[1].len = length_; } std::tuple StreamWriteOperation::getBufs() { size_t numBuffers = length_ == 0 ? 1 : 2; return std::make_tuple(bufs_.data(), numBuffers); } void StreamWriteOperation::callbackFromLoop(const Error& error) { fn_(error); } } // namespace tensorpipe ================================================ FILE: tensorpipe/common/strings.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree.
// --- tensorpipe/common/strings.h ---
// joinStrs(): ", "-joins a vector of strings via an ostringstream.
// formatMatrix(): renders a 2-D vector as "{{a, b}, {c, d}}".
// isValidUuid(): strict positional check of the 8-4-4-4-12 lowercase-hex UUID
// layout (dashes at indices 8/13/18/23, hex digits elsewhere).
// NOTE(review): its loop index is a signed int compared against uuid.size()
// (size_t) — harmless at 36 chars but will trip -Wsign-compare.
// --- Begin tensorpipe/common/system.cc (includes stripped by extraction);
// the trailing comment explains the hand-rolled capget declaration. ---
*/ #pragma once #include #include #include namespace tensorpipe { inline std::string joinStrs(const std::vector& strs) { if (strs.empty()) { return ""; } std::ostringstream oss; oss << strs[0]; for (size_t idx = 1; idx < strs.size(); idx++) { oss << ", " << strs[idx]; } return oss.str(); } template std::string formatMatrix(const std::vector>& matrix) { std::ostringstream oss; oss << "{"; for (size_t rowIdx = 0; rowIdx < matrix.size(); rowIdx++) { if (rowIdx > 0) { oss << ", "; } oss << "{"; for (size_t colIdx = 0; colIdx < matrix[rowIdx].size(); colIdx++) { if (colIdx > 0) { oss << ", "; } oss << matrix[rowIdx][colIdx]; } oss << "}"; } oss << "}"; return oss.str(); } // Since text manipulation is hard, let's use this to double-check our results. inline bool isValidUuid(const std::string& uuid) { // Check it's in this format: // aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee // |0 |5 |10 |15 |20 |25 |30 |35 if (uuid.size() != 36) { return false; } for (int i = 0; i < uuid.size(); i++) { if (i == 8 || i == 13 || i == 18 || i == 23) { if (uuid[i] != '-') { return false; } } else { if (!((uuid[i] >= '0' && uuid[i] <= '9') || (uuid[i] >= 'a' && uuid[i] <= 'f'))) { return false; } } } return true; } } // namespace tensorpipe ================================================ FILE: tensorpipe/common/system.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #ifdef __linux__ #include #include #include #include #include #endif #ifdef __APPLE__ #include #endif #include #include #include #include #include #include #include #include #ifdef __linux__ // This is a libc wrapper for the Linux syscall. // I'm not sure why we need to declare it ourselves, but that's what libcap // does too, and I couldn't find any libc header in which it's declared.
// capget(2) is declared by hand (rationale in the surrounding comment): it
// avoids a libcap dependency and libcap's compile-time-frozen capability
// list, at the cost of using a discouraged raw interface.
// getBootIDInternal(): macOS — queries the IOKit platform UUID from the
// registry root; Linux — reads the first line of
// /proc/sys/kernel/random/boot_id (nullopt when unreadable).
// getPathForLinuxNamespace(): maps the LinuxNamespace enum to
// /proc/self/ns/{ipc,net,pid,user}; asserts on unknown values.
// Direct use of the syscall is strongly discouraged, in favor of libcap (which // has a more friendly API and better backwards-compatibility). However we // really don't want to add a dependency, and moreover libcap introduces an // artificial limitation that only allows us to query the capabilities that were // defined by the kernel headers when libcap was built, meaning we might miss // some (new) capabilities if the kernel was updated in the meantime. extern "C" { extern int capget(cap_user_header_t header, const cap_user_data_t data); } #endif namespace tensorpipe { namespace { #ifdef __APPLE__ optional getBootIDInternal() { std::array buf; // See https://developer.apple.com/documentation/iokit/iokitlib_h for IOKitLib // API documentation. io_registry_entry_t ioRegistryRoot = IORegistryEntryFromPath(kIOMainPortDefault, "IOService:/"); CFStringRef uuidCf = (CFStringRef)IORegistryEntryCreateCFProperty( ioRegistryRoot, CFSTR(kIOPlatformUUIDKey), kCFAllocatorDefault, 0); IOObjectRelease(ioRegistryRoot); CFStringGetCString(uuidCf, buf.data(), buf.size(), kCFStringEncodingMacRoman); CFRelease(uuidCf); return std::string(buf.data()); } #elif defined(__linux__) optional getBootIDInternal() { std::ifstream f{"/proc/sys/kernel/random/boot_id"}; if (!f.is_open()) { return nullopt; } std::string v; getline(f, v); f.close(); return v; } // See namespaces(7). std::string getPathForLinuxNamespace(LinuxNamespace ns) { std::ostringstream oss; oss << "/proc/self/ns/"; switch (ns) { case LinuxNamespace::kIpc: oss << "ipc"; break; case LinuxNamespace::kNet: oss << "net"; break; case LinuxNamespace::kPid: oss << "pid"; break; case LinuxNamespace::kUser: oss << "user"; break; default: TP_THROW_ASSERT() << "Unknown namespace"; } return oss.str(); } #endif } // namespace std::string tstampToStr(TimeStamp ts) { if (ts == kInvalidTimeStamp) { return "NA"; } // print timestaps in microseconds.
// tstampToStr() (continued): prints ts/1000 with a 3-digit ts%1000 fraction
// and a "us" suffix — presumably a nanosecond TimeStamp rendered as
// microseconds; confirm against TimeStamp's definition (not visible here).
// getProcFsStr(): first line of /proc/<tid>/<fileName>, nullopt if the file
// can't be opened. removeBlankSpaces(): erase-remove of all whitespace.
// getBootID(): caches the boot id in a function-local static (computed once,
// thread-safe init).
// getLinuxNamespaceId(): on macOS all processes are treated as sharing one
// namespace, so a constant empty-string id is returned (per the comment); the
// Linux version fingerprints the /proc/self/ns/* link target, continued on
// the next line.
constexpr TimeStamp kDiv = 1000u; std::stringstream ss; ss << std::setw(9) << std::setfill(' ') << ts / kDiv; ss << "." << std::setw(3) << std::setfill('0') << ts % kDiv << "us"; return ss.str(); } optional getProcFsStr(const std::string& fileName, pid_t tid) { std::ostringstream oss; oss << "/proc/" << tid << "/" << fileName; std::ifstream f{oss.str()}; if (!f.is_open()) { return nullopt; } std::string v; getline(f, v); f.close(); return v; } std::string removeBlankSpaces(std::string s) { // Remove blanks. s.erase( std::remove_if( s.begin(), s.end(), [](unsigned char c) { return std::isspace(c); }), s.end()); return s; } optional getBootID() { static optional bootID = getBootIDInternal(); return bootID; } #ifdef __APPLE__ // OSX is a UNIX, so often we'd like some of our Linux backends to work there // too, but its lack of support for namespaces poses issues. However, that's // like saying that in OSX all processes are in the same namespace with respect // to all resources, so we pretend namespaces are supported, with a constant ID. optional getLinuxNamespaceId(LinuxNamespace ns) { return std::string(); } #elif defined(__linux__) // According to namespaces(7): // > Each process has a /proc/[pid]/ns/ subdirectory containing one entry for // > each namespace [...]. If two processes are in the same namespace, then the // > device IDs and inode numbers of their /proc/[pid]/ns/xxx symbolic links // > will be the same; an application can check this using the stat.st_dev and // > stat.st_ino fields returned by stat(2). optional getLinuxNamespaceId(LinuxNamespace ns) { struct stat statInfo; std::string procfsNamespacePath = getPathForLinuxNamespace(ns); // First use lstat to stat the link itself, to ensure it's indeed a link.
int rv = ::lstat(procfsNamespacePath.c_str(), &statInfo);
  if (rv < 0 && errno == ENOENT) {
    // These files were first provided in Linux 3.0 (although some of them came
    // later), however namespaces already existed before then, hence the only
    // safe thing to do is assume all processes are in different namespaces.
    return nullopt;
  }
  // Other errors, like access/permission ones, are unexpected.
  TP_THROW_SYSTEM_IF(rv < 0, errno);
  // Between Linux 3.0 and 3.7 these files were hard links. In Linux 3.8 they
  // became symlinks and only then it became possible to identify namespaces
  // through these files' inode numbers.
  if (!S_ISLNK(statInfo.st_mode)) {
    return nullopt;
  }
  // Then stat the "file" the link points to, as it's its inode we care about.
  rv = ::stat(procfsNamespacePath.c_str(), &statInfo);
  TP_THROW_SYSTEM_IF(rv < 0, errno);
  // These fields are of types dev_t and ino_t, which I couldn't find described
  // anywhere. They appear to be unsigned longs, but all we care about is that
  // they are integers, so let's check that.
  // NOTE(review): the static_assert template arguments (presumably dev_t and
  // ino_t, per the comment above) were lost in extraction -- restore from
  // upstream.
  static_assert(std::is_integral::value, "");
  static_assert(std::is_integral::value, "");
  std::ostringstream oss;
  // Combine device ID and inode into a single machine-local identifier.
  oss << std::hex << statInfo.st_dev << '_' << statInfo.st_ino;
  return oss.str();
}

// According to https://www.kernel.org/doc/Documentation/security/LSM.txt:
// > A list of the active security modules can be found by reading
// > /sys/kernel/security/lsm. This is a comma separated list [...].
optional> getLinuxSecurityModules() {
  std::ifstream f{"/sys/kernel/security/lsm"};
  if (f.fail()) {
    return nullopt;
  }
  // We shouldn't have to worry about an entirely empty file, as according to
  // the doc "[this list] will always include the capability module".
  std::vector res;
  while (!f.eof()) {
    std::string lsm;
    std::getline(f, lsm, ',');
    TP_THROW_ASSERT_IF(f.fail());
    res.push_back(std::move(lsm));
  }
  f.close();
  TP_THROW_ASSERT_IF(f.fail());
  return res;
}

// See ptrace(2) (the sections towards the end) and
// https://www.kernel.org/doc/Documentation/security/Yama.txt
optional getYamaPtraceScope() {
  std::ifstream f{"/proc/sys/kernel/yama/ptrace_scope"};
  if (f.fail()) {
    // YAMA not present (or procfs unavailable).
    return nullopt;
  }
  int scope;
  f >> scope;
  TP_THROW_ASSERT_IF(f.fail());
  f.close();
  TP_THROW_ASSERT_IF(f.fail());
  switch (scope) {
    case 0:
      return YamaPtraceScope::kClassicPtracePermissions;
    case 1:
      return YamaPtraceScope::kRestrictedPtrace;
    case 2:
      return YamaPtraceScope::kAdminOnlyAttach;
    case 3:
      return YamaPtraceScope::kNoAttach;
    default:
      TP_THROW_ASSERT() << "Unrecognized YAMA ptrace scope: " << scope;
      // Dummy return to make the compiler happy.
      return nullopt;
  }
}

// Queries the process's full permitted capability set via the raw capget(2)
// syscall and returns it as a hex-encoded 64-bit bitmask.
optional getPermittedCapabilitiesID() {
  std::remove_pointer::type header;
  std::array::type, 2> data;
  // At the time of writing there are three versions of the syscall supported
  // by the kernel, and we're supposed to perform a "handshake" to agree on the
  // latest version supported both by us and by the kernel. However, this is
  // only needed if we want to support pre-2.6.26 kernels, which we don't. Hence
  // we'll fail if the kernel doesn't support the latest version (v3). On the
  // other hand there is no way to figure out if the kernel's version has
  // advanced past the one we support. This will occur once there will be more
  // than 64 capabilities, but given the current pace this shouldn't happen for
  // quite a while. Such a limitation probably comes from the capability system
  // being designed around querying for a specific capability (in which case a
  // program only needs to support the syscall version where that capability was
  // added); querying _all_ capabilities (as we do) is kinda out-of-scope.
header.version = 0x20080522;
  // pid 0 means "the calling process".
  header.pid = 0;
  int rv = ::capget(&header, data.data());
  TP_THROW_SYSTEM_IF(rv < 0, errno);
  // We'll create a bitmask of the capabilities, and then return its hex.
  // NOTE(review): the static_cast target types (presumably uint64_t) were lost
  // in extraction.
  uint64_t bitmask = static_cast(data[0].permitted) |
      (static_cast(data[1].permitted) << 32);
  std::ostringstream oss;
  oss << std::hex << bitmask;
  return oss.str();
}
#endif

// Best-effort: names the current thread where the platform supports it, and
// silently does nothing elsewhere.
void setThreadName(std::string name) {
#ifdef __linux__
// In glibc this non-standard call was added in version 2.12, hence we guard it.
#ifdef __GLIBC__
#if ((__GLIBC__ > 2) || ((__GLIBC__ == 2) && (__GLIBC_MINOR__ >= 12)))
  pthread_setname_np(pthread_self(), name.c_str());
#endif
// In other standard libraries we didn't check yet, hence we always enable it.
#else
  pthread_setname_np(pthread_self(), name.c_str());
#endif
#endif
}

} // namespace tensorpipe

================================================ FILE: tensorpipe/common/system.h ================================================

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

// NOTE(review): the header names were lost in extraction.
#include
#include
#include
#include
#include
#include
#include
#include
#include

namespace tensorpipe {

//
// TimeStamp is a 64 bit value representing
// a high-resolution clock. It is usually
// in nano-seconds or in TSC cycles.
//
using TimeStamp = uint64_t;

constexpr TimeStamp kInvalidTimeStamp = std::numeric_limits::max();

std::string tstampToStr(TimeStamp ts);

// std::chrono::duration to TSC.
// Throws (via TP_THROW_EINVAL) on negative durations; returns nanoseconds.
template
TimeStamp durationToTimeStamp(TDuration d) {
  auto ns = std::chrono::duration_cast(d).count();
  if (ns < 0) {
    TP_THROW_EINVAL() << "Negative time durations are not valid";
  }
  return static_cast(ns);
}

//
// Useful math functions to work with CPU and binary integers
//

/// Is it a Power of 2?
constexpr bool isPow2(uint64_t n) noexcept {
  return n > 0 && !((n - 1) & n);
}

/// Smallest power of 2 larger or equal to <n> (bit-smearing trick).
constexpr uint32_t nextPow2(uint32_t n) noexcept {
  --n;
  n |= n >> 1;
  n |= n >> 2;
  n |= n >> 4;
  n |= n >> 8;
  n |= n >> 16;
  return n + 1;
}

/// Smallest power of 2 larger or equal to <n>.
constexpr uint64_t nextPow2(uint64_t n) noexcept {
  --n;
  n |= n >> 1;
  n |= n >> 2;
  n |= n >> 4;
  n |= n >> 8;
  n |= n >> 16;
  n |= n >> 32;
  return n + 1;
}

/// Largest power of 2 less or equal to <n>.
constexpr uint64_t maxPow2LessEqualThan(uint64_t n) noexcept {
  if (isPow2(n)) {
    return n;
  }
  return nextPow2(n) >> 1;
}

// Return contents of /proc/sys/kernel/random/boot_id.
optional getBootID();

enum class LinuxNamespace {
  kIpc,
  kNet,
  kPid,
  kUser,
  // Add more entries as needed.
};

// Returns a string that uniquely identifies a namespace of a certain type.
// It is only valid within the same machine and for that fixed type.
optional getLinuxNamespaceId(LinuxNamespace ns);

// Returns the names of the active Linux Security Modules, in the order in which
// they are employed by the kernel. The names could be arbitrary (as third-party
// LSMs could be in use) but contain values like "capability", "apparmor",
// "yama", "lockdown", ...
optional> getLinuxSecurityModules();

enum class YamaPtraceScope {
  kClassicPtracePermissions,
  kRestrictedPtrace,
  kAdminOnlyAttach,
  kNoAttach,
};

// YAMA is a Linux Security Module that specifically targets ptrace by locking
// down a process so it can only be targeted by its ancestors or by processes
// that it specifically selects. However YAMA can be disabled, or made even
// stricter. This function returns precisely what level YAMA is operating at.
optional getYamaPtraceScope();

// Return a representation of the set of permitted capabilities of the process.
// We're talking about Linux kernel capabilities, see capabilities(7).
optional getPermittedCapabilitiesID();

// Set the name of the current thread, if possible. Use only for debugging.
void setThreadName(std::string name);

} // namespace tensorpipe

================================================ FILE: tensorpipe/config.h.in ================================================

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#cmakedefine01 TENSORPIPE_HAS_SHM_TRANSPORT
#cmakedefine01 TENSORPIPE_HAS_IBV_TRANSPORT
#cmakedefine01 TENSORPIPE_HAS_CMA_CHANNEL

================================================ FILE: tensorpipe/config_cuda.h.in ================================================

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#cmakedefine01 TENSORPIPE_HAS_CUDA_IPC_CHANNEL
#cmakedefine01 TENSORPIPE_HAS_CUDA_GDR_CHANNEL

================================================ FILE: tensorpipe/core/context.cc ================================================

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
*/ #include #include #include #include #include #include namespace tensorpipe { Context::Context(ContextOptions opts) : impl_(std::make_shared(std::move(opts))) { impl_->init(); } void Context::registerTransport( int64_t priority, std::string transport, std::shared_ptr context) { impl_->registerTransport(priority, std::move(transport), std::move(context)); } void Context::registerChannel( int64_t priority, std::string channel, std::shared_ptr context) { impl_->registerChannel(priority, std::move(channel), std::move(context)); } std::shared_ptr Context::listen( const std::vector& urls) { return impl_->listen(urls); } std::shared_ptr Context::connect( const std::string& url, PipeOptions opts) { return impl_->connect(url, std::move(opts)); } void Context::close() { impl_->close(); } void Context::join() { impl_->join(); } Context::~Context() { join(); } } // namespace tensorpipe ================================================ FILE: tensorpipe/core/context.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include namespace tensorpipe { class ContextImpl; class Listener; class Pipe; class ContextOptions { public: // The name should be a semantically meaningful description of this context. // It will only be used for logging and debugging purposes, to identify the // endpoints of a pipe. ContextOptions&& name(std::string name) && { name_ = std::move(name); return std::move(*this); } private: std::string name_; friend ContextImpl; }; class PipeOptions { public: // The name should be a semantically meaningful description of the context // that the pipe is connecting to. It will only be used for logging and // debugging purposes, to identify the endpoints of a pipe. 
PipeOptions&& remoteName(std::string remoteName) && { remoteName_ = std::move(remoteName); return std::move(*this); } private: std::string remoteName_; friend ContextImpl; }; class Context final { public: explicit Context(ContextOptions opts = ContextOptions()); void registerTransport( int64_t priority, std::string transport, std::shared_ptr context); void registerChannel( int64_t priority, std::string channel, std::shared_ptr context); std::shared_ptr listen(const std::vector& urls); std::shared_ptr connect( const std::string& url, PipeOptions opts = PipeOptions()); // Put the context in a terminal state, in turn closing all of its pipes and // listeners, and release its resources. This may be done asynchronously, in // background. void close(); // Wait for all resources to be released and all background activity to stop. void join(); ~Context(); private: // The implementation is managed by a shared_ptr because each child object // will also hold a shared_ptr to it. However, its lifetime is tied to the one // of this public object since when the latter is destroyed the implementation // is closed and joined. const std::shared_ptr impl_; }; } // namespace tensorpipe ================================================ FILE: tensorpipe/core/context_impl.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace { std::atomic contextCouter{0}; std::string createContextId() { // Should we use argv[0] instead of the PID? It may be more semantically // meaningful and consistent across runs, but it may not be unique... 
// Also, should we add the hostname/the IP address in case the logs from // different hosts are merged into a single stream? // Eventually we'll have to replace getpid with something more portable. // Libuv offers a cross-platform function to get the process ID. return std::to_string(getpid()) + ":c" + std::to_string(contextCouter++); } } // namespace ContextImpl::ContextImpl(ContextOptions opts) : id_(createContextId()), name_(std::move(opts.name_)) { TP_VLOG(1) << "Context " << id_ << " created"; if (name_ != "") { TP_VLOG(1) << "Context " << id_ << " aliased as " << name_; id_ = name_; } } void ContextImpl::init() { deferToLoop([this]() { initFromLoop(); }); } void ContextImpl::initFromLoop() {} void ContextImpl::registerTransport( int64_t priority, std::string transport, std::shared_ptr context) { TP_THROW_ASSERT_IF(transport.empty()); TP_THROW_ASSERT_IF(transports_.find(transport) != transports_.end()) << "transport " << transport << " already registered"; TP_THROW_ASSERT_IF( transportsByPriority_.find(-priority) != transportsByPriority_.end()) << "transport with priority " << priority << " already registered"; if (!context->isViable()) { TP_VLOG(1) << "Context " << id_ << " is not registering transport " << transport << " because it is not viable"; return; } TP_VLOG(1) << "Context " << id_ << " is registering transport " << transport; context->setId(id_ + ".tr_" + transport); transports_.emplace(transport, context); // Reverse the priority, as the pipe will pick the *first* available transport // it can find in the ordered map, so higher priorities should come first. 
transportsByPriority_.emplace(-priority, std::make_tuple(transport, context));
}

// Mirror of registerTransport, but for channels.
// NOTE(review): template arguments (e.g. shared_ptr element types) were lost
// in extraction throughout this block.
void ContextImpl::registerChannel(
    int64_t priority,
    std::string channel,
    std::shared_ptr context) {
  TP_THROW_ASSERT_IF(channel.empty());
  TP_THROW_ASSERT_IF(channels_.find(channel) != channels_.end())
      << "channel " << channel << " already registered";
  TP_THROW_ASSERT_IF(
      channelsByPriority_.find(-priority) != channelsByPriority_.end())
      << "channel with priority " << priority << " already registered";
  if (!context->isViable()) {
    TP_VLOG(1) << "Context " << id_ << " is not registering channel " << channel
               << " because it is not viable";
    return;
  }
  TP_VLOG(1) << "Context " << id_ << " is registering channel " << channel;
  context->setId(id_ + ".ch_" + channel);
  channels_.emplace(channel, context);
  // Reverse the priority, as the pipe will pick the *first* available channel
  // it can find in the ordered map, so higher priorities should come first.
  channelsByPriority_.emplace(-priority, std::make_tuple(channel, context));
}

// Creates a listener whose id is derived from this context's id plus a
// sequence number (logging/debugging only).
std::shared_ptr ContextImpl::listen(const std::vector& urls) {
  std::string listenerId =
      id_ + "[l" + std::to_string(listenerCounter_++) + "]";
  TP_VLOG(1) << "Context " << id_ << " is opening listener " << listenerId;
  return std::make_shared(
      Listener::ConstructorToken(),
      shared_from_this(),
      std::move(listenerId),
      urls);
}

// Creates a pipe; when the caller named the remote end, a human-readable alias
// replaces the sequence-number id.
std::shared_ptr ContextImpl::connect(const std::string& url, PipeOptions opts) {
  std::string pipeId = id_ + ".p" + std::to_string(pipeCounter_++);
  TP_VLOG(1) << "Context " << id_ << " is opening pipe " << pipeId;
  std::string remoteContextName = std::move(opts.remoteName_);
  if (remoteContextName != "") {
    std::string aliasPipeId = id_ + "_to_" + remoteContextName;
    TP_VLOG(1) << "Pipe " << pipeId << " aliased as " << aliasPipeId;
    pipeId = std::move(aliasPipeId);
  }
  return std::make_shared(
      Pipe::ConstructorToken(),
      shared_from_this(),
      std::move(pipeId),
      std::move(remoteContextName),
      url);
}

// Throws (EINVAL) for unknown transports rather than returning null.
std::shared_ptr ContextImpl::getTransport(const std::string& transport) {
  auto iter = transports_.find(transport);
  if (iter == transports_.end()) {
    TP_THROW_EINVAL() << "unsupported transport " << transport;
  }
  return iter->second;
}

// Throws (EINVAL) for unknown channels rather than returning null.
std::shared_ptr ContextImpl::getChannel(const std::string& channel) {
  auto iter = channels_.find(channel);
  if (iter == channels_.end()) {
    TP_THROW_EINVAL() << "unsupported channel " << channel;
  }
  return iter->second;
}

const ContextImpl::TOrderedTransports& ContextImpl::getOrderedTransports() {
  return transportsByPriority_;
}

const ContextImpl::TOrderedChannels& ContextImpl::getOrderedChannels() {
  return channelsByPriority_;
}

const std::string& ContextImpl::getName() {
  return name_;
}

// Enrolled objects are kept alive via their shared_ptr until they unenroll.
void ContextImpl::enroll(ListenerImpl& listener) {
  TP_DCHECK(inLoop());
  bool wasInserted;
  std::tie(std::ignore, wasInserted) =
      listeners_.emplace(&listener, listener.shared_from_this());
  TP_DCHECK(wasInserted);
}

void ContextImpl::enroll(PipeImpl& pipe) {
  TP_DCHECK(inLoop());
  bool wasInserted;
  std::tie(std::ignore, wasInserted) =
      pipes_.emplace(&pipe, pipe.shared_from_this());
  TP_DCHECK(wasInserted);
}

void ContextImpl::unenroll(ListenerImpl& listener) {
  TP_DCHECK(inLoop());
  auto numRemoved = listeners_.erase(&listener);
  TP_DCHECK_EQ(numRemoved, 1);
}

void ContextImpl::unenroll(PipeImpl& pipe) {
  TP_DCHECK(inLoop());
  auto numRemoved = pipes_.erase(&pipe);
  TP_DCHECK_EQ(numRemoved, 1);
}

// The context counts as closed as soon as an error has been set.
bool ContextImpl::closed() {
  TP_DCHECK(inLoop());
  return error_;
}

void ContextImpl::deferToLoop(TTask fn) {
  loop_.deferToLoop(std::move(fn));
}

bool ContextImpl::inLoop() const {
  return loop_.inLoop();
}

void ContextImpl::close() {
  deferToLoop([this]() { closeFromLoop(); });
}

void ContextImpl::closeFromLoop() {
  TP_DCHECK(inLoop());
  TP_VLOG(1) << "Context " << id_ << " is closing";
  setError(TP_CREATE_ERROR(ContextClosedError));
  TP_VLOG(1) << "Context " << id_ << " done closing";
}

void ContextImpl::setError(Error error) {
  // Don't overwrite an error that's already set.
if (error_ || !error) {
    return;
  }

  error_ = std::move(error);

  handleError();
}

// Propagates the error to every dependent object: listeners/pipes are closed
// synchronously, transports/channels asynchronously.
void ContextImpl::handleError() {
  TP_DCHECK(inLoop());
  TP_VLOG(5) << "Context " << id_ << " is handling error " << error_.what();

  // Make a copy as they could unenroll themselves inline.
  auto listenersCopy = listeners_;
  auto pipesCopy = pipes_;
  // We call closeFromLoop, rather than just close, because we need these
  // objects to transition _immediately_ to error, "atomically". If we just
  // deferred closing to later, this could come after some already-enqueued
  // operations that could try to access the context, which would be closed,
  // and this could fail.
  for (auto& iter : listenersCopy) {
    iter.second->closeFromLoop();
  }
  for (auto& iter : pipesCopy) {
    iter.second->closeFromLoop();
  }

  for (auto& iter : transports_) {
    iter.second->close();
  }
  for (auto& iter : channels_) {
    iter.second->close();
  }
}

// Idempotent: joined_ guards against double-join (explicit join + destructor).
void ContextImpl::join() {
  close();

  if (!joined_.exchange(true)) {
    TP_VLOG(1) << "Context " << id_ << " is joining";

    // As closing is deferred to the loop, we must wait for close to be actually
    // called before we join, to avoid race conditions. For this, we defer
    // another task to the loop, which we know will run after the closing, and
    // then we wait for that task to be run.
    std::promise hasClosed;
    deferToLoop([&]() { hasClosed.set_value(); });
    hasClosed.get_future().wait();

    for (auto& iter : transports_) {
      iter.second->join();
    }
    for (auto& iter : channels_) {
      iter.second->join();
    }

    TP_VLOG(1) << "Context " << id_ << " done joining";

    TP_DCHECK(listeners_.empty());
    TP_DCHECK(pipes_.empty());
  }
}

} // namespace tensorpipe

================================================ FILE: tensorpipe/core/context_impl.h ================================================

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

// NOTE(review): the header names were lost in extraction, as were the
// template arguments in the declarations below.
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

namespace tensorpipe {

class ListenerImpl;
class PipeImpl;

class ContextImpl final : public virtual DeferredExecutor,
                          public std::enable_shared_from_this {
 public:
  explicit ContextImpl(ContextOptions opts);

  void init();

  void registerTransport(
      int64_t priority,
      std::string transport,
      std::shared_ptr context);

  void registerChannel(
      int64_t priority,
      std::string channel,
      std::shared_ptr context);

  std::shared_ptr listen(const std::vector& urls);

  std::shared_ptr connect(const std::string& url, PipeOptions opts);

  std::shared_ptr getTransport(const std::string& transport);

  std::shared_ptr getChannel(const std::string& channel);

  using TOrderedTransports = std::map<
      int64_t,
      std::tuple>>;

  const TOrderedTransports& getOrderedTransports();

  using TOrderedChannels = std::
      map>>;

  const TOrderedChannels& getOrderedChannels();

  // Return the name given to the context's constructor. It will be retrieved
  // by the pipes and listener in order to attach it to logged messages.
  const std::string& getName();

  // Enrolling dependent objects (listeners and pipes) causes them to be kept
  // alive for as long as the context exists. These objects should enroll
  // themselves as soon as they're created (in their initFromLoop method) and
  // unenroll themselves after they've completed handling an error (either right
  // in the handleError method or in a subsequent callback). The context, on the
  // other hand, should avoid terminating (i.e., complete joining) until all
  // objects have unenrolled themselves.
  void enroll(ListenerImpl& listener);
  void enroll(PipeImpl& pipe);
  void unenroll(ListenerImpl& listener);
  void unenroll(PipeImpl& pipe);

  // Return whether the context is in a closed state. To avoid race conditions,
  // this must be called from within the loop.
  bool closed();

  // Implement DeferredExecutor interface.
  void deferToLoop(TTask fn) override;
  bool inLoop() const override;

  void close();

  void join();

 private:
  OnDemandDeferredExecutor loop_;

  Error error_{Error::kSuccess};

  std::atomic joined_{false};

  // An identifier for the context, either consisting of the user-provided name
  // for this context (see below) or, by default, composed of unique information
  // about the host and process, combined with an increasing sequence number. It
  // will be used as a prefix for the identifiers of listeners and pipes. All of
  // them will only be used for logging and debugging purposes.
  std::string id_;

  // Sequence numbers for the listeners and pipes created by this context, used
  // to create their identifiers based off this context's identifier. They will
  // only be used for logging and debugging.
  std::atomic listenerCounter_{0};
  std::atomic pipeCounter_{0};

  // Store shared_ptrs to dependent objects that have enrolled themselves to
  // keep them alive. We use a map, indexed by raw pointers, rather than a set
  // of shared_ptrs so that we can erase objects without them having to create
  // a fresh shared_ptr just for that.
  std::unordered_map> listeners_;
  std::unordered_map> pipes_;

  // A user-provided name for this context which should be semantically
  // meaningful. It will only be used for logging and debugging purposes, to
  // identify the endpoints of a pipe.
  std::string name_;

  std::unordered_map> transports_;

  using TContextMap = std::unordered_map>;
  TContextMap channels_;

  TOrderedTransports transportsByPriority_;
  TOrderedChannels channelsByPriority_;

  CallbackWrapper callbackWrapper_{*this, *this};

  void initFromLoop();
  void closeFromLoop();

  void setError(Error error);

  void handleError();

  template
  friend class CallbackWrapper;
};

} // namespace tensorpipe

================================================ FILE: tensorpipe/core/error.cc ================================================

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
*
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

// NOTE(review): the header names were lost in extraction.
#include
#include

namespace tensorpipe {

// what() implementations for the core error types; these strings surface in
// user-facing logs and error reports.
std::string LogicError::what() const {
  std::ostringstream ss;
  ss << "logic error: " << reason_;
  return ss.str();
}

std::string ContextClosedError::what() const {
  return "context closed";
}

std::string ListenerClosedError::what() const {
  return "listener closed";
}

std::string PipeClosedError::what() const {
  return "pipe closed";
}

} // namespace tensorpipe

================================================ FILE: tensorpipe/core/error.h ================================================

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

// NOTE(review): the header names were lost in extraction.
#include
#include

namespace tensorpipe {

// Error raised for programming mistakes, carrying a free-form reason.
class LogicError final : public BaseError {
 public:
  explicit LogicError(std::string reason) : reason_(std::move(reason)) {}

  std::string what() const override;

 private:
  const std::string reason_;
};

// Error used to fail operations on a context that has been closed.
class ContextClosedError final : public BaseError {
 public:
  explicit ContextClosedError() {}

  std::string what() const override;
};

// Error used to fail operations on a listener that has been closed.
class ListenerClosedError final : public BaseError {
 public:
  explicit ListenerClosedError() {}

  std::string what() const override;
};

// Error used to fail operations on a pipe that has been closed.
class PipeClosedError final : public BaseError {
 public:
  explicit PipeClosedError() {}

  std::string what() const override;
};

} // namespace tensorpipe

================================================ FILE: tensorpipe/core/listener.cc ================================================

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

// NOTE(review): the header names (and template arguments below) were lost in
// extraction.
#include
#include
#include
#include
#include
#include
#include

namespace tensorpipe {

// Listener is a pimpl facade over ListenerImpl; construction immediately
// schedules initialization on the context loop.
Listener::Listener(
    ConstructorToken /* unused */,
    std::shared_ptr context,
    std::string id,
    const std::vector& urls)
    : impl_(std::make_shared(
          std::move(context),
          std::move(id),
          urls)) {
  impl_->init();
}

void Listener::close() {
  impl_->close();
}

Listener::~Listener() {
  close();
}

void Listener::accept(accept_callback_fn fn) {
  impl_->accept(std::move(fn));
}

const std::map& Listener::addresses() const {
  return impl_->addresses();
}

const std::string& Listener::address(const std::string& transport) const {
  return impl_->address(transport);
}

std::string Listener::url(const std::string& transport) const {
  return impl_->url(transport);
}

} // namespace tensorpipe

================================================ FILE: tensorpipe/core/listener.h ================================================

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

// NOTE(review): the header names were lost in extraction.
#include
#include
#include
#include
#include
#include

namespace tensorpipe {

class ContextImpl;
class ListenerImpl;
class Pipe;

// The listener.
//
// Listeners are used to produce pipes. Depending on the type of the
// context, listeners may use a variety of addresses to listen on. For
// example, for TCP/IP sockets they listen on an IPv4 or IPv6 address,
// for Unix domain sockets they listen on a path, etcetera.
//
// A pipe can only be accepted from this listener after it has been
// fully established. This means that both its connection and all its
// side channels have been established.
//
class Listener final {
  // Use the passkey idiom to allow make_shared to call what should be a private
  // constructor. See https://abseil.io/tips/134 for more information.
struct ConstructorToken {};

 public:
  Listener(
      ConstructorToken token,
      std::shared_ptr context,
      std::string id,
      const std::vector& urls);

  //
  // Entry points for user code
  //

  using accept_callback_fn = std::function)>;

  void accept(accept_callback_fn fn);

  // Returns map with the materialized address of listeners by transport.
  //
  // If you don't bind a transport listener to a specific port or address, it
  // may generate its address automatically. Then, in order to connect to the
  // listener, the user must use a separate mechanism to communicate the
  // materialized address to whoever wants to connect.
  //
  const std::map& addresses() const;

  // Returns materialized address for specific transport.
  //
  // See `addresses()` for more information.
  //
  const std::string& address(const std::string& transport) const;

  // Returns URL with materialized address for specific transport.
  //
  // See `addresses()` for more information.
  //
  std::string url(const std::string& transport) const;

  // Put the listener in a terminal state, aborting its pending operations and
  // rejecting future ones, and release its resources. This may be carried out
  // asynchronously, in background. Since the pipes may occasionally use the
  // listener to open new connections, closing a listener may trigger errors
  // in the pipes.
  void close();

  ~Listener();

 private:
  // Using a shared_ptr allows us to detach the lifetime of the implementation
  // from the public object's one and perform the destruction asynchronously.
  const std::shared_ptr impl_;

  // Allow context to access constructor token.
  friend ContextImpl;
};

} // namespace tensorpipe

================================================ FILE: tensorpipe/core/listener_impl.cc ================================================

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

// NOTE(review): the header names (and template arguments below) were lost in
// extraction.
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

namespace tensorpipe {

// Opens one transport listener per URL; addresses_ records each transport's
// materialized address so users can retrieve it later.
ListenerImpl::ListenerImpl(
    std::shared_ptr context,
    std::string id,
    const std::vector& urls)
    : context_(std::move(context)), id_(std::move(id)) {
  for (const auto& url : urls) {
    std::string transport;
    std::string address;
    std::tie(transport, address) = splitSchemeOfURL(url);
    std::shared_ptr context = context_->getTransport(transport);
    std::shared_ptr listener = context->listen(address);
    listener->setId(id_ + ".tr_" + transport);
    addresses_.emplace(transport, listener->addr());
    listeners_.emplace(transport, std::move(listener));
  }
}

void ListenerImpl::init() {
  context_->deferToLoop(
      [impl{this->shared_from_this()}]() { impl->initFromLoop(); });
}

void ListenerImpl::initFromLoop() {
  TP_DCHECK(context_->inLoop());

  if (context_->closed()) {
    // Set the error without calling setError because we do not want to invoke
    // handleError as it would find itself in a weird state (since the rest of
    // initFromLoop wouldn't have been called).
error_ = TP_CREATE_ERROR(ListenerClosedError);
    TP_VLOG(1) << "Listener " << id_ << " is closing (without initing)";
    return;
  }

  context_->enroll(*this);

  // Start accepting on every underlying transport listener.
  for (const auto& listener : listeners_) {
    armListener(listener.first);
  }
}

void ListenerImpl::close() {
  context_->deferToLoop(
      [impl{this->shared_from_this()}]() { impl->closeFromLoop(); });
}

void ListenerImpl::closeFromLoop() {
  TP_DCHECK(context_->inLoop());
  TP_VLOG(1) << "Listener " << id_ << " is closing";
  setError(TP_CREATE_ERROR(ListenerClosedError));
}

//
// Entry points for user code
//

void ListenerImpl::accept(accept_callback_fn fn) {
  context_->deferToLoop(
      [impl{this->shared_from_this()}, fn{std::move(fn)}]() mutable {
        impl->acceptFromLoop(std::move(fn));
      });
}

void ListenerImpl::acceptFromLoop(accept_callback_fn fn) {
  TP_DCHECK(context_->inLoop());

  uint64_t sequenceNumber = nextPipeBeingAccepted_++;
  TP_VLOG(1) << "Listener " << id_ << " received an accept request (#"
             << sequenceNumber << ")";

  // Wrap the user callback so invocations are logged and their ordering by
  // sequence number is enforced (via the TP_DCHECK_EQ below).
  fn = [this, sequenceNumber, fn{std::move(fn)}](
           const Error& error, std::shared_ptr pipe) {
    TP_DCHECK_EQ(sequenceNumber, nextAcceptCallbackToCall_++);
    TP_VLOG(1) << "Listener " << id_ << " is calling an accept callback (#"
               << sequenceNumber << ")";
    fn(error, std::move(pipe));
    TP_VLOG(1) << "Listener " << id_ << " done calling an accept callback (#"
               << sequenceNumber << ")";
  };

  // If already errored, fail the callback immediately instead of arming it.
  if (error_) {
    fn(error_, std::shared_ptr());
    return;
  }

  acceptCallback_.arm(std::move(fn));
}

const std::map& ListenerImpl::addresses() const {
  // As this is an immutable member (after it has been initialized in
  // the constructor), we'll access it without deferring to the loop.
  return addresses_;
}

const std::string& ListenerImpl::address(const std::string& transport) const {
  // As this is an immutable member (after it has been initialized in
  // the constructor), we'll access it without deferring to the loop.
  const auto it = addresses_.find(transport);
  TP_THROW_ASSERT_IF(it == addresses_.end())
      << ": transport '" << transport << "' not in use by this listener.";
  return it->second;
}

std::string ListenerImpl::url(const std::string& transport) const {
  // As this is an immutable member (after it has been initialized in
  // the constructor), we'll access it without deferring to the loop.
  return transport + "://" + address(transport);
}

//
// Entry points for internal code
//

uint64_t ListenerImpl::registerConnectionRequest(
    connection_request_callback_fn fn) {
  TP_DCHECK(context_->inLoop());

  uint64_t registrationId = nextConnectionRequestRegistrationId_++;
  TP_VLOG(1) << "Listener " << id_
             << " received a connection request registration (#"
             << registrationId << ")";

  // Wrap the callback purely for logging around the invocation.
  fn = [this, registrationId, fn{std::move(fn)}](
           const Error& error,
           std::string transport,
           std::shared_ptr connection) {
    TP_VLOG(1) << "Listener " << id_
               << " is calling a connection request registration callback (#"
               << registrationId << ")";
    fn(error, std::move(transport), std::move(connection));
    TP_VLOG(1) << "Listener " << id_
               << " done calling a connection request registration callback (#"
               << registrationId << ")";
  };

  // If already errored, fail immediately; otherwise store the registration.
  if (error_) {
    fn(error_, std::string(), std::shared_ptr());
  } else {
    connectionRequestRegistrations_.emplace(registrationId, std::move(fn));
  }

  return registrationId;
}

void ListenerImpl::unregisterConnectionRequest(uint64_t registrationId) {
  TP_DCHECK(context_->inLoop());
  TP_VLOG(1) << "Listener " << id_
             << " received a connection request de-registration (#"
             << registrationId << ")";
  connectionRequestRegistrations_.erase(registrationId);
}

//
// Error handling
//

void ListenerImpl::setError(Error error) {
  // Don't overwrite an error that's already set.
if (error_ || !error) { return; } error_ = std::move(error); handleError(); } void ListenerImpl::handleError() { TP_DCHECK(context_->inLoop()); TP_VLOG(2) << "Listener " << id_ << " is handling error " << error_.what(); acceptCallback_.triggerAll([&]() { return std::make_tuple(std::cref(error_), std::shared_ptr()); }); for (auto& iter : connectionRequestRegistrations_) { connection_request_callback_fn fn = std::move(iter.second); fn(error_, std::string(), std::shared_ptr()); } connectionRequestRegistrations_.clear(); for (const auto& listener : listeners_) { listener.second->close(); } for (const auto& connection : connectionsWaitingForHello_) { connection->close(); } connectionsWaitingForHello_.clear(); context_->unenroll(*this); } // // Everything else // void ListenerImpl::onAccept( std::string transport, std::shared_ptr connection) { TP_DCHECK(context_->inLoop()); // Keep it alive until we figure out what to do with it. connectionsWaitingForHello_.insert(connection); auto nopHolderIn = std::make_shared>(); TP_VLOG(3) << "Listener " << id_ << " is reading nop object (spontaneous or requested connection)"; connection->read( *nopHolderIn, callbackWrapper_([nopHolderIn, transport{std::move(transport)}, connection](ListenerImpl& impl) mutable { TP_VLOG(3) << "Listener " << impl.id_ << " done reading nop object (spontaneous or requested connection)"; if (impl.error_) { return; } impl.connectionsWaitingForHello_.erase(connection); impl.onConnectionHelloRead( std::move(transport), std::move(connection), nopHolderIn->getObject()); })); } void ListenerImpl::armListener(std::string transport) { TP_DCHECK(context_->inLoop()); auto iter = listeners_.find(transport); if (iter == listeners_.end()) { TP_THROW_EINVAL() << "unsupported transport " << transport; } auto transportListener = iter->second; TP_VLOG(3) << "Listener " << id_ << " is accepting connection on transport " << transport; transportListener->accept( callbackWrapper_([transport]( ListenerImpl& impl, 
std::shared_ptr connection) { TP_VLOG(3) << "Listener " << impl.id_ << " done accepting connection on transport " << transport; if (impl.error_) { return; } impl.onAccept(transport, std::move(connection)); impl.armListener(transport); })); } void ListenerImpl::onConnectionHelloRead( std::string transport, std::shared_ptr connection, const Packet& nopPacketIn) { TP_DCHECK(context_->inLoop()); if (nopPacketIn.is()) { const SpontaneousConnection& nopSpontaneousConnection = *nopPacketIn.get(); TP_VLOG(3) << "Listener " << id_ << " got spontaneous connection"; std::string pipeId = id_ + ".p" + std::to_string(pipeCounter_++); TP_VLOG(1) << "Listener " << id_ << " is opening pipe " << pipeId; const std::string& remoteContextName = nopSpontaneousConnection.contextName; if (remoteContextName != "") { std::string aliasPipeId = id_ + "_from_" + remoteContextName; TP_VLOG(1) << "Pipe " << pipeId << " aliased as " << aliasPipeId; pipeId = std::move(aliasPipeId); } auto pipe = std::make_shared( context_, shared_from_this(), std::move(pipeId), remoteContextName, std::move(transport), std::move(connection)); // We initialize the pipe from the loop immediately, inline, because the // initialization of a pipe accepted by a listener happens partly in the // listener and partly in the pipe's initFromLoop, and we need these two // steps to happen "atomically" to make it impossible for an error to occur // in between. pipe->initFromLoop(); acceptCallback_.trigger( Error::kSuccess, std::make_shared(Pipe::ConstructorToken(), std::move(pipe))); } else if (nopPacketIn.is()) { const RequestedConnection& nopRequestedConnection = *nopPacketIn.get(); uint64_t registrationId = nopRequestedConnection.registrationId; TP_VLOG(3) << "Listener " << id_ << " got requested connection (#" << registrationId << ")"; auto iter = connectionRequestRegistrations_.find(registrationId); // The connection request may have already been deregistered, for example // because the pipe may have been closed. 
if (iter != connectionRequestRegistrations_.end()) { auto fn = std::move(iter->second); connectionRequestRegistrations_.erase(iter); fn(Error::kSuccess, std::move(transport), std::move(connection)); } } else { TP_LOG_ERROR() << "packet contained unknown content: " << nopPacketIn.index(); } } } // namespace tensorpipe ================================================ FILE: tensorpipe/core/listener_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { class ContextImpl; class ListenerImpl final : public std::enable_shared_from_this { public: ListenerImpl( std::shared_ptr context, std::string id, const std::vector& urls); // Called by the listener's constructor. void init(); using accept_callback_fn = Listener::accept_callback_fn; void accept(accept_callback_fn fn); const std::map& addresses() const; const std::string& address(const std::string& transport) const; std::string url(const std::string& transport) const; using connection_request_callback_fn = std::function< void(const Error&, std::string, std::shared_ptr)>; uint64_t registerConnectionRequest(connection_request_callback_fn fn); void unregisterConnectionRequest(uint64_t registrationId); void close(); private: void acceptFromLoop(accept_callback_fn fn); void closeFromLoop(); Error error_{Error::kSuccess}; std::shared_ptr context_; // An identifier for the listener, composed of the identifier for the context, // combined with an increasing sequence number. It will be used as a prefix // for the identifiers of pipes. All of them will only be used for logging and // debugging purposes. 
std::string id_; // Sequence numbers for the pipes created by this listener, used to create // their identifiers based off this listener's identifier. They will only be // used for logging and debugging. std::atomic pipeCounter_{0}; std::unordered_map> listeners_; std::map addresses_; // A sequence number for the calls to accept. uint64_t nextPipeBeingAccepted_{0}; // A sequence number for the invocations of the callbacks of accept. uint64_t nextAcceptCallbackToCall_{0}; RearmableCallback> acceptCallback_; // Needed to keep them alive. std::unordered_set> connectionsWaitingForHello_; uint64_t nextConnectionRequestRegistrationId_{0}; // FIXME Consider using a (ordered) map, because keys are IDs which are // generated in sequence and thus we can do a quick (but partial) check of // whether a callback is in the map by comparing its ID with the smallest // and largest key, which in an ordered map are the first and last item. std::unordered_map connectionRequestRegistrations_; // // Initialization // void initFromLoop(); // // Helpers to prepare callbacks from transports // CallbackWrapper callbackWrapper_{*this, *this->context_}; // // Error handling // void setError(Error error); void handleError(); // // Everything else // void armListener(std::string transport); void onAccept( std::string transport, std::shared_ptr connection); void onConnectionHelloRead( std::string transport, std::shared_ptr connection, const Packet& nopPacketIn); template friend class CallbackWrapper; // Contexts do sometimes need to call directly into closeFromLoop, in order to // make sure that some of their operations can happen "atomically" on the // connection, without possibly other operations occurring in between (e.g., // an error). friend ContextImpl; }; } // namespace tensorpipe ================================================ FILE: tensorpipe/core/message.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include namespace tensorpipe { // Messages consist of a primary buffer and zero or more separate // buffers. The primary buffer is always a host-side memory region that // contains a serialized version of the message we're dealing with. This // serialized message, in turn, may have references to the separate // buffers that accompany the primary buffer. These separate buffers may // point to any type of memory, host-side or device-side. // class Message final { public: std::string metadata; struct Payload { void* data{nullptr}; size_t length{0}; // Users may include arbitrary metadata in the following fields. // This may contain allocation hints for the receiver, for example. std::string metadata; }; // Holds the payloads that are transferred over the primary connection. std::vector payloads; struct Tensor { tensorpipe::Buffer buffer; size_t length{0}; // Users may optionally specify the target device, on which the receiver // should allocate memory for this tensor. If left unset, the receiver will // choose one at their convenience. optional targetDevice; // Users may include arbitrary metadata in the following field. // This may contain allocation hints for the receiver, for example. std::string metadata; }; // Holds the tensors that are offered to the side channels. std::vector tensors; }; // Descriptors consist of metadata required by the receiver to allocate memory // for an incoming message. class Descriptor final { public: std::string metadata; struct Payload { size_t length{0}; std::string metadata; }; std::vector payloads; struct Tensor { size_t length{0}; // This is the sender-side device from which this tensor is being sent. 
Device sourceDevice; // The sender may optionally specify a target device, in which case the // receiver must allocate memory for this tensor on the specified device. optional targetDevice; std::string metadata; }; std::vector tensors; }; // Allocations consist of actual memory allocations provided by the receiver for // an incoming message. They must match the length and target devices specified // in the corresponding Descriptor. class Allocation final { public: struct Payload { void* data{nullptr}; }; std::vector payloads; struct Tensor { tensorpipe::Buffer buffer; }; std::vector tensors; }; } // namespace tensorpipe ================================================ FILE: tensorpipe/core/nop_types.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { struct SpontaneousConnection { std::string contextName; NOP_STRUCTURE(SpontaneousConnection, contextName); }; struct RequestedConnection { uint64_t registrationId; NOP_STRUCTURE(RequestedConnection, registrationId); }; NOP_EXTERNAL_STRUCTURE(Device, type, index); struct Brochure { std::unordered_map transportDomainDescriptors; std::unordered_map> channelDeviceDescriptors; NOP_STRUCTURE(Brochure, transportDomainDescriptors, channelDeviceDescriptors); }; struct BrochureAnswer { std::string transport; std::string address; std::unordered_map transportRegistrationIds; std::string transportDomainDescriptor; std::unordered_map> channelRegistrationIds; std::unordered_map> channelDeviceDescriptors; std::unordered_map, std::string> channelForDevicePair; NOP_STRUCTURE( BrochureAnswer, transport, address, transportRegistrationIds, transportDomainDescriptor, channelRegistrationIds, 
channelDeviceDescriptors, channelForDevicePair); }; NOP_EXTERNAL_STRUCTURE(Descriptor::Payload, length, metadata); NOP_EXTERNAL_STRUCTURE( Descriptor::Tensor, length, sourceDevice, targetDevice, metadata); NOP_EXTERNAL_STRUCTURE(Descriptor, metadata, payloads, tensors); struct DescriptorReply { std::vector targetDevices; NOP_STRUCTURE(DescriptorReply, targetDevices); }; using Packet = nop::Variant; } // namespace tensorpipe ================================================ FILE: tensorpipe/core/pipe.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include namespace tensorpipe { Pipe::Pipe( ConstructorToken /* unused */, std::shared_ptr context, std::string id, std::string remoteName, const std::string& url) : impl_(std::make_shared( std::move(context), std::move(id), std::move(remoteName), url)) { impl_->init(); } Pipe::Pipe(ConstructorToken /* unused */, std::shared_ptr impl) : impl_(std::move(impl)) {} const std::string& Pipe::getRemoteName() { return impl_->getRemoteName(); } Pipe::~Pipe() { close(); } void Pipe::close() { impl_->close(); } void Pipe::readDescriptor(read_descriptor_callback_fn fn) { impl_->readDescriptor(std::move(fn)); } void Pipe::read(Allocation allocation, read_callback_fn fn) { impl_->read(std::move(allocation), std::move(fn)); } void Pipe::write(Message message, write_callback_fn fn) { impl_->write(std::move(message), std::move(fn)); } } // namespace tensorpipe ================================================ FILE: tensorpipe/core/pipe.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include namespace tensorpipe { class ContextImpl; class ListenerImpl; class PipeImpl; // The pipe. // // Pipes represent a set of connections between a pair of processes. // Unlike POSIX pipes, they are message oriented instead of byte // oriented. Messages that are sent through the pipe may use whatever // channels are at their disposal to make it happen. If the pair of // processes happen to be colocated on the same machine, they may // leverage a region of shared memory to communicate the primary // buffer of a message. Secondary buffers may use shared memory as // well, if they're located in CPU memory, or use a CUDA device to // device copy if they're located in NVIDIA GPU memory. If the pair is // located across the world, they may simply use a set of TCP // connections to communicate. // class Pipe final { // Use the passkey idiom to allow make_shared to call what should be a private // constructor. See https://abseil.io/tips/134 for more information. struct ConstructorToken {}; public: // // Initialization // Pipe( ConstructorToken token, std::shared_ptr context, std::string id, std::string remoteName, const std::string& url); Pipe(ConstructorToken token, std::shared_ptr impl); // // Entry points for user code // using read_descriptor_callback_fn = std::function; void readDescriptor(read_descriptor_callback_fn fn); using read_callback_fn = std::function; void read(Allocation allocation, read_callback_fn fn); using write_callback_fn = std::function; void write(Message message, write_callback_fn fn); // Retrieve the user-defined name that was given to the constructor of the // context on the remote side, if any (if not, this will be the empty string). // This is intended to help in logging and debugging only. 
const std::string& getRemoteName(); // Put the pipe in a terminal state, aborting its pending operations and // rejecting future ones, and release its resrouces. This may be carried out // asynchronously, in background. void close(); ~Pipe(); private: // Using a shared_ptr allows us to detach the lifetime of the implementation // from the public object's one and perform the destruction asynchronously. const std::shared_ptr impl_; // Allow context to access constructor token. friend ContextImpl; // Allow listener to access constructor token. friend ListenerImpl; }; } // namespace tensorpipe ================================================ FILE: tensorpipe/core/pipe_impl.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace { void parseDescriptorReplyOfMessage( WriteOperation& op, DescriptorReply nopDescriptorReply) { const int numTensors = op.message.tensors.size(); size_t targetDeviceIdx = 0; for (size_t tensorIdx = 0; tensorIdx < numTensors; ++tensorIdx) { const Message::Tensor& tensor = op.message.tensors[tensorIdx]; WriteOperation::Tensor& tensorBeingSent = op.tensors[tensorIdx]; if (!tensor.targetDevice.has_value()) { tensorBeingSent.targetDevice = std::move(nopDescriptorReply.targetDevices[targetDeviceIdx++]); } } TP_DCHECK_EQ(targetDeviceIdx, nopDescriptorReply.targetDevices.size()); } // Raise an error if the number of payloads and tensors in the allocation do not // match the ones that are expected by the ReadOperation. Also checks that // tensors are allocated on the correct devices. 
void checkAllocationCompatibility( const Descriptor& descriptor, const Allocation& allocation) { size_t numPayloads = allocation.payloads.size(); TP_THROW_ASSERT_IF(numPayloads != descriptor.payloads.size()); size_t numTensors = allocation.tensors.size(); TP_THROW_ASSERT_IF(numTensors != descriptor.tensors.size()); for (size_t tensorIdx = 0; tensorIdx < numTensors; tensorIdx++) { const Allocation::Tensor& tensor = allocation.tensors[tensorIdx]; const Descriptor::Tensor& tensorDescriptor = descriptor.tensors[tensorIdx]; if (tensorDescriptor.targetDevice.has_value()) { TP_THROW_ASSERT_IF( !(tensor.buffer.device() == tensorDescriptor.targetDevice.value())); } } } // Produce a nop object containing a message descriptor using the information // contained in the WriteOperation: number and sizes of payloads and tensors, // tensor descriptors, ... std::shared_ptr> makeDescriptorForMessage( const WriteOperation& op) { auto nopHolderOut = std::make_shared>(); Descriptor& nopDescriptor = nopHolderOut->getObject(); nopDescriptor.metadata = op.message.metadata; for (int payloadIdx = 0; payloadIdx < op.message.payloads.size(); ++payloadIdx) { const Message::Payload& payload = op.message.payloads[payloadIdx]; nopDescriptor.payloads.emplace_back(); Descriptor::Payload& nopPayloadDescriptor = nopDescriptor.payloads.back(); nopPayloadDescriptor.length = payload.length; nopPayloadDescriptor.metadata = payload.metadata; } TP_DCHECK_EQ(op.message.tensors.size(), op.tensors.size()); for (int tensorIdx = 0; tensorIdx < op.tensors.size(); ++tensorIdx) { const Message::Tensor& tensor = op.message.tensors[tensorIdx]; nopDescriptor.tensors.emplace_back(); Descriptor::Tensor& nopTensorDescriptor = nopDescriptor.tensors.back(); nopTensorDescriptor.metadata = tensor.metadata; nopTensorDescriptor.sourceDevice = tensor.buffer.device(); if (tensor.targetDevice.has_value()) { nopTensorDescriptor.targetDevice = tensor.targetDevice.value(); } nopTensorDescriptor.length = tensor.length; } return 
nopHolderOut; } std::shared_ptr> makeDescriptorReplyForMessage( const ReadOperation& op) { auto nopHolderOut = std::make_shared>(); DescriptorReply& nopDescriptorReply = nopHolderOut->getObject(); for (size_t tensorIdx = 0; tensorIdx < op.descriptor.tensors.size(); ++tensorIdx) { if (!op.descriptor.tensors[tensorIdx].targetDevice.has_value()) { const Allocation::Tensor& tensor = op.allocation.tensors[tensorIdx]; nopDescriptorReply.targetDevices.push_back(tensor.buffer.device()); } } return nopHolderOut; } struct SelectedTransport { std::string name; std::string address; std::string domainDescriptor; }; SelectedTransport selectTransport( const ContextImpl::TOrderedTransports& orderedTransports, const std::unordered_map& remoteDomainDescriptors, const std::map& addresses) { for (const auto& transportContextIter : orderedTransports) { const std::string& transportName = std::get<0>(transportContextIter.second); const transport::Context& transportContext = *(std::get<1>(transportContextIter.second)); // This pipe's listener might not have an address for that transport. const auto addressIter = addresses.find(transportName); if (addressIter == addresses.cend()) { continue; } const auto& address = addressIter->second; const auto remoteDomainDescriptorsIter = remoteDomainDescriptors.find(transportName); if (remoteDomainDescriptorsIter == remoteDomainDescriptors.cend()) { continue; } const std::string& remoteDomainDescriptor = remoteDomainDescriptorsIter->second; if (!transportContext.canCommunicateWithRemote(remoteDomainDescriptor)) { continue; } return {transportName, address, transportContext.domainDescriptor()}; } TP_THROW_ASSERT() << "Could not find a viable transport"; // Returning dummy value to silence compiler warning. 
return {}; } struct SelectedChannels { std::unordered_map> descriptorsMap; std::unordered_map, std::string> channelForDevicePair; }; SelectedChannels selectChannels( const ContextImpl::TOrderedChannels& orderedChannels, const std::unordered_map< std::string, std::unordered_map>& remoteDescriptorsMap) { SelectedChannels result; for (const auto& channelIter : orderedChannels) { const std::string& channelName = std::get<0>(channelIter.second); const channel::Context& channelContext = *std::get<1>(channelIter.second); const auto& remoteDescriptorsMapIter = remoteDescriptorsMap.find(channelName); if (remoteDescriptorsMapIter == remoteDescriptorsMap.end()) { continue; } const std::unordered_map& localDeviceDescriptors = channelContext.deviceDescriptors(); const std::unordered_map& remoteDeviceDescriptors = remoteDescriptorsMapIter->second; bool selected = false; for (const auto& localDescIter : localDeviceDescriptors) { const Device& localDevice = localDescIter.first; const std::string& localDeviceDescriptor = localDescIter.second; for (const auto& remoteDescIter : remoteDeviceDescriptors) { const Device& remoteDevice = remoteDescIter.first; const std::string& remoteDeviceDescriptor = remoteDescIter.second; if (!channelContext.canCommunicateWithRemote( localDeviceDescriptor, remoteDeviceDescriptor)) { continue; } if (result.channelForDevicePair.count({localDevice, remoteDevice}) != 0) { // A channel with higher priority has already been selected for this // device pair. 
continue; } selected = true; result.channelForDevicePair[{localDevice, remoteDevice}] = channelName; } } if (selected) { result.descriptorsMap[channelName] = localDeviceDescriptors; } } return result; } } // namespace // // Initialization // PipeImpl::PipeImpl( std::shared_ptr context, std::string id, std::string remoteName, const std::string& url) : state_(CLIENT_ABOUT_TO_SEND_HELLO_AND_BROCHURE), context_(std::move(context)), id_(std::move(id)), remoteName_(std::move(remoteName)) { std::string address; std::tie(transport_, address) = splitSchemeOfURL(url); descriptorConnection_ = context_->getTransport(transport_)->connect(std::move(address)); descriptorConnection_->setId(id_ + ".d.tr_" + transport_); } PipeImpl::PipeImpl( std::shared_ptr context, std::shared_ptr listener, std::string id, std::string remoteName, std::string transport, std::shared_ptr connection) : state_(SERVER_WAITING_FOR_BROCHURE), context_(std::move(context)), listener_(std::move(listener)), id_(std::move(id)), remoteName_(std::move(remoteName)), transport_(std::move(transport)), descriptorConnection_(std::move(connection)) { descriptorConnection_->setId(id_ + ".d.tr_" + transport_); } void PipeImpl::init() { context_->deferToLoop( [impl{this->shared_from_this()}]() { impl->initFromLoop(); }); } void PipeImpl::initFromLoop() { TP_DCHECK(context_->inLoop()); if (context_->closed()) { // Set the error without calling setError because we do not want to invoke // handleError as it would find itself in a weird state (since the rest of // initFromLoop wouldn't have been called). 
error_ = TP_CREATE_ERROR(PipeClosedError); TP_VLOG(1) << "Pipe " << id_ << " is closing (without initing)"; return; } context_->enroll(*this); if (state_ == CLIENT_ABOUT_TO_SEND_HELLO_AND_BROCHURE) { auto nopHolderOut = std::make_shared>(); Packet& nopPacketOut = nopHolderOut->getObject(); nopPacketOut.Become(nopPacketOut.index_of()); SpontaneousConnection& nopSpontaneousConnection = *nopPacketOut.get(); nopSpontaneousConnection.contextName = context_->getName(); TP_VLOG(3) << "Pipe " << id_ << " is writing nop object (spontaneous connection)"; descriptorConnection_->write( *nopHolderOut, callbackWrapper_([nopHolderOut](PipeImpl& impl) { TP_VLOG(3) << "Pipe " << impl.id_ << " done writing nop object (spontaneous connection)"; })); auto nopHolderOut2 = std::make_shared>(); Brochure& nopBrochure = nopHolderOut2->getObject(); for (const auto& transportContextIter : context_->getOrderedTransports()) { const std::string& transportName = std::get<0>(transportContextIter.second); const transport::Context& transportContext = *(std::get<1>(transportContextIter.second)); nopBrochure.transportDomainDescriptors[transportName] = transportContext.domainDescriptor(); } for (const auto& channelContextIter : context_->getOrderedChannels()) { const std::string& channelName = std::get<0>(channelContextIter.second); const channel::Context& channelContext = *(std::get<1>(channelContextIter.second)); nopBrochure.channelDeviceDescriptors[channelName] = channelContext.deviceDescriptors(); } TP_VLOG(3) << "Pipe " << id_ << " is writing nop object (brochure)"; descriptorConnection_->write( *nopHolderOut2, callbackWrapper_([nopHolderOut2](PipeImpl& impl) { TP_VLOG(3) << "Pipe " << impl.id_ << " done writing nop object (brochure)"; })); state_ = CLIENT_WAITING_FOR_BROCHURE_ANSWER; auto nopHolderIn = std::make_shared>(); TP_VLOG(3) << "Pipe " << id_ << " is reading nop object (brochure answer)"; descriptorConnection_->read( *nopHolderIn, callbackWrapper_([nopHolderIn](PipeImpl& impl) { 
TP_VLOG(3) << "Pipe " << impl.id_ << " done reading nop object (brochure answer)"; if (!impl.error_) { impl.onReadWhileClientWaitingForBrochureAnswer( nopHolderIn->getObject()); } })); } if (state_ == SERVER_WAITING_FOR_BROCHURE) { auto nopHolderIn = std::make_shared>(); TP_VLOG(3) << "Pipe " << id_ << " is reading nop object (brochure)"; descriptorConnection_->read( *nopHolderIn, callbackWrapper_([nopHolderIn](PipeImpl& impl) { TP_VLOG(3) << "Pipe " << impl.id_ << " done reading nop object (brochure)"; if (!impl.error_) { impl.onReadWhileServerWaitingForBrochure(nopHolderIn->getObject()); } })); } } const std::string& PipeImpl::getRemoteName() { return remoteName_; } void PipeImpl::close() { context_->deferToLoop( [impl{this->shared_from_this()}]() { impl->closeFromLoop(); }); } void PipeImpl::closeFromLoop() { TP_DCHECK(context_->inLoop()); TP_VLOG(1) << "Pipe " << id_ << " is closing"; setError(TP_CREATE_ERROR(PipeClosedError)); } // // Entry points for user code // void PipeImpl::readDescriptor(read_descriptor_callback_fn fn) { context_->deferToLoop( [impl{this->shared_from_this()}, fn{std::move(fn)}]() mutable { impl->readDescriptorFromLoop(std::move(fn)); }); } void PipeImpl::readDescriptorFromLoop(read_descriptor_callback_fn fn) { TP_DCHECK(context_->inLoop()); ReadOpIter opIter = readOps_.emplaceBack(nextMessageBeingRead_++); ReadOperation& op = *opIter; TP_VLOG(1) << "Pipe " << id_ << " received a readDescriptor request (#" << op.sequenceNumber << ")"; fn = [this, sequenceNumber{op.sequenceNumber}, fn{std::move(fn)}]( const Error& error, Descriptor descriptor) { TP_DCHECK_EQ(sequenceNumber, nextReadDescriptorCallbackToCall_++); TP_VLOG(1) << "Pipe " << id_ << " is calling a readDescriptor callback (#" << sequenceNumber << ")"; fn(error, std::move(descriptor)); TP_VLOG(1) << "Pipe " << id_ << " done calling a readDescriptor callback (#" << sequenceNumber << ")"; }; op.readDescriptorCallback = std::move(fn); readOps_.advanceOperation(opIter); } void 
// NOTE(review): throughout this file the extraction step appears to have
// stripped template argument lists (e.g. "std::shared_ptr>" below was
// presumably "std::shared_ptr<NopHolder<...>>" — TODO confirm against
// upstream). The mangled tokens are preserved verbatim here.

// Public read() entry point: defers the actual work onto the context's event
// loop, keeping this PipeImpl alive via shared_from_this() and moving the
// user's allocation and callback into the deferred closure.
PipeImpl::read(Allocation allocation, read_callback_fn fn) {
  context_->deferToLoop([impl{this->shared_from_this()},
                         allocation{std::move(allocation)},
                         fn{std::move(fn)}]() mutable {
    impl->readFromLoop(std::move(allocation), std::move(fn));
  });
}

// Loop-side implementation of read(): matches the user's allocation to the
// operation that is currently expecting one, wraps the callback for ordering
// checks, and advances that operation's state machine.
void PipeImpl::readFromLoop(Allocation allocation, read_callback_fn fn) {
  TP_DCHECK(context_->inLoop());

  // This is such a bad logical error on the user's side that it doesn't deserve
  // to pass through the channel for "expected errors" (i.e., the callback).
  // This check fails when there is no message for which we are expecting an
  // allocation.
  TP_THROW_ASSERT_IF(!nextMessageGettingAllocation_.has_value());
  ReadOpIter opIter = nextMessageGettingAllocation_.value();
  ReadOperation& op = *opIter;
  nextMessageGettingAllocation_.reset();

  checkAllocationCompatibility(op.descriptor, allocation);

  // Wrap the user callback so that the invocation order can be asserted
  // (callbacks must fire in sequence-number order) and logged.
  fn = [this, sequenceNumber{op.sequenceNumber}, fn{std::move(fn)}](
           const Error& error) {
    TP_DCHECK_EQ(sequenceNumber, nextReadCallbackToCall_++);
    TP_VLOG(1) << "Pipe " << id_ << " is calling a read callback (#"
               << sequenceNumber << ")";
    fn(error);
    TP_VLOG(1) << "Pipe " << id_ << " done calling a read callback (#"
               << sequenceNumber << ")";
  };

  op.allocation = std::move(allocation);
  op.readCallback = std::move(fn);
  op.doneGettingAllocation = true;

  TP_VLOG(1) << "Pipe " << id_ << " received a read request (#"
             << op.sequenceNumber << ", containing "
             << op.allocation.payloads.size() << " payloads and "
             << op.allocation.tensors.size() << " tensors)";

  readOps_.advanceOperation(opIter);
}

// Issues one asynchronous connection read per payload, straight into the
// user-provided buffers; the per-payload completion callbacks decrement
// numPayloadsBeingRead and re-advance the state machine.
void PipeImpl::readPayloadsOfMessage(ReadOpIter opIter) {
  TP_DCHECK(context_->inLoop());
  ReadOperation& op = *opIter;
  TP_VLOG(2) << "Pipe " << id_ << " is reading payloads of message #"
             << op.sequenceNumber;
  TP_DCHECK_EQ(connectionState_, AWAITING_PAYLOADS);
  TP_DCHECK_EQ(messageBeingReadFromConnection_, op.sequenceNumber);
  for (size_t payloadIdx = 0; payloadIdx < op.allocation.payloads.size();
       payloadIdx++) {
    Allocation::Payload& payload = op.allocation.payloads[payloadIdx];
    Descriptor::Payload& payloadDescriptor = op.descriptor.payloads[payloadIdx];
    TP_VLOG(3) << "Pipe " << id_ << " is reading payload #" << op.sequenceNumber
               << "." << payloadIdx;
    descriptorConnection_->read(
        payload.data,
        payloadDescriptor.length,
        callbackWrapper_(
            [opIter, payloadIdx](
                PipeImpl& impl, const void* /* unused */, size_t /* unused */) {
              TP_VLOG(3) << "Pipe " << impl.id_ << " done reading payload #"
                         << opIter->sequenceNumber << "." << payloadIdx;
              opIter->numPayloadsBeingRead--;
              impl.readOps_.advanceOperation(opIter);
            }));
    ++op.numPayloadsBeingRead;
  }
  // All payload reads for this message have been queued on the wire, so the
  // next thing to come off the connection is the next message's descriptor.
  connectionState_ = AWAITING_DESCRIPTOR;
  ++messageBeingReadFromConnection_;
}

// Kicks off one channel recv per tensor, choosing the channel from the
// (local device, remote source device) pair map negotiated at handshake.
void PipeImpl::receiveTensorsOfMessage(ReadOpIter opIter) {
  TP_DCHECK(context_->inLoop());
  ReadOperation& op = *opIter;
  TP_VLOG(2) << "Pipe " << id_ << " is receiving tensors of message #"
             << op.sequenceNumber;
  TP_DCHECK_EQ(op.descriptor.tensors.size(), op.allocation.tensors.size());
  for (size_t tensorIdx = 0; tensorIdx < op.descriptor.tensors.size();
       ++tensorIdx) {
    Allocation::Tensor& tensor = op.allocation.tensors[tensorIdx];
    const Descriptor::Tensor& tensorDescriptor =
        op.descriptor.tensors[tensorIdx];

    const Device& localDevice = tensor.buffer.device();
    const Device& remoteDevice = tensorDescriptor.sourceDevice;
    const auto& channelIter =
        channelForDevicePair_.find({localDevice, remoteDevice});
    TP_THROW_ASSERT_IF(channelIter == channelForDevicePair_.end())
        << "Could not find suitable channel for sending from local device "
        << localDevice.toString() << " to remote device "
        << remoteDevice.toString();

    const std::string& channelName = channelIter->second;
    channel::Channel& channel = *channels_.at(channelName);

    TP_VLOG(3) << "Pipe " << id_ << " is receiving tensor #"
               << op.sequenceNumber << "." << tensorIdx;
    channel.recv(
        tensor.buffer,
        tensorDescriptor.length,
        callbackWrapper_([opIter, tensorIdx](PipeImpl& impl) {
          TP_VLOG(3) << "Pipe " << impl.id_ << " done receiving tensor #"
                     << opIter->sequenceNumber << "." << tensorIdx;
          opIter->numTensorsBeingReceived--;
          impl.readOps_.advanceOperation(opIter);
        }));
    ++op.numTensorsBeingReceived;
  }
}

// Sends the descriptor reply (carrying the receiver-chosen target devices)
// back over the dedicated reply connection. Only invoked when the sender left
// some target devices unspecified.
void PipeImpl::writeDescriptorReplyOfMessage(ReadOpIter opIter) {
  TP_DCHECK(context_->inLoop());
  ReadOperation& op = *opIter;
  TP_DCHECK(op.hasMissingTargetDevices);
  std::shared_ptr> holder = makeDescriptorReplyForMessage(op);
  TP_VLOG(3) << "Pipe " << id_
             << " is writing nop object (message descriptor reply #"
             << op.sequenceNumber << ")";
  descriptorReplyConnection_->write(
      *holder,
      // The holder is captured to keep the nop object alive until the write
      // completes.
      callbackWrapper_(
          [sequenceNumber{op.sequenceNumber}, holder](PipeImpl& impl) {
            TP_VLOG(3) << "Pipe " << impl.id_
                       << " done writing nop object (message descriptor reply #"
                       << sequenceNumber << ")";
          }));
}

// Public write() entry point: defers onto the event loop, like read().
void PipeImpl::write(Message message, write_callback_fn fn) {
  context_->deferToLoop([impl{this->shared_from_this()},
                         message{std::move(message)},
                         fn{std::move(fn)}]() mutable {
    impl->writeFromLoop(std::move(message), std::move(fn));
  });
}

// Loop-side implementation of write(): creates a new write operation, records
// per-tensor source/target devices (noting whether any target device is
// missing and must be negotiated via a descriptor reply), and advances it.
void PipeImpl::writeFromLoop(Message message, write_callback_fn fn) {
  TP_DCHECK(context_->inLoop());
  WriteOpIter opIter = writeOps_.emplaceBack(nextMessageBeingWritten_++);
  WriteOperation& op = *opIter;
  // NOTE(review): "contaning" is a typo for "containing" in this log string;
  // left untouched here because it is runtime text.
  TP_VLOG(1) << "Pipe " << id_ << " received a write request (#"
             << op.sequenceNumber << ", contaning " << message.payloads.size()
             << " payloads and " << message.tensors.size() << " tensors)";

  // Wrap the user callback for invocation-order checks and logging, mirroring
  // readFromLoop above.
  fn = [this, sequenceNumber{op.sequenceNumber}, fn{std::move(fn)}](
           const Error& error) {
    TP_DCHECK_EQ(sequenceNumber, nextWriteCallbackToCall_++);
    TP_VLOG(1) << "Pipe " << id_ << " is calling a write callback (#"
               << sequenceNumber << ")";
    fn(error);
    TP_VLOG(1) << "Pipe " << id_ << " done calling a write callback (#"
               << sequenceNumber << ")";
  };

  size_t numTensors = message.tensors.size();
  op.tensors.resize(numTensors);
  for (size_t tensorIdx = 0; tensorIdx < numTensors; ++tensorIdx) {
    const Message::Tensor& tensor = message.tensors[tensorIdx];
    WriteOperation::Tensor& tensorBeingSent = op.tensors[tensorIdx];
    tensorBeingSent.sourceDevice = tensor.buffer.device();
    if (tensor.targetDevice.has_value()) {
      tensorBeingSent.targetDevice = *tensor.targetDevice;
    } else {
      // At least one tensor needs the receiver to pick its target device.
      op.hasMissingTargetDevices = true;
    }
  }

  op.message = std::move(message);
  op.writeCallback = std::move(fn);

  writeOps_.advanceOperation(opIter);
}

//
// Helpers to schedule our callbacks into user code
//

// Invokes (then releases) the user's readDescriptor callback for this op.
void PipeImpl::callReadDescriptorCallback(ReadOpIter opIter) {
  TP_DCHECK(context_->inLoop());
  ReadOperation& op = *opIter;
  op.readDescriptorCallback(error_, op.descriptor);
  // Reset callback to release the resources it was holding.
  op.readDescriptorCallback = nullptr;
}

// Invokes (then releases) the user's read callback for this op.
void PipeImpl::callReadCallback(ReadOpIter opIter) {
  TP_DCHECK(context_->inLoop());
  ReadOperation& op = *opIter;
  op.readCallback(error_);
  // Reset callback to release the resources it was holding.
  op.readCallback = nullptr;
}

// Invokes (then releases) the user's write callback for this op.
void PipeImpl::callWriteCallback(WriteOpIter opIter) {
  TP_DCHECK(context_->inLoop());
  WriteOperation& op = *opIter;
  op.writeCallback(error_);
  // Reset callback to release the resources it was holding.
  op.writeCallback = nullptr;
}

//
// Error handling
//

void PipeImpl::setError(Error error) {
  // Don't overwrite an error that's already set.
if (error_ || !error) {
    return;
  }

  error_ = std::move(error);

  handleError();
}

// Tears the pipe down after error_ has been set: closes connections and
// channels, unregisters pending connection requests on the listener, flushes
// both op state machines (so user callbacks fire with the error), and
// unenrolls from the context.
void PipeImpl::handleError() {
  TP_DCHECK(context_->inLoop());
  TP_VLOG(2) << "Pipe " << id_ << " is handling error " << error_.what();

  descriptorConnection_->close();
  if (descriptorReplyConnection_) {
    descriptorReplyConnection_->close();
  }
  for (auto& channelIter : channels_) {
    channelIter.second->close();
  }

  for (const auto& tokenIter : registrationIds_) {
    listener_->unregisterConnectionRequest(tokenIter.second);
  }
  registrationIds_.clear();
  for (const auto& iter : channelRegistrationIds_) {
    for (const auto& token : iter.second) {
      listener_->unregisterConnectionRequest(token);
    }
  }
  channelRegistrationIds_.clear();
  channelReceivedConnections_.clear();

  readOps_.advanceAllOperations();
  writeOps_.advanceAllOperations();

  context_->unenroll(*this);
}

//
// Everything else
//

// State machine driver for a single read operation: each attemptTransition is
// guarded by this op's progress flags and by the previous op's state
// (prevOpState), which enforces the ordering invariants spelled out in the
// comments below.
void PipeImpl::advanceReadOperation(
    ReadOpIter opIter,
    ReadOperation::State prevOpState) {
  TP_DCHECK(context_->inLoop());
  ReadOperation& op = *opIter;

  // Needs to go after previous op to ensure ordering of callback invocations.
  readOps_.attemptTransition(
      opIter,
      /*from=*/ReadOperation::UNINITIALIZED,
      /*to=*/ReadOperation::ASKING_FOR_ALLOCATION,
      /*cond=*/error_ && prevOpState >= ReadOperation::ASKING_FOR_ALLOCATION,
      /*actions=*/{&PipeImpl::callReadDescriptorCallback});

  // The ordering on the "wire" (the primary connection) is descriptor of op N,
  // then payloads of op N, then descriptor of op N+1. Hence this transition
  // must happen after the previous op scheduled its payload read, not just its
  // descriptor read.
  readOps_.attemptTransition(
      opIter,
      /*from=*/ReadOperation::UNINITIALIZED,
      /*to=*/ReadOperation::READING_DESCRIPTOR,
      /*cond=*/!error_ && state_ == ESTABLISHED &&
          prevOpState >= ReadOperation::READING_PAYLOADS_AND_RECEIVING_TENSORS,
      /*actions=*/{&PipeImpl::readDescriptorOfMessage});

  // Needs to go after previous op to ensure ordering of callback invocations.
  readOps_.attemptTransition(
      opIter,
      /*from=*/ReadOperation::READING_DESCRIPTOR,
      /*to=*/ReadOperation::ASKING_FOR_ALLOCATION,
      /*cond=*/op.doneReadingDescriptor &&
          prevOpState >= ReadOperation::ASKING_FOR_ALLOCATION,
      /*actions=*/{&PipeImpl::callReadDescriptorCallback});

  // Needs to wait for previous op to have _received_ the read call, as we can
  // only have exactly one operation at a time for which we expect a read call.
  readOps_.attemptTransition(
      opIter,
      /*from=*/ReadOperation::ASKING_FOR_ALLOCATION,
      /*to=*/ReadOperation::ASKING_FOR_ALLOCATION_FIRST_IN_LINE,
      /*cond=*/op.doneReadingDescriptor &&
          prevOpState >= ReadOperation::READING_PAYLOADS_AND_RECEIVING_TENSORS,
      /*actions=*/{&PipeImpl::expectReadCall});

  // Needs to go after previous op to ensure ordering of callback invocations.
  readOps_.attemptTransition(
      opIter,
      /*from=*/ReadOperation::ASKING_FOR_ALLOCATION_FIRST_IN_LINE,
      /*to=*/ReadOperation::FINISHED,
      /*cond=*/error_ && op.doneGettingAllocation &&
          prevOpState >= ReadOperation::FINISHED,
      /*actions=*/{&PipeImpl::callReadCallback});

  // No need to order this with the previous operation, since all it needs is
  // to come after this own op's descriptor read.
  // This transition shortcuts writing the descriptor reply when all target
  // devices were provided by the sender.
  readOps_.attemptTransition(
      opIter,
      /*from=*/ReadOperation::ASKING_FOR_ALLOCATION_FIRST_IN_LINE,
      /*to=*/ReadOperation::READING_PAYLOADS_AND_RECEIVING_TENSORS,
      /*cond=*/!error_ && op.doneGettingAllocation &&
          !op.hasMissingTargetDevices,
      /*actions=*/
      {&PipeImpl::readPayloadsOfMessage, &PipeImpl::receiveTensorsOfMessage});

  // No need to order this with the previous operation, since all it needs is
  // to come after this own op's descriptor read.
  readOps_.attemptTransition(
      opIter,
      /*from=*/ReadOperation::ASKING_FOR_ALLOCATION_FIRST_IN_LINE,
      /*to=*/ReadOperation::READING_PAYLOADS_AND_RECEIVING_TENSORS,
      /*cond=*/!error_ && op.doneGettingAllocation &&
          op.hasMissingTargetDevices,
      /*actions=*/
      {&PipeImpl::readPayloadsOfMessage,
       &PipeImpl::writeDescriptorReplyOfMessage,
       &PipeImpl::receiveTensorsOfMessage});

  // Needs to go after previous op to ensure ordering of callback invocations.
  readOps_.attemptTransition(
      opIter,
      /*from=*/ReadOperation::READING_PAYLOADS_AND_RECEIVING_TENSORS,
      /*to=*/ReadOperation::FINISHED,
      /*cond=*/op.numPayloadsBeingRead == 0 &&
          op.numTensorsBeingReceived == 0 &&
          prevOpState >= ReadOperation::FINISHED,
      /*actions=*/{&PipeImpl::callReadCallback});
}

// State machine driver for a single write operation; same pattern as
// advanceReadOperation above.
void PipeImpl::advanceWriteOperation(
    WriteOpIter opIter,
    WriteOperation::State prevOpState) {
  TP_DCHECK(context_->inLoop());
  WriteOperation& op = *opIter;

  // Needs to go after previous op to ensure ordering of callback invocations.
  writeOps_.attemptTransition(
      opIter,
      /*from=*/WriteOperation::UNINITIALIZED,
      /*to=*/WriteOperation::FINISHED,
      /*cond=*/error_ && prevOpState >= WriteOperation::FINISHED,
      /*actions=*/{&PipeImpl::callWriteCallback});

  // Needs to go after previous op to ensure predictable and consistent ordering
  // of write calls on the connection and send calls on the channels.
  // This transition shortcuts reading the target devices when they were all
  // provided by the user.
  writeOps_.attemptTransition(
      opIter,
      /*from=*/WriteOperation::UNINITIALIZED,
      /*to=*/WriteOperation::WRITING_PAYLOADS_AND_SENDING_TENSORS,
      /*cond=*/!error_ && state_ == ESTABLISHED &&
          !op.hasMissingTargetDevices &&
          prevOpState >= WriteOperation::WRITING_PAYLOADS_AND_SENDING_TENSORS,
      /*actions=*/
      {&PipeImpl::writeDescriptorOfMessage,
       &PipeImpl::writePayloadsOfMessage,
       &PipeImpl::sendTensorsOfMessage});

  // Needs to go after previous op to ensure predictable and consistent ordering
  // of write calls on the descriptor connection and read calls on the
  // descriptor reply connection.
  writeOps_.attemptTransition(
      opIter,
      /*from=*/WriteOperation::UNINITIALIZED,
      /*to=*/WriteOperation::WRITING_PAYLOADS_AND_READING_TARGET_DEVICES,
      /*cond=*/!error_ && state_ == ESTABLISHED && op.hasMissingTargetDevices &&
          prevOpState >=
              WriteOperation::WRITING_PAYLOADS_AND_READING_TARGET_DEVICES,
      /*actions=*/
      {&PipeImpl::writeDescriptorOfMessage,
       &PipeImpl::writePayloadsOfMessage,
       &PipeImpl::readDescriptorReplyOfMessage});

  // Needs to go after previous op to ensure ordering of callback invocations.
  writeOps_.attemptTransition(
      opIter,
      /*from=*/WriteOperation::WRITING_PAYLOADS_AND_READING_TARGET_DEVICES,
      /*to=*/WriteOperation::FINISHED,
      /*cond=*/error_ && op.numPayloadsBeingWritten == 0 &&
          op.doneReadingDescriptorReply &&
          prevOpState >= WriteOperation::FINISHED,
      /*actions=*/{&PipeImpl::callWriteCallback});

  // Needs to go after previous op to ensure predictable and consistent ordering
  // of send calls on channels.
  writeOps_.attemptTransition(
      opIter,
      /*from=*/WriteOperation::WRITING_PAYLOADS_AND_READING_TARGET_DEVICES,
      /*to=*/WriteOperation::WRITING_PAYLOADS_AND_SENDING_TENSORS,
      /*cond=*/!error_ && op.doneReadingDescriptorReply &&
          prevOpState >= WriteOperation::WRITING_PAYLOADS_AND_SENDING_TENSORS,
      /*actions=*/{&PipeImpl::sendTensorsOfMessage});

  // Needs to go after previous op to ensure ordering of callback invocations.
  writeOps_.attemptTransition(
      opIter,
      /*from=*/WriteOperation::WRITING_PAYLOADS_AND_SENDING_TENSORS,
      /*to=*/WriteOperation::FINISHED,
      /*cond=*/op.numPayloadsBeingWritten == 0 && op.numTensorsBeingSent == 0 &&
          prevOpState >= WriteOperation::FINISHED,
      /*actions=*/{&PipeImpl::callWriteCallback});
}

// Reads the next message descriptor (a nop-serialized object) off the primary
// connection; on completion records whether any tensor lacks a target device.
void PipeImpl::readDescriptorOfMessage(ReadOpIter opIter) {
  TP_DCHECK(context_->inLoop());
  ReadOperation& op = *opIter;

  TP_DCHECK_EQ(connectionState_, AWAITING_DESCRIPTOR);
  TP_DCHECK_EQ(messageBeingReadFromConnection_, op.sequenceNumber);
  // NOTE(review): template argument stripped by extraction; presumably
  // NopHolder<Descriptor> — TODO confirm against upstream.
  auto nopHolderIn = std::make_shared>();
  TP_VLOG(3) << "Pipe " << id_ << " is reading nop object (message descriptor #"
             << op.sequenceNumber << ")";
  descriptorConnection_->read(
      *nopHolderIn, callbackWrapper_([opIter, nopHolderIn](PipeImpl& impl) {
        TP_VLOG(3) << "Pipe " << impl.id_
                   << " done reading nop object (message descriptor #"
                   << opIter->sequenceNumber << ")";
        opIter->doneReadingDescriptor = true;
        if (!impl.error_) {
          opIter->descriptor = std::move(nopHolderIn->getObject());
          for (const auto& tensor : opIter->descriptor.tensors) {
            if (!tensor.targetDevice.has_value()) {
              opIter->hasMissingTargetDevices = true;
            }
          }
        }
        impl.readOps_.advanceOperation(opIter);
      }));
  connectionState_ = AWAITING_PAYLOADS;
}

// Marks this op as the (single) one for which the next user read() call is
// expected; consumed by readFromLoop.
void PipeImpl::expectReadCall(ReadOpIter opIter) {
  TP_DCHECK(context_->inLoop());
  // NOTE(review): `op` is unused in this method; kept as-is (byte-identical
  // doc pass) but could be removed.
  ReadOperation& op = *opIter;
  TP_DCHECK(!nextMessageGettingAllocation_.has_value());
  nextMessageGettingAllocation_ = opIter;
}

// Kicks off one channel send per tensor, choosing the channel from the
// (local source device, remote target device) pair map.
void PipeImpl::sendTensorsOfMessage(WriteOpIter opIter) {
  TP_DCHECK(context_->inLoop());
  WriteOperation& op = *opIter;

  TP_VLOG(2) << "Pipe " << id_ << " is sending tensors of message #"
             << op.sequenceNumber;
  TP_DCHECK_EQ(op.message.tensors.size(), op.tensors.size());
  for (size_t tensorIdx = 0; tensorIdx < op.message.tensors.size();
       ++tensorIdx) {
    const auto& tensor = op.message.tensors[tensorIdx];
    const Device& localDevice = op.tensors[tensorIdx].sourceDevice;
    // By this point target devices are known, either from the user or from
    // the descriptor reply.
    TP_DCHECK(op.tensors[tensorIdx].targetDevice.has_value());
    const Device&
remoteDevice = *op.tensors[tensorIdx].targetDevice;
    const auto& channelIter =
        channelForDevicePair_.find({localDevice, remoteDevice});
    TP_THROW_ASSERT_IF(channelIter == channelForDevicePair_.end())
        << "Could not find suitable channel for sending from local device "
        << localDevice.toString() << " to remote device "
        << remoteDevice.toString();

    const std::string& channelName = channelIter->second;
    channel::Channel& channel = *channels_[channelName];
    TP_VLOG(3) << "Pipe " << id_ << " is sending tensor #" << op.sequenceNumber
               << "." << tensorIdx;
    channel.send(
        tensor.buffer,
        tensor.length,
        callbackWrapper_([opIter, tensorIdx](PipeImpl& impl) {
          TP_VLOG(3) << "Pipe " << impl.id_ << " done sending tensor #"
                     << opIter->sequenceNumber << "." << tensorIdx;
          opIter->numTensorsBeingSent--;
          impl.writeOps_.advanceOperation(opIter);
        }));
    ++op.numTensorsBeingSent;
  }
}

// Serializes and writes this message's descriptor on the primary connection.
// The shared holder keeps the nop object alive until the write completes.
void PipeImpl::writeDescriptorOfMessage(WriteOpIter opIter) {
  TP_DCHECK(context_->inLoop());
  WriteOperation& op = *opIter;

  // NOTE(review): template argument stripped by extraction; presumably
  // std::shared_ptr<NopHolder<...>> — TODO confirm against upstream.
  std::shared_ptr> holder = makeDescriptorForMessage(op);
  TP_VLOG(3) << "Pipe " << id_
             << " is writing nop object (message descriptor #"
             << op.sequenceNumber << ")";
  descriptorConnection_->write(
      *holder,
      callbackWrapper_(
          [sequenceNumber{op.sequenceNumber}, holder](PipeImpl& impl) {
            TP_VLOG(3) << "Pipe " << impl.id_
                       << " done writing nop object (message descriptor #"
                       << sequenceNumber << ")";
          }));
}

// Issues one asynchronous connection write per payload; completion callbacks
// decrement numPayloadsBeingWritten and re-advance the state machine.
void PipeImpl::writePayloadsOfMessage(WriteOpIter opIter) {
  TP_DCHECK(context_->inLoop());
  WriteOperation& op = *opIter;

  TP_VLOG(2) << "Pipe " << id_ << " is writing payloads of message #"
             << op.sequenceNumber;
  for (size_t payloadIdx = 0; payloadIdx < op.message.payloads.size();
       payloadIdx++) {
    Message::Payload& payload = op.message.payloads[payloadIdx];
    TP_VLOG(3) << "Pipe " << id_ << " is writing payload #" << op.sequenceNumber
               << "." << payloadIdx;
    descriptorConnection_->write(
        payload.data,
        payload.length,
        callbackWrapper_([opIter, payloadIdx](PipeImpl& impl) {
          TP_VLOG(3) << "Pipe " << impl.id_ << " done writing payload #"
                     << opIter->sequenceNumber << "." << payloadIdx;
          opIter->numPayloadsBeingWritten--;
          impl.writeOps_.advanceOperation(opIter);
        }));
    ++op.numPayloadsBeingWritten;
  }
}

// Reads the receiver's descriptor reply (carrying the target devices it chose)
// from the reply connection; only used when some target devices were missing.
void PipeImpl::readDescriptorReplyOfMessage(WriteOpIter opIter) {
  TP_DCHECK(context_->inLoop());
  WriteOperation& op = *opIter;
  TP_DCHECK(op.hasMissingTargetDevices);

  // NOTE(review): template argument stripped by extraction here too.
  auto nopHolderIn = std::make_shared>();
  TP_VLOG(3) << "Pipe " << id_
             << " is reading nop object (message descriptor reply #"
             << op.sequenceNumber << ")";
  descriptorReplyConnection_->read(
      *nopHolderIn, callbackWrapper_([opIter, nopHolderIn](PipeImpl& impl) {
        TP_VLOG(3) << "Pipe " << impl.id_
                   << " done reading nop object (message descriptor reply #"
                   << opIter->sequenceNumber << ")";
        opIter->doneReadingDescriptorReply = true;
        if (!impl.error_) {
          parseDescriptorReplyOfMessage(
              *opIter, std::move(nopHolderIn->getObject()));
        }
        impl.writeOps_.advanceOperation(opIter);
      }));
}

// Server-side handshake step: picks the transport and channels based on the
// client's brochure, registers any replacement/extra connections with the
// listener, and writes back the brochure answer.
void PipeImpl::onReadWhileServerWaitingForBrochure(
    const Brochure& nopBrochure) {
  TP_DCHECK(context_->inLoop());
  TP_DCHECK_EQ(state_, SERVER_WAITING_FOR_BROCHURE);
  auto nopHolderOut = std::make_shared>();
  BrochureAnswer& nopBrochureAnswer = nopHolderOut->getObject();

  auto transport = selectTransport(
      context_->getOrderedTransports(),
      nopBrochure.transportDomainDescriptors,
      listener_->addresses());

  // If the chosen transport differs from the one the initial connection uses,
  // ask the client to open a replacement descriptor connection.
  if (transport.name != transport_) {
    transport_ = transport.name;
    nopBrochureAnswer.transportRegistrationIds[ConnectionId::DESCRIPTOR] =
        registerTransport(ConnectionId::DESCRIPTOR);
  }
  nopBrochureAnswer.transportRegistrationIds[ConnectionId::DESCRIPTOR_REPLY] =
      registerTransport(ConnectionId::DESCRIPTOR_REPLY);

  nopBrochureAnswer.transport = transport.name;
  nopBrochureAnswer.address = transport.address;
  nopBrochureAnswer.transportDomainDescriptor = transport.domainDescriptor;

  SelectedChannels selectedChannels = selectChannels(
      context_->getOrderedChannels(), nopBrochure.channelDeviceDescriptors);
  channelForDevicePair_ = std::move(selectedChannels.channelForDevicePair);
  nopBrochureAnswer.channelForDevicePair = channelForDevicePair_;

  for (auto& descriptorsIter : selectedChannels.descriptorsMap) {
    const std::string& channelName = descriptorsIter.first;
    nopBrochureAnswer.channelRegistrationIds[channelName] =
        registerChannel(channelName);
    // NOTE(review): template arguments stripped by extraction (presumably
    // std::unordered_map<Device, std::string> — TODO confirm).
    std::unordered_map& deviceDescriptors = descriptorsIter.second;
    nopBrochureAnswer.channelDeviceDescriptors[channelName] =
        std::move(deviceDescriptors);
  }

  TP_VLOG(3) << "Pipe " << id_ << " is writing nop object (brochure answer)";
  descriptorConnection_->write(
      *nopHolderOut, callbackWrapper_([nopHolderOut](PipeImpl& impl) {
        TP_VLOG(3) << "Pipe " << impl.id_
                   << " done writing nop object (brochure answer)";
      }));

  // If no replacement connections or channel connections are pending, the
  // handshake is complete and queued ops can proceed.
  if (!pendingRegistrations()) {
    state_ = ESTABLISHED;
    readOps_.advanceAllOperations();
    writeOps_.advanceAllOperations();
  } else {
    state_ = SERVER_WAITING_FOR_CONNECTIONS;
  }
}

// Registers with the listener a request for one incoming replacement
// connection for the given role; returns the registration token.
uint64_t PipeImpl::registerTransport(ConnectionId connId) {
  TP_DCHECK(registrationIds_.count(connId) == 0);
  TP_VLOG(3) << "Pipe " << id_ << " is requesting connection (as replacement)";
  uint64_t token = listener_->registerConnectionRequest(
      callbackWrapper_([connId](
                           PipeImpl& impl,
                           std::string transport,
                           std::shared_ptr connection) {
        TP_VLOG(3) << "Pipe " << impl.id_
                   << " done requesting connection (as replacement)";
        if (!impl.error_) {
          impl.onAcceptWhileServerWaitingForConnection(
              connId, std::move(transport), std::move(connection));
        }
      }));
  registrationIds_[connId] = token;
  return token;
}

// Registers with the listener one connection request per connection the named
// channel needs; returns the list of registration tokens (sent to the client
// in the brochure answer).
std::vector& PipeImpl::registerChannel(
    const std::string& channelName) {
  const channel::Context& channelContext = *context_->getChannel(channelName);
  const size_t numConnectionsNeeded = channelContext.numConnectionsNeeded();
  auto& channelRegistrationIds = channelRegistrationIds_[channelName];
  channelRegistrationIds.resize(numConnectionsNeeded);
  auto& channelReceivedConnections = channelReceivedConnections_[channelName];
  channelReceivedConnections.resize(numConnectionsNeeded);
  for (size_t connId = 0; connId < numConnectionsNeeded; ++connId) {
    TP_VLOG(3) << "Pipe " << id_ << " is requesting connection " << connId
               << "/" << numConnectionsNeeded << " (for channel " << channelName
               << ")";
    uint64_t token = listener_->registerConnectionRequest(callbackWrapper_(
        [channelName, connId, numConnectionsNeeded](
            PipeImpl& impl,
            std::string transport,
            std::shared_ptr connection) {
          TP_VLOG(3) << "Pipe " << impl.id_ << " done requesting connection "
                     << connId << "/" << numConnectionsNeeded
                     << " (for channel " << channelName << ")";
          if (!impl.error_) {
            impl.onAcceptWhileServerWaitingForChannel(
                channelName,
                connId,
                std::move(transport),
                std::move(connection));
          }
        }));
    channelRegistrationIds[connId] = token;
  }

  return channelRegistrationIds;
}

// Client-side handshake step: validates the server's choices, opens the
// replacement/extra connections it requested, recomputes and cross-checks the
// channel map, and creates the channels.
void PipeImpl::onReadWhileClientWaitingForBrochureAnswer(
    const BrochureAnswer& nopBrochureAnswer) {
  TP_DCHECK(context_->inLoop());
  TP_DCHECK_EQ(state_, CLIENT_WAITING_FOR_BROCHURE_ANSWER);

  const std::string& transport = nopBrochureAnswer.transport;
  std::string address = nopBrochureAnswer.address;
  std::shared_ptr transportContext = context_->getTransport(transport);
  TP_DCHECK(transportContext->canCommunicateWithRemote(
      nopBrochureAnswer.transportDomainDescriptor))
      << "The two endpoints disagree on whether transport " << transport
      << " can be used to communicate";

  if (transport != transport_) {
    TP_VLOG(3) << "Pipe " << id_
               << " is opening connection (descriptor, as replacement)";
    std::shared_ptr connection = transportContext->connect(address);
    connection->setId(id_ + ".d.tr_" + transport);
    const auto& transportRegistrationIter =
        nopBrochureAnswer.transportRegistrationIds.find(
            ConnectionId::DESCRIPTOR);
    TP_DCHECK(
        transportRegistrationIter !=
        nopBrochureAnswer.transportRegistrationIds.end());
    initConnection(*connection, transportRegistrationIter->second);
    transport_ = transport;
descriptorConnection_ = std::move(connection);
  }

  {
    TP_VLOG(3) << "Pipe " << id_ << " is opening connection (descriptor_reply)";
    std::shared_ptr connection = transportContext->connect(address);
    connection->setId(id_ + ".r.tr_" + transport);
    const auto& transportRegistrationIter =
        nopBrochureAnswer.transportRegistrationIds.find(
            ConnectionId::DESCRIPTOR_REPLY);
    TP_DCHECK(
        transportRegistrationIter !=
        nopBrochureAnswer.transportRegistrationIds.end());
    initConnection(*connection, transportRegistrationIter->second);
    descriptorReplyConnection_ = std::move(connection);
  }

  // Recompute the channel map based on this side's channels and priorities.
  SelectedChannels selectedChannels = selectChannels(
      context_->getOrderedChannels(),
      nopBrochureAnswer.channelDeviceDescriptors);
  channelForDevicePair_ = std::move(selectedChannels.channelForDevicePair);

  // Verify that the locally and remotely computed channel maps are consistent.
  // Note the (remote, local) key flip when looking up the server's map.
  TP_THROW_ASSERT_IF(
      nopBrochureAnswer.channelForDevicePair.size() !=
      channelForDevicePair_.size())
      << "Inconsistent channel selection";
  for (const auto& iter : channelForDevicePair_) {
    Device localDevice;
    Device remoteDevice;
    std::tie(localDevice, remoteDevice) = iter.first;
    const std::string& channelName = iter.second;
    const auto& answerIter = nopBrochureAnswer.channelForDevicePair.find(
        {remoteDevice, localDevice});
    TP_THROW_ASSERT_IF(
        answerIter == nopBrochureAnswer.channelForDevicePair.end())
        << "Inconsistent channel selection";
    TP_THROW_ASSERT_IF(answerIter->second != channelName)
        << "Inconsistent channel selection";
  }

  // Open the connections each selected channel needs and create the channels
  // as the connecting endpoint.
  for (const auto& channelDeviceDescriptorsIter :
       selectedChannels.descriptorsMap) {
    const std::string& channelName = channelDeviceDescriptorsIter.first;
    std::shared_ptr channelContext = context_->getChannel(channelName);

    const std::vector& registrationIds =
        nopBrochureAnswer.channelRegistrationIds.at(channelName);
    const size_t numConnectionsNeeded = channelContext->numConnectionsNeeded();
    TP_DCHECK_EQ(numConnectionsNeeded, registrationIds.size());
    // NOTE(review): template arguments stripped by extraction in the two
    // declarations below — TODO restore from upstream.
    std::vector> connections(numConnectionsNeeded);
    for (size_t connId = 0; connId < numConnectionsNeeded; ++connId) {
      TP_VLOG(3) << "Pipe " << id_ << " is opening connection " << connId
                 << "/" << numConnectionsNeeded << " (for channel "
                 << channelName << ")";
      std::shared_ptr connection = transportContext->connect(address);
      connection->setId(
          id_ + ".ch_" + channelName + "_" + std::to_string(connId));
      initConnection(*connection, registrationIds[connId]);
      connections[connId] = std::move(connection);
    }

    std::shared_ptr channel = channelContext->createChannel(
        std::move(connections), channel::Endpoint::kConnect);
    channel->setId(id_ + ".ch_" + channelName);
    channels_.emplace(channelName, std::move(channel));
  }

  state_ = ESTABLISHED;
  readOps_.advanceAllOperations();
  writeOps_.advanceAllOperations();
}

// Writes a RequestedConnection packet carrying the registration token on a
// freshly opened connection, so the server can route it to the right pipe.
void PipeImpl::initConnection(
    transport::Connection& connection,
    uint64_t token) {
  auto nopHolderOut = std::make_shared>();
  Packet& nopPacketOut = nopHolderOut->getObject();
  nopPacketOut.Become(nopPacketOut.index_of());
  RequestedConnection& nopRequestedConnection = *nopPacketOut.get();
  nopRequestedConnection.registrationId = token;
  TP_VLOG(3) << "Pipe " << id_
             << " is writing nop object (requested connection)";
  connection.write(
      *nopHolderOut, callbackWrapper_([nopHolderOut](PipeImpl& impl) {
        TP_VLOG(3) << "Pipe " << impl.id_
                   << " done writing nop object (requested connection)";
      }));
}

// Server side: an expected replacement connection has arrived; slot it into
// the matching role and, if nothing else is pending, finish the handshake.
void PipeImpl::onAcceptWhileServerWaitingForConnection(
    ConnectionId connId,
    std::string receivedTransport,
    std::shared_ptr receivedConnection) {
  TP_DCHECK(context_->inLoop());
  TP_DCHECK_EQ(state_, SERVER_WAITING_FOR_CONNECTIONS);
  const auto& registrationIdIter = registrationIds_.find(connId);
  TP_DCHECK(registrationIdIter != registrationIds_.end());
  size_t token = registrationIdIter->second;
  listener_->unregisterConnectionRequest(token);
  registrationIds_.erase(registrationIdIter);
  TP_DCHECK_EQ(transport_, receivedTransport);
  switch (connId) {
    case ConnectionId::DESCRIPTOR:
      receivedConnection->setId(id_ + ".d.tr_" + receivedTransport);
      descriptorConnection_ = std::move(receivedConnection);
      break;
    case ConnectionId::DESCRIPTOR_REPLY:
      receivedConnection->setId(id_ + ".r.tr_" + receivedTransport);
      descriptorReplyConnection_ = std::move(receivedConnection);
      break;
    default:
      TP_THROW_ASSERT() << "Unrecognized connection identifier";
  }
  if (!pendingRegistrations()) {
    state_ = ESTABLISHED;
    readOps_.advanceAllOperations();
    writeOps_.advanceAllOperations();
  }
}

// Server side: one of a channel's expected connections has arrived; once all
// of them are in, create the channel as the listening endpoint.
void PipeImpl::onAcceptWhileServerWaitingForChannel(
    std::string channelName,
    size_t connId,
    std::string receivedTransport,
    std::shared_ptr receivedConnection) {
  TP_DCHECK(context_->inLoop());
  TP_DCHECK_EQ(state_, SERVER_WAITING_FOR_CONNECTIONS);
  TP_DCHECK_EQ(transport_, receivedTransport);
  auto channelRegistrationIdsIter = channelRegistrationIds_.find(channelName);
  TP_DCHECK(channelRegistrationIdsIter != channelRegistrationIds_.end());
  listener_->unregisterConnectionRequest(
      channelRegistrationIdsIter->second[connId]);
  receivedConnection->setId(
      id_ + ".ch_" + channelName + "_" + std::to_string(connId));
  channelReceivedConnections_[channelName][connId] =
      std::move(receivedConnection);
  // TODO: If we can guarantee the order in which the accept() calls happen,
  // this check can be replaced with `if (connId == numConnectionsNeeded -
  // 1)`.
  for (const auto& conn : channelReceivedConnections_[channelName]) {
    if (conn == nullptr) {
      return;
    }
  }

  std::shared_ptr channelContext = context_->getChannel(channelName);

  std::shared_ptr channel = channelContext->createChannel(
      std::move(channelReceivedConnections_[channelName]),
      channel::Endpoint::kListen);
  channel->setId(id_ + ".ch_" + channelName);
  channelRegistrationIds_.erase(channelRegistrationIdsIter);
  channelReceivedConnections_.erase(channelName);

  TP_DCHECK(channels_.find(channelName) == channels_.end());
  channels_.emplace(channelName, std::move(channel));

  if (!pendingRegistrations()) {
    state_ = ESTABLISHED;
    readOps_.advanceAllOperations();
    writeOps_.advanceAllOperations();
  }
}

// True while the server is still waiting for replacement-transport or channel
// connections registered with the listener.
bool PipeImpl::pendingRegistrations() {
  if (!registrationIds_.empty()) {
    return true;
  }

  if (!channelRegistrationIds_.empty()) {
    return true;
  }

  return false;
}

} // namespace tensorpipe

================================================
FILE: tensorpipe/core/pipe_impl.h
================================================

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

// NOTE(review): the extraction step stripped the targets of all #include
// directives below (and template argument lists further down) — TODO restore
// from upstream before compiling.
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

namespace tensorpipe {

class ContextImpl;
class ListenerImpl;

// Bookkeeping for one in-flight read: its state-machine position, progress
// counters, user callbacks, the received descriptor and the user allocation.
struct ReadOperation {
  enum State {
    UNINITIALIZED,
    READING_DESCRIPTOR,
    ASKING_FOR_ALLOCATION,
    ASKING_FOR_ALLOCATION_FIRST_IN_LINE,
    READING_PAYLOADS_AND_RECEIVING_TENSORS,
    FINISHED
  };

  // Fields used by the state machine
  uint64_t sequenceNumber{0};
  State state{UNINITIALIZED};

  // Progress flags
  bool doneReadingDescriptor{false};
  bool doneGettingAllocation{false};
  uint64_t numPayloadsBeingRead{0};
  uint64_t numTensorsBeingReceived{0};

  // Callbacks.
Pipe::read_descriptor_callback_fn readDescriptorCallback;
  Pipe::read_callback_fn readCallback;

  // Arguments at creation
  bool hasMissingTargetDevices{false};
  Descriptor descriptor;

  // Buffers allocated by the user.
  Allocation allocation;
};

// Bookkeeping for one in-flight write: state-machine position, progress
// counters, user callback, the message and per-tensor device routing info.
struct WriteOperation {
  enum State {
    UNINITIALIZED,
    WRITING_PAYLOADS_AND_READING_TARGET_DEVICES,
    WRITING_PAYLOADS_AND_SENDING_TENSORS,
    FINISHED
  };

  // Fields used by the state machine
  uint64_t sequenceNumber{0};
  State state{UNINITIALIZED};

  // Progress flags
  bool doneReadingDescriptorReply{false};
  uint64_t numPayloadsBeingWritten{0};
  uint64_t numTensorsBeingSent{0};

  // Callbacks.
  Pipe::write_callback_fn writeCallback;

  // Arguments at creation
  bool hasMissingTargetDevices{false};
  Message message;

  struct Tensor {
    Device sourceDevice;
    // Empty until provided by the user or filled in from the receiver's
    // descriptor reply.
    optional targetDevice;
  };
  std::vector tensors;
};

// NOTE(review): template argument lists are stripped throughout this class
// declaration (extraction artifact) — restore from upstream before compiling.
class PipeImpl final : public std::enable_shared_from_this {
 public:
  PipeImpl(
      std::shared_ptr context,
      std::string id,
      std::string remoteName,
      const std::string& url);

  PipeImpl(
      std::shared_ptr context,
      std::shared_ptr listener,
      std::string id,
      std::string remoteName,
      std::string transport,
      std::shared_ptr connection);

  // Called by the pipe's constructor.
  void init();

  using read_descriptor_callback_fn = Pipe::read_descriptor_callback_fn;
  using read_callback_fn = Pipe::read_callback_fn;
  using write_callback_fn = Pipe::write_callback_fn;

  void readDescriptor(read_descriptor_callback_fn fn);
  void read(Allocation allocation, read_callback_fn fn);
  void write(Message message, write_callback_fn fn);

  const std::string& getRemoteName();

  void close();

 private:
  // Loop-side counterparts of the public entry points.
  void initFromLoop();
  void readDescriptorFromLoop(read_descriptor_callback_fn fn);
  void readFromLoop(Allocation allocation, read_callback_fn fn);
  void writeFromLoop(Message message, write_callback_fn fn);
  void closeFromLoop();

  // Handshake/lifecycle state of the whole pipe.
  enum State {
    INITIALIZING,
    CLIENT_ABOUT_TO_SEND_HELLO_AND_BROCHURE,
    SERVER_WAITING_FOR_BROCHURE,
    CLIENT_WAITING_FOR_BROCHURE_ANSWER,
    SERVER_WAITING_FOR_CONNECTIONS,
    ESTABLISHED
  };

  State state_{INITIALIZING};
  std::shared_ptr context_;
  std::shared_ptr listener_;

  // An identifier for the pipe, composed of the identifier for the context or
  // listener, combined with an increasing sequence number. It will only be used
  // for logging and debugging purposes.
  std::string id_;

  // The name the user has given to the connect method of the local context (for
  // outgoing pipes) or to the constructor of the context on the remote end (for
  // incoming pipes).
  std::string remoteName_;

  std::string transport_;
  enum ConnectionId { DESCRIPTOR, DESCRIPTOR_REPLY };
  std::shared_ptr descriptorConnection_;
  std::shared_ptr descriptorReplyConnection_;
  std::unordered_map> channels_;
  // Maps a (local device, remote device) pair to the name of the channel that
  // will carry tensors between those devices.
  std::unordered_map, std::string> channelForDevicePair_;

  // The server will set this up when it tells the client to switch to a
  // different connection or to open some channels.
  std::unordered_map registrationIds_;
  std::unordered_map> channelRegistrationIds_;
  std::unordered_map<
      std::string,
      std::vector>>
      channelReceivedConnections_;

  OpsStateMachine readOps_{
      *this, &PipeImpl::advanceReadOperation};
  using ReadOpIter = decltype(readOps_)::Iter;
  OpsStateMachine writeOps_{
      *this, &PipeImpl::advanceWriteOperation};
  using WriteOpIter = decltype(writeOps_)::Iter;

  // A sequence number for the calls to read and write.
  uint64_t nextMessageBeingRead_{0};
  uint64_t nextMessageBeingWritten_{0};

  // A sequence number for the invocations of the callbacks of read and write.
  uint64_t nextReadDescriptorCallbackToCall_{0};
  uint64_t nextReadCallbackToCall_{0};
  uint64_t nextWriteCallbackToCall_{0};

  // When reading, we first read the descriptor, then signal this to the user,
  // and only once the user has allocated the memory we read the payloads. These
  // members store where we are in this loop, i.e., whether the next buffer we
  // will read from the connection will be a descriptor or a payload, and the
  // sequence number of which message that will be for.
  enum ConnectionState { AWAITING_DESCRIPTOR, AWAITING_PAYLOADS };
  ConnectionState connectionState_{AWAITING_DESCRIPTOR};
  uint64_t messageBeingReadFromConnection_{0};

  // When reading, each message will be presented to the user in order for some
  // memory to be allocated for its payloads and tensors (this happens by
  // calling the readDescriptor callback and waiting for a read call). Under
  // normal operation there will be either 0 or 1 messages whose allocation is
  // pending, but there could be more after an error occurs, as we'll flush all
  // callbacks. We need to remember which is the first such operation for which
  // we're waiting for allocation in order to match calls to read to the right
  // message and for sanity checks. We do so by using a special state in the
  // state machine to identify the next operation that will receive a read call,
  // and store its iterator in this field.
  optional nextMessageGettingAllocation_;

  Error error_{Error::kSuccess};

  //
  // Helpers to prepare callbacks from transports and listener
  //

  CallbackWrapper callbackWrapper_{*this, *this->context_};

  //
  // Error handling
  //

  void setError(Error error);

  void handleError();

  //
  // State machines
  //

  // Transitions for the pipe's initial handshake.
  // On the client side:
  void onReadWhileClientWaitingForBrochureAnswer(
      const BrochureAnswer& nopBrochureAnswer);
  // On the server side:
  void onReadWhileServerWaitingForBrochure(const Brochure& nopBrochure);
  void onAcceptWhileServerWaitingForConnection(
      ConnectionId connId,
      std::string receivedTransport,
      std::shared_ptr receivedConnection);
  void onAcceptWhileServerWaitingForChannel(
      std::string channelName,
      size_t connId,
      std::string receivedTransport,
      std::shared_ptr receivedConnection);

  // State machines for read and write ops.
  void advanceReadOperation(
      ReadOpIter opIter,
      ReadOperation::State prevOpState);
  void advanceWriteOperation(
      WriteOpIter opIter,
      WriteOperation::State prevOpState);

  // Actions (i.e., methods that begin a state transition).
  // For read operations:
  void readDescriptorOfMessage(ReadOpIter opIter);
  void callReadDescriptorCallback(ReadOpIter opIter);
  void expectReadCall(ReadOpIter opIter);
  void readPayloadsOfMessage(ReadOpIter opIter);
  void receiveTensorsOfMessage(ReadOpIter opIter);
  void writeDescriptorReplyOfMessage(ReadOpIter opIter);
  void callReadCallback(ReadOpIter opIter);
  // For write operations:
  void writeDescriptorOfMessage(WriteOpIter opIter);
  void writePayloadsOfMessage(WriteOpIter opIter);
  void readDescriptorReplyOfMessage(WriteOpIter opIter);
  void sendTensorsOfMessage(WriteOpIter opIter);
  void callWriteCallback(WriteOpIter opIter);

  //
  // Everything else
  //

  void initConnection(transport::Connection& connection, uint64_t token);
  uint64_t registerTransport(ConnectionId connId);
  std::vector& registerChannel(const std::string& channelName);
  bool pendingRegistrations();

  template
  friend class CallbackWrapper;

  // Contexts and listeners do sometimes need to call directly into initFromLoop
  // and closeFromLoop, in order to make sure that some of their operations can
  // happen "atomically" on the connection, without possibly other operations
  // occurring in between (e.g., an error).
  friend ContextImpl;
  friend ListenerImpl;
};

} // namespace tensorpipe

================================================
FILE: tensorpipe/misc/CMakeLists.txt
================================================

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
# Standalone developer tool; requires an LLVM/Clang installation that
# provides the Clang CMake package config (find_package(Clang)).
add_executable(dump_state_machine dump_state_machine.cc)
find_package(Clang REQUIRED)
target_include_directories(dump_state_machine PRIVATE ${CLANG_INCLUDE_DIRS})
# LibTooling + AST-matcher libraries used by dump_state_machine.cc.
target_link_libraries(dump_state_machine PRIVATE clangTooling clangBasic clangASTMatchers)
================================================
FILE: tensorpipe/misc/dump_state_machine.cc
================================================
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

// NOTE(review): the header names of these includes were lost in extraction
// (the text between the include brackets was stripped); they cannot be
// reconstructed from this view alone.
#include
#include
#include
#include
#include
#include
#include
#include
#include

using namespace clang::ast_matchers;
using namespace clang::tooling;
using namespace llvm;

namespace {

// Pretty-prints a Clang AST expression back into source-like text.
std::string exprToString(const clang::Expr& e) {
  std::string statement;
  raw_string_ostream stream(statement);
  e.printPretty(stream, nullptr, clang::PrintingPolicy(clang::LangOptions()));
  stream.flush();
  return statement;
}

// Strips noise from a pretty-printed expression: leading
// "struct Foo::"/"class Foo::" qualifiers and explicit "this->".
std::string cleanUp(const std::string& s) {
  std::string res = s;
  res = std::regex_replace(res, std::regex("(struct|class) [a-zA-Z_]+::"), "");
  res = std::regex_replace(res, std::regex("this->"), "");
  return res;
}

// Backslash-escapes the characters ({ } < > |) that are special inside
// Graphviz record-shaped node labels.
std::string escape(const std::string& s) {
  std::string res = s;
  res = std::regex_replace(res, std::regex("\\{"), "\\{");
  res = std::regex_replace(res, std::regex("\\}"), "\\}");
  res = std::regex_replace(res, std::regex(">"), "\\>");
  res = std::regex_replace(res, std::regex("<"), "\\<");
  res = std::regex_replace(res, std::regex("\\|"), "\\|");
  return res;
}

// Match callback: for every matched call expression it prints Graphviz
// nodes and edges (to stdout) describing one state-machine transition.
class MethodPrinter : public MatchFinder::MatchCallback {
  // Labels of the state nodes emitted so far, so each node is printed once.
  // NOTE(review): the element type of this set was stripped by extraction.
  std::unordered_set nodes_;

  // Emits a Graphviz node for a state label and records it as seen.
  void addNode(const std::string& label) {
    std::cout << label << " [label=<" << label
              << ">,group=states,fontstyle=\"bold\"];" << std::endl;
    nodes_.insert(label);
  }

 public:
  void run(const MatchFinder::MatchResult& result) override {
    // Monotonic counter giving each transition edge a unique id.
    static int edgeCount = 0;
    // NOTE(review): the template argument of getNodeAs was stripped by
    // extraction (presumably the matched call-expression node type).
    const clang::CallExpr& e = *result.Nodes.getNodeAs("x");
std::string edgeId = "edge" + std::to_string(edgeCount++); std::string fromId = cleanUp(exprToString(*e.getArg(1))); std::string toId = cleanUp(exprToString(*e.getArg(2))); if (nodes_.count(fromId) == 0) { addNode(fromId); } if (nodes_.count(toId) == 0) { addNode(toId); } std::string edgeColor = "orange3"; int edgeWeight = 100; std::string cond = cleanUp(exprToString(*e.getArg(3))); if (std::regex_search(cond, std::regex("^error_"))) { edgeColor = "red3"; edgeWeight = 0; } if (std::regex_search(cond, std::regex("^!error_"))) { edgeColor = "forestgreen"; } cond = std::regex_replace(cond, std::regex(" \\&\\&"), "\\n"); cond = escape(cond); std::string actions = cleanUp(exprToString(*e.getArg(4))); actions = std::regex_replace(actions, std::regex("(\\{|\\})"), ""); actions = std::regex_replace(actions, std::regex(", "), "\\n"); actions = std::regex_replace(actions, std::regex("\\&"), ""); std::cout << edgeId << " [label=\"{" << cond << "|" << actions << "}\",shape=record,style=\"rounded,dashed\",color=\"" << edgeColor << "\"];" << std::endl; std::cout << fromId << " -> " << edgeId << "[dir=\"none\",color=\"" << edgeColor << "\",style=\"dashed\",weight=" << edgeWeight << "];" << std::endl; std::cout << edgeId << " -> " << toId << "[color=\"" << edgeColor << "\",style=\"dashed\",weight=" << edgeWeight << "];" << std::endl; } }; } // namespace int main(int argc, const char* argv[]) { cl::OptionCategory category("dump_state_machine"); cl::opt methodName( "method", cl::Required, cl::cat(category), cl::desc( "Name of the method implementing the state machine's transitions."), cl::value_desc("method_name")); CommonOptionsParser optionsParser(argc, argv, category, cl::Required); ClangTool tool( optionsParser.getCompilations(), optionsParser.getSourcePathList()); auto methodMatcher = callExpr( callee(cxxMethodDecl(hasName("attemptTransition"))), hasAncestor(cxxMethodDecl(hasName(methodName)))) .bind("x"); MethodPrinter printer; MatchFinder finder; 
finder.addMatcher(methodMatcher, &printer); std::cout << "digraph {" << std::endl << "graph [rankdir=TB]" << std::endl << "node [shape=box]" << std::endl; int res = tool.run(newFrontendActionFactory(&finder).get()); std::cout << "}" << std::endl; return res; } ================================================ FILE: tensorpipe/python/CMakeLists.txt ================================================ # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. if(NOT (COMMAND pybind11_add_module)) add_subdirectory( ${PROJECT_SOURCE_DIR}/third_party/pybind11 ${PROJECT_BINARY_DIR}/third_party/pybind11 EXCLUDE_FROM_ALL) endif() set(PYBIND11_CPP_STANDARD -std=c++17) pybind11_add_module(pytensorpipe tensorpipe.cc) target_link_libraries(pytensorpipe PRIVATE tensorpipe) ================================================ FILE: tensorpipe/python/tensorpipe.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include #include #include #include #include namespace py = pybind11; namespace { using tensorpipe::optional; // RAII wrapper to reliably release every buffer we get. 
class BufferWrapper {
 public:
  // Acquires a view of the Python buffer via the CPython buffer protocol;
  // re-raises the pending Python error if acquisition fails.
  BufferWrapper(const py::buffer& buffer, int flags) {
    if (PyObject_GetBuffer(buffer.ptr(), &buffer_, flags) != 0) {
      throw py::error_already_set();
    }
  }

  // Non-copyable and non-movable: the destructor must release the
  // Py_buffer exactly once.
  BufferWrapper(const BufferWrapper& other) = delete;
  BufferWrapper(BufferWrapper&& other) = delete;
  BufferWrapper& operator=(const BufferWrapper& other) = delete;
  BufferWrapper& operator=(BufferWrapper&& other) = delete;

  ~BufferWrapper() {
    PyBuffer_Release(&buffer_);
  }

  // Raw pointer to the underlying memory (non-owning).
  void* ptr() const {
    return buffer_.buf;
  }

  // Length of the buffer, in bytes.
  size_t length() const {
    return buffer_.len;
  }

  // Exposes the memory back to Python as a one-dimensional buffer view.
  // NOTE(review): the template argument of format_descriptor and of the
  // static_cast were stripped by extraction.
  py::buffer_info getBuffer() {
    return py::buffer_info(
        buffer_.buf,
        1,
        py::format_descriptor::format(),
        1,
        {static_cast(buffer_.len)},
        {1});
  }

 private:
  Py_buffer buffer_;
};

// Send-side payload: a data buffer plus its metadata buffer, both pinned
// for the duration of the write via BufferWrapper.
class OutgoingPayload {
 public:
  BufferWrapper buffer;
  BufferWrapper metadata;
  OutgoingPayload(const py::buffer& buffer, const py::buffer& metadata)
      : buffer(buffer, PyBUF_SIMPLE), metadata(metadata, PyBUF_SIMPLE) {}
};

// Send-side tensor: a data buffer plus its metadata buffer.
class OutgoingTensor {
 public:
  BufferWrapper buffer;
  BufferWrapper metadata;
  OutgoingTensor(const py::buffer& buffer, const py::buffer& metadata)
      : buffer(buffer, PyBUF_SIMPLE), metadata(metadata, PyBUF_SIMPLE) {}
};

// Send-side message: top-level metadata plus its payloads and tensors.
// NOTE(review): the element types of these vectors (presumably
// shared_ptr to OutgoingPayload/OutgoingTensor) were stripped by extraction.
class OutgoingMessage {
 public:
  BufferWrapper metadata;
  std::vector> payloads;
  std::vector> tensors;
  OutgoingMessage(
      const py::buffer& metadata,
      const std::vector>& payloads,
      const std::vector>& tensors)
      : metadata(metadata, PyBUF_SIMPLE),
        payloads(payloads),
        tensors(tensors) {}
};

// Converts a Python-side OutgoingMessage into a tensorpipe::Message that
// aliases (does not copy) the underlying Python buffers, so the
// OutgoingMessage must stay alive until the write completes.
// NOTE(review): the shared_ptr element type and the reinterpret_cast
// target type were stripped by extraction.
tensorpipe::Message prepareToWrite(std::shared_ptr pyMessage) {
  tensorpipe::Message tpMessage{
      {reinterpret_cast(pyMessage->metadata.ptr()),
       pyMessage->metadata.length()}};
  tpMessage.payloads.reserve(pyMessage->payloads.size());
  for (const auto& pyPayload : pyMessage->payloads) {
    tensorpipe::Message::Payload tpPayload{
        .data = pyPayload->buffer.ptr(),
        .length = pyPayload->buffer.length(),
        .metadata = {reinterpret_cast(pyPayload->metadata.ptr()),
                     pyPayload->metadata.length()},
    };
    tpMessage.payloads.push_back(std::move(tpPayload));
  }
tpMessage.tensors.reserve(pyMessage->tensors.size()); for (const auto& pyTensor : pyMessage->tensors) { tensorpipe::Message::Tensor tpTensor{ .buffer = tensorpipe::CpuBuffer{.ptr = pyTensor->buffer.ptr()}, .length = pyTensor->buffer.length(), .metadata = {reinterpret_cast(pyTensor->metadata.ptr()), pyTensor->metadata.length()}, }; tpMessage.tensors.push_back(std::move(tpTensor)); } return tpMessage; } class IncomingPayload { public: size_t length; optional buffer; py::bytes metadata; IncomingPayload(size_t length, py::bytes metadata) : length(length), metadata(metadata) {} void set_buffer(const py::buffer& pyBuffer) { TP_THROW_ASSERT_IF(buffer.has_value()) << "Buffer already set"; buffer.emplace(pyBuffer, PyBUF_SIMPLE | PyBUF_WRITABLE); if (buffer->length() != length) { buffer.reset(); TP_THROW_ASSERT() << "Bad length"; } } }; class IncomingTensor { public: size_t length; optional buffer; py::bytes metadata; IncomingTensor(size_t length, py::bytes metadata) : length(length), metadata(metadata) {} void set_buffer(const py::buffer& pyBuffer) { TP_THROW_ASSERT_IF(buffer.has_value()) << "Buffer already set"; buffer.emplace(pyBuffer, PyBUF_SIMPLE | PyBUF_WRITABLE); if (buffer->length() != length) { buffer.reset(); TP_THROW_ASSERT() << "Bad length"; } } }; class IncomingMessage { public: py::bytes metadata; std::vector> payloads; std::vector> tensors; IncomingMessage( py::bytes metadata, std::vector> payloads, std::vector> tensors) : metadata(metadata), payloads(payloads), tensors(tensors) {} }; std::shared_ptr prepareToAllocate( const tensorpipe::Descriptor& tpDescriptor) { std::vector> pyPayloads; pyPayloads.reserve(tpDescriptor.payloads.size()); for (const auto& tpPayload : tpDescriptor.payloads) { pyPayloads.push_back(std::make_shared( tpPayload.length, tpPayload.metadata)); } std::vector> pyTensors; pyTensors.reserve(tpDescriptor.tensors.size()); for (const auto& tpTensor : tpDescriptor.tensors) { pyTensors.push_back( std::make_shared(tpTensor.length, 
tpTensor.metadata)); } auto pyMessage = std::make_shared( tpDescriptor.metadata, std::move(pyPayloads), std::move(pyTensors)); return pyMessage; } tensorpipe::Allocation prepareToRead( std::shared_ptr pyMessage) { tensorpipe::Allocation tpAllocation; tpAllocation.payloads.reserve(pyMessage->payloads.size()); for (const auto& pyPayload : pyMessage->payloads) { TP_THROW_ASSERT_IF(!pyPayload->buffer.has_value()) << "No buffer"; tensorpipe::Allocation::Payload tpPayload{ .data = pyPayload->buffer.value().ptr(), }; tpAllocation.payloads.push_back(std::move(tpPayload)); } tpAllocation.tensors.reserve(pyMessage->tensors.size()); for (const auto& pyTensor : pyMessage->tensors) { TP_THROW_ASSERT_IF(!pyTensor->buffer.has_value()) << "No buffer"; tensorpipe::Allocation::Tensor tpTensor{ .buffer = tensorpipe::CpuBuffer{.ptr = pyTensor->buffer.value().ptr()}, }; tpAllocation.tensors.push_back(std::move(tpTensor)); } return tpAllocation; } template using shared_ptr_class_ = py::class_>; } // namespace PYBIND11_MODULE(pytensorpipe, module) { py::print( "These bindings are EXPERIMENTAL, intended to give a PREVIEW of the API, " "and, as such, may CHANGE AT ANY TIME."); shared_ptr_class_ context(module, "Context"); shared_ptr_class_ listener(module, "Listener"); shared_ptr_class_ pipe(module, "Pipe"); shared_ptr_class_ outgoingPayload(module, "OutgoingPayload"); outgoingPayload.def( py::init(), py::arg("buffer"), py::arg("metadata")); shared_ptr_class_ outgoingTensor(module, "OutgoingTensor"); outgoingTensor.def( py::init(), py::arg("buffer"), py::arg("metadata")); shared_ptr_class_ outgoingMessage(module, "OutgoingMessage"); outgoingMessage.def( py::init< py::buffer, const std::vector>, const std::vector>>(), py::arg("metadata"), py::arg("payloads"), py::arg("tensors")); shared_ptr_class_ incomingPayload( module, "IncomingPayload", py::buffer_protocol()); incomingPayload.def_readonly("length", &IncomingPayload::length); incomingPayload.def_readonly("metadata", 
&IncomingPayload::metadata); incomingPayload.def_property( "buffer", [](IncomingPayload& pyPayload) -> py::buffer_info { TP_THROW_ASSERT_IF(!pyPayload.buffer.has_value()) << "No buffer"; return pyPayload.buffer->getBuffer(); }, &IncomingPayload::set_buffer); shared_ptr_class_ incomingTensor( module, "IncomingTensor", py::buffer_protocol()); incomingTensor.def_readonly("length", &IncomingTensor::length); incomingTensor.def_readonly("metadata", &IncomingTensor::metadata); incomingTensor.def_property( "buffer", [](IncomingTensor& pyTensor) -> py::buffer_info { TP_THROW_ASSERT_IF(!pyTensor.buffer.has_value()) << "No buffer"; return pyTensor.buffer->getBuffer(); }, &IncomingTensor::set_buffer); shared_ptr_class_ incomingMessage( module, "IncomingMessage", py::buffer_protocol()); incomingMessage.def_readonly("metadata", &IncomingMessage::metadata); incomingMessage.def_readonly("payloads", &IncomingMessage::payloads); incomingMessage.def_readonly("tensors", &IncomingMessage::tensors); // Creators. context.def(py::init<>()); context.def( "listen", [](std::shared_ptr context, const std::vector& urls) { return context->listen(urls); }, py::arg("urls")); context.def( "connect", [](std::shared_ptr context, const std::string& url) { return context->connect(url); }, py::arg("url")); context.def( "join", &tensorpipe::Context::join, py::call_guard()); // Callback registration. 
listener.def( "listen", [](std::shared_ptr listener, py::object callback) { listener->accept([callback{std::move(callback)}]( const tensorpipe::Error& error, std::shared_ptr pipe) mutable { if (error) { TP_LOG_ERROR() << error.what(); return; } TP_THROW_ASSERT_IF(!pipe) << "No pipe"; py::gil_scoped_acquire acquire; try { callback(std::move(pipe)); } catch (const py::error_already_set& err) { TP_LOG_ERROR() << "Callback raised exception: " << err.what(); } // Leaving the scope will decrease the refcount of callback which // may cause it to get destructed, which might segfault since we // won't be holding the GIL anymore. So we reset callback now, // while we're still holding the GIL. callback = py::object(); }); }); pipe.def( "read_descriptor", [](std::shared_ptr pipe, py::object callback) { pipe->readDescriptor([callback{std::move(callback)}]( const tensorpipe::Error& error, tensorpipe::Descriptor descriptor) mutable { if (error) { TP_LOG_ERROR() << error.what(); return; } py::gil_scoped_acquire acquire; try { callback(prepareToAllocate(std::move(descriptor))); } catch (const py::error_already_set& err) { TP_LOG_ERROR() << "Callback raised exception: " << err.what(); } // Leaving the scope will decrease the refcount of callback which // may cause it to get destructed, which might segfault since we // won't be holding the GIL anymore. So we reset callback now, // while we're still holding the GIL. 
callback = py::object(); }); }); pipe.def( "read", [](std::shared_ptr pipe, std::shared_ptr pyMessage, py::object callback) { tensorpipe::Allocation tpAllocation = prepareToRead(std::move(pyMessage)); pipe->read( std::move(tpAllocation), [callback{std::move(callback)}]( const tensorpipe::Error& error) mutable { if (error) { TP_LOG_ERROR() << error.what(); return; } py::gil_scoped_acquire acquire; try { callback(); } catch (const py::error_already_set& err) { TP_LOG_ERROR() << "Callback raised exception: " << err.what(); } // Leaving the scope will decrease the refcount of callback which // may cause it to get destructed, which might segfault since we // won't be holding the GIL anymore. So we reset callback now, // while we're still holding the GIL. callback = py::object(); }); }); pipe.def( "write", [](std::shared_ptr pipe, std::shared_ptr pyMessage, py::object callback) { tensorpipe::Message tpMessage = prepareToWrite(std::move(pyMessage)); pipe->write( std::move(tpMessage), [callback{std::move(callback)}]( const tensorpipe::Error& error) mutable { if (error) { TP_LOG_ERROR() << error.what(); return; } py::gil_scoped_acquire acquire; try { callback(); } catch (const py::error_already_set& err) { TP_LOG_ERROR() << "Callback raised exception: " << err.what(); } // Leaving the scope will decrease the refcount of callback which // may cause it to get destructed, which might segfault since we // won't be holding the GIL anymore. So we reset callback now, // while we're still holding the GIL. 
callback = py::object(); }); }); // Transports and channels shared_ptr_class_ abstractTransport( module, "AbstractTransport"); module.def("create_uv_transport", &tensorpipe::transport::uv::create); #if TENSORPIPE_HAS_SHM_TRANSPORT module.def("create_shm_transport", &tensorpipe::transport::shm::create); #endif // TENSORPIPE_HAS_SHM_TRANSPORT context.def( "register_transport", &tensorpipe::Context::registerTransport, py::arg("priority"), py::arg("name"), py::arg("transport")); shared_ptr_class_ abstractChannel( module, "AbstractChannel"); module.def("create_basic_channel", &tensorpipe::channel::basic::create); #if TENSORPIPE_HAS_CMA_CHANNEL module.def("create_cma_channel", &tensorpipe::channel::cma::create); #endif // TENSORPIPE_HAS_CMA_CHANNEL context.def( "register_channel", &tensorpipe::Context::registerChannel, py::arg("priority"), py::arg("name"), py::arg("channel")); // Helpers listener.def("get_url", &tensorpipe::Listener::url, py::arg("transport")); } ================================================ FILE: tensorpipe/tensorpipe.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include // High-level API #include #include #include #include #include #include #include // Transports #include #include #include #include #include #if TENSORPIPE_HAS_SHM_TRANSPORT #include #endif // TENSORPIPE_HAS_SHM_TRANSPORT #if TENSORPIPE_HAS_IBV_TRANSPORT #include #include #include #endif // TENSORPIPE_HAS_IBV_TRANSPORT // Channels #include #include #include #include #include #if TENSORPIPE_HAS_CMA_CHANNEL #include #endif // TENSORPIPE_HAS_CMA_CHANNEL ================================================ FILE: tensorpipe/tensorpipe_cuda.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. 
* All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include // High-level API #include // Channels #include #include #if TENSORPIPE_HAS_CUDA_GDR_CHANNEL #include #endif // TENSORPIPE_HAS_CUDA_GDR_CHANNEL #if TENSORPIPE_HAS_CUDA_IPC_CHANNEL #include #endif // TENSORPIPE_HAS_CUDA_IPC_CHANNEL ================================================ FILE: tensorpipe/test/CMakeLists.txt ================================================ # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. # List of source files that we need to build tensorpipe_test executable. set(TP_TEST_SRCS) # TP_TEST_LINK_LIBRARIES is list of dependent libraries to be linked set(TP_TEST_LINK_LIBRARIES) # TP_TEST_INCLUDE_DIRS is list of include path to be used set(TP_TEST_INCLUDE_DIRS) # TP_TEST_COMPILE_DEFS is list of compile definitions to be used set(TP_TEST_COMPILE_DEFS) list(APPEND TP_TEST_SRCS test.cc test_environment.cc transport/context_test.cc transport/connection_test.cc transport/uv/uv_test.cc transport/uv/context_test.cc transport/uv/loop_test.cc transport/uv/connection_test.cc transport/uv/sockaddr_test.cc transport/listener_test.cc core/context_test.cc core/pipe_test.cc channel/basic/basic_test.cc channel/xth/xth_test.cc channel/mpt/mpt_test.cc channel/channel_test.cc channel/channel_test_cpu.cc common/system_test.cc common/defs_test.cc ) if(TP_ENABLE_SHM) list(APPEND TP_TEST_SRCS common/epoll_loop_test.cc common/ringbuffer_test.cc common/shm_ringbuffer_test.cc common/shm_segment_test.cc transport/shm/reactor_test.cc transport/shm/connection_test.cc transport/shm/listener_test.cc transport/shm/sockaddr_test.cc transport/shm/shm_test.cc ) endif() if(TP_ENABLE_IBV) list(APPEND TP_TEST_SRCS common/epoll_loop_test.cc 
common/ringbuffer_test.cc transport/ibv/connection_test.cc transport/ibv/ibv_test.cc transport/ibv/sockaddr_test.cc ) endif() if(TP_ENABLE_CMA) list(APPEND TP_TEST_SRCS channel/cma/cma_test.cc ) add_subdirectory(channel/cma) endif() if(TP_USE_CUDA) find_package(CUDA REQUIRED) list(APPEND TP_TEST_LINK_LIBRARIES ${CUDA_LIBRARIES}) list(APPEND TP_TEST_INCLUDE_DIRS ${CUDA_INCLUDE_DIRS}) list(APPEND TP_TEST_COMPILE_DEFS TP_USE_CUDA) list(APPEND TP_TEST_SRCS channel/channel_test_cuda.cc channel/channel_test_cuda_multi_gpu.cc channel/channel_test_cuda_xdtt.cc common/cuda_test.cc core/pipe_cuda_test.cc ) list(APPEND TP_TEST_SRCS channel/cuda_xth/cuda_xth_test.cc channel/cuda_basic/cuda_basic_test.cc ) if(TP_ENABLE_CUDA_IPC) list(APPEND TP_TEST_SRCS channel/cuda_ipc/cuda_ipc_test.cc ) endif() list(APPEND TP_TEST_SRCS channel/cuda_gdr/cuda_gdr_test.cc ) cuda_add_library(tensorpipe_cuda_kernel channel/kernel.cu) list(APPEND TP_TEST_LINK_LIBRARIES tensorpipe_cuda_kernel) list(APPEND TP_TEST_LINK_LIBRARIES tensorpipe_cuda) endif() add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/googletest ${PROJECT_BINARY_DIR}/third_party/googletest EXCLUDE_FROM_ALL) list(APPEND TP_TEST_LINK_LIBRARIES tensorpipe uv::uv gmock gtest_main) add_executable(tensorpipe_test ${TP_TEST_SRCS}) # Add all the dependent link libraries to the tensorpipe_test target target_link_libraries(tensorpipe_test PRIVATE ${TP_TEST_LINK_LIBRARIES}) target_include_directories(tensorpipe_test PUBLIC ${TP_TEST_INCLUDE_DIRS}) target_compile_definitions(tensorpipe_test PRIVATE ${TP_TEST_COMPILE_DEFS}) ================================================ FILE: tensorpipe/test/channel/basic/basic_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include namespace { class BasicChannelTestHelper : public CpuChannelTestHelper { protected: std::shared_ptr makeContextInternal( std::string id) override { auto context = tensorpipe::channel::basic::create(); context->setId(std::move(id)); return context; } }; BasicChannelTestHelper helper; } // namespace INSTANTIATE_TEST_CASE_P(Basic, ChannelTestSuite, ::testing::Values(&helper)); INSTANTIATE_TEST_CASE_P(Basic, CpuChannelTestSuite, ::testing::Values(&helper)); ================================================ FILE: tensorpipe/test/channel/channel_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include using namespace tensorpipe; using namespace tensorpipe::channel; // Implement this in a subprocess as in some cases it may initialize CUDA and // thus would otherwise "pollute" the parent process. 
// Verifies that two independently created contexts report the same
// non-empty set of device descriptors (same devices, same descriptors).
class DeviceDescriptorsTest : public ChannelTestCase {
 public:
  void run(ChannelTestHelper* helper) override {
    auto peerGroup = helper->makePeerGroup();
    peerGroup->spawn(
        [&] {
          // NOTE(review): the shared_ptr element types were stripped by
          // extraction (presumably channel context types).
          std::shared_ptr context1 = helper->makeContext("ctx1");
          std::shared_ptr context2 = helper->makeContext("ctx2");
          const auto& descriptors1 = context1->deviceDescriptors();
          const auto& descriptors2 = context2->deviceDescriptors();
          EXPECT_FALSE(descriptors1.empty());
          EXPECT_FALSE(descriptors2.empty());
          EXPECT_EQ(descriptors1.size(), descriptors2.size());
          // Every device in context1 must exist in context2 with an
          // identical, non-empty descriptor.
          for (const auto& deviceIter : descriptors1) {
            EXPECT_FALSE(deviceIter.second.empty());
            EXPECT_EQ(descriptors2.count(deviceIter.first), 1);
            EXPECT_EQ(deviceIter.second, descriptors2.at(deviceIter.first));
          }
        },
        [] {});
  }
};

CHANNEL_TEST(ChannelTestSuite, DeviceDescriptors);

// One-directional transfer: the server sends a 256-byte sequential
// pattern, the client receives it and validates the contents.
class ClientToServerTest : public ClientServerChannelTestCase {
 public:
  static constexpr int kDataSize = 256;

  void server(std::shared_ptr channel) override {
    // Initialize with sequential values.
    std::vector data(kDataSize);
    std::iota(data.begin(), data.end(), 0);
    std::unique_ptr wrappedData = helper_->makeDataWrapper(data);

    // Perform send and wait for completion.
    std::future sendFuture = sendWithFuture(channel, *wrappedData);
    Error sendError = sendFuture.get();
    EXPECT_FALSE(sendError) << sendError.what();

    this->peers_->done(PeerGroup::kServer);
    this->peers_->join(PeerGroup::kServer);
  }

  void client(std::shared_ptr channel) override {
    std::unique_ptr wrappedData = helper_->makeDataWrapper(kDataSize);

    // Perform recv and wait for completion.
    std::future recvFuture = recvWithFuture(channel, *wrappedData);
    Error recvError = recvFuture.get();
    EXPECT_FALSE(recvError) << recvError.what();

    // Validate contents of vector.
    auto unwrappedData = wrappedData->unwrap();
    for (auto i = 0; i < kDataSize; i++) {
      EXPECT_EQ(unwrappedData[i], i);
    }

    this->peers_->done(PeerGroup::kClient);
    this->peers_->join(PeerGroup::kClient);
  }
};

CHANNEL_TEST(ChannelTestSuite, ClientToServer);

// Mirror of ClientToServerTest with the direction reversed: the server
// receives and validates, the client sends the sequential pattern.
class ServerToClientTest : public ClientServerChannelTestCase {
  static constexpr int kDataSize = 256;

 public:
  void server(std::shared_ptr channel) override {
    std::unique_ptr wrappedData = helper_->makeDataWrapper(kDataSize);

    // Perform recv and wait for completion.
    std::future recvFuture = recvWithFuture(channel, *wrappedData);
    Error recvError = recvFuture.get();
    EXPECT_FALSE(recvError) << recvError.what();

    // Validate contents of vector.
    auto unwrappedData = wrappedData->unwrap();
    for (auto i = 0; i < kDataSize; i++) {
      EXPECT_EQ(unwrappedData[i], i);
    }

    this->peers_->done(PeerGroup::kServer);
    this->peers_->join(PeerGroup::kServer);
  }

  void client(std::shared_ptr channel) override {
    // Initialize with sequential values.
    std::vector data(kDataSize);
    std::iota(data.begin(), data.end(), 0);
    std::unique_ptr wrappedData = helper_->makeDataWrapper(data);

    // Perform send and wait for completion.
    std::future sendFuture = sendWithFuture(channel, *wrappedData);
    Error sendError = sendFuture.get();
    EXPECT_FALSE(sendError) << sendError.what();

    this->peers_->done(PeerGroup::kClient);
    this->peers_->join(PeerGroup::kClient);
  }
};

CHANNEL_TEST(ChannelTestSuite, ServerToClient);

// Sends the same 256 KiB buffer 100 times and validates every received
// copy on the client side.
class SendMultipleTensorsTest : public ClientServerChannelTestCase {
  // FIXME This is very puzzling, as in CircleCI making this field static (and
  // possibly even constexpr) causes a undefined symbol link error.
  const int dataSize_ = 256 * 1024; // 256KB
  static constexpr int kNumTensors = 100;

 public:
  void server(std::shared_ptr channel) override {
    // Initialize with sequential values.
std::vector data(dataSize_); std::iota(data.begin(), data.end(), 0); std::unique_ptr wrappedData = helper_->makeDataWrapper(data); // Error futures std::vector> sendFutures; // Perform send and wait for completion. for (int i = 0; i < kNumTensors; i++) { std::future sendFuture = sendWithFuture(channel, *wrappedData); sendFutures.push_back(std::move(sendFuture)); } for (auto& sendFuture : sendFutures) { Error sendError = sendFuture.get(); EXPECT_FALSE(sendError) << sendError.what(); } this->peers_->done(PeerGroup::kServer); this->peers_->join(PeerGroup::kServer); } void client(std::shared_ptr channel) override { std::vector> wrappedDataVec; for (int i = 0; i < kNumTensors; i++) { wrappedDataVec.push_back(helper_->makeDataWrapper(dataSize_)); } // Error futures std::vector> recvFutures; // Perform recv and wait for completion. for (auto& wrappedData : wrappedDataVec) { std::future recvFuture = recvWithFuture(channel, *wrappedData); recvFutures.push_back(std::move(recvFuture)); } for (auto& recvFuture : recvFutures) { Error recvError = recvFuture.get(); EXPECT_FALSE(recvError) << recvError.what(); } // Validate contents of vector. for (auto& wrappedData : wrappedDataVec) { auto unwrappedData = wrappedData->unwrap(); for (int i = 0; i < dataSize_; i++) { EXPECT_EQ(unwrappedData[i], i % 256); } } this->peers_->done(PeerGroup::kClient); this->peers_->join(PeerGroup::kClient); } }; CHANNEL_TEST(ChannelTestSuite, SendMultipleTensors); class SendTensorsBothWaysTest : public ClientServerChannelTestCase { static constexpr int kDataSize = 256; void server(std::shared_ptr channel) override { // Initialize sendBuffer with sequential values. std::vector sendData(kDataSize); std::iota(sendData.begin(), sendData.end(), 0); std::unique_ptr wrappedSendData = helper_->makeDataWrapper(sendData); // Recv buffer. std::unique_ptr wrappedRecvData = helper_->makeDataWrapper(kDataSize); // Perform send. std::future sendFuture = sendWithFuture(channel, *wrappedSendData); // Perform recv. 
std::future recvFuture = recvWithFuture(channel, *wrappedRecvData); // Wait for completion of both. Error sendError = sendFuture.get(); EXPECT_FALSE(sendError) << sendError.what(); Error recvError = recvFuture.get(); EXPECT_FALSE(recvError) << recvError.what(); // Verify recvd buffers. auto unwrappedData = wrappedRecvData->unwrap(); for (int i = 0; i < kDataSize; i++) { EXPECT_EQ(unwrappedData[i], i % 256); } this->peers_->done(PeerGroup::kServer); this->peers_->join(PeerGroup::kServer); } void client(std::shared_ptr channel) override { // Initialize sendBuffer with sequential values. std::vector sendData(kDataSize); std::iota(sendData.begin(), sendData.end(), 0); std::unique_ptr wrappedSendData = helper_->makeDataWrapper(sendData); // Recv buffer. std::unique_ptr wrappedRecvData = helper_->makeDataWrapper(kDataSize); // Perform send. std::future sendFuture = sendWithFuture(channel, *wrappedSendData); // Perform recv. std::future recvFuture = recvWithFuture(channel, *wrappedRecvData); // Wait for completion of both. Error sendError = sendFuture.get(); EXPECT_FALSE(sendError) << sendError.what(); Error recvError = recvFuture.get(); EXPECT_FALSE(recvError) << recvError.what(); // Verify recvd buffers. auto unwrappedData = wrappedRecvData->unwrap(); for (int i = 0; i < kDataSize; i++) { EXPECT_EQ(unwrappedData[i], i % 256); } this->peers_->done(PeerGroup::kClient); this->peers_->join(PeerGroup::kClient); } }; CHANNEL_TEST(ChannelTestSuite, SendTensorsBothWays); // Call send and recv with a length of 0 but a non-null pointer. class EmptyTensorTest : public ClientServerChannelTestCase { void server(std::shared_ptr channel) override { // Allocate a non-empty vector so that its .data() pointer is non-null. std::vector data(1); std::unique_ptr wrappedData = helper_->makeDataWrapper(data); Buffer buffer = wrappedData->buffer(); // Perform send and wait for completion. 
std::future sendFuture = sendWithFuture(channel, buffer, 0); Error sendError = sendFuture.get(); EXPECT_FALSE(sendError) << sendError.what(); this->peers_->done(PeerGroup::kServer); this->peers_->join(PeerGroup::kServer); } void client(std::shared_ptr channel) override { // Allocate a non-empty vector so that its .data() pointer is non-null. std::unique_ptr wrappedData = helper_->makeDataWrapper(1); Buffer buffer = wrappedData->buffer(); // Perform recv and wait for completion. std::future recvFuture = recvWithFuture(channel, buffer, 0); Error recvError = recvFuture.get(); EXPECT_FALSE(recvError) << recvError.what(); this->peers_->done(PeerGroup::kClient); this->peers_->join(PeerGroup::kClient); } }; CHANNEL_TEST(ChannelTestSuite, EmptyTensor); // Call send and recv with a length of 0, between sends and recvs with // positive length. class EmptyAndNonEmptyTensorsTest : public ClientServerChannelTestCase { void server(std::shared_ptr channel) override { std::vector data(1); std::unique_ptr wrappedData = helper_->makeDataWrapper(data); Buffer buffer = wrappedData->buffer(); std::vector> sendFutures; sendFutures.push_back(sendWithFuture(channel, buffer, 1)); sendFutures.push_back(sendWithFuture(channel, buffer, 0)); sendFutures.push_back(sendWithFuture(channel, buffer, 1)); for (auto& f : sendFutures) { Error sendError = f.get(); EXPECT_FALSE(sendError) << sendError.what(); } this->peers_->done(PeerGroup::kServer); this->peers_->join(PeerGroup::kServer); } void client(std::shared_ptr channel) override { std::unique_ptr wrappedData = helper_->makeDataWrapper(1); Buffer buffer = wrappedData->buffer(); std::vector> sendFutures; sendFutures.push_back(recvWithFuture(channel, buffer, 1)); sendFutures.push_back(recvWithFuture(channel, buffer, 0)); sendFutures.push_back(recvWithFuture(channel, buffer, 1)); for (auto& f : sendFutures) { Error sendError = f.get(); EXPECT_FALSE(sendError) << sendError.what(); } this->peers_->done(PeerGroup::kClient); 
this->peers_->join(PeerGroup::kClient); } }; CHANNEL_TEST(ChannelTestSuite, EmptyAndNonEmptyTensors); ================================================ FILE: tensorpipe/test/channel/channel_test.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include class DataWrapper { public: virtual tensorpipe::Buffer buffer() const = 0; virtual size_t bufferLength() const = 0; virtual std::vector unwrap() = 0; virtual ~DataWrapper() = default; }; class ChannelTestHelper { public: virtual ~ChannelTestHelper() = default; std::shared_ptr makeContext( std::string id, bool skipViabilityCheck = false) { std::shared_ptr ctx = makeContextInternal(std::move(id)); if (!skipViabilityCheck) { EXPECT_TRUE(ctx->isViable()); } return ctx; } virtual std::shared_ptr makePeerGroup() { return std::make_shared(); } virtual std::unique_ptr makeDataWrapper(size_t length) = 0; virtual std::unique_ptr makeDataWrapper( std::vector v) = 0; protected: virtual std::shared_ptr makeContextInternal( std::string id) = 0; }; [[nodiscard]] inline std::future sendWithFuture( std::shared_ptr channel, tensorpipe::Buffer buffer, size_t length) { auto promise = std::make_shared>(); auto future = promise->get_future(); channel->send( buffer, length, [promise{std::move(promise)}](const tensorpipe::Error& error) { promise->set_value(error); }); return future; } [[nodiscard]] inline std::future sendWithFuture( std::shared_ptr channel, const DataWrapper& dataWrapper) { return sendWithFuture( std::move(channel), dataWrapper.buffer(), dataWrapper.bufferLength()); } [[nodiscard]] inline std::future recvWithFuture( std::shared_ptr channel, tensorpipe::Buffer buffer, 
size_t length) { auto promise = std::make_shared>(); auto future = promise->get_future(); channel->recv( buffer, length, [promise{std::move(promise)}](const tensorpipe::Error& error) { promise->set_value(error); }); return future; } [[nodiscard]] inline std::future recvWithFuture( std::shared_ptr channel, const DataWrapper& dataWrapper) { return recvWithFuture( std::move(channel), dataWrapper.buffer(), dataWrapper.bufferLength()); } class ChannelTestCase { public: virtual void run(ChannelTestHelper* helper) = 0; virtual ~ChannelTestCase() = default; }; class ClientServerChannelTestCase : public ChannelTestCase { using MultiAcceptResult = std::pair< tensorpipe::Error, std::vector>>; class MultiAcceptResultPromise { public: explicit MultiAcceptResultPromise(size_t numConnections) : connections_(numConnections) {} ~MultiAcceptResultPromise() { // Sanity check if (!error_) { for (const auto& conn : connections_) { EXPECT_NE(conn, nullptr); } } promise_.set_value( MultiAcceptResult(std::move(error_), std::move(connections_))); } std::future getFuture() { return promise_.get_future(); } void setConnection( size_t connId, std::shared_ptr connection) { EXPECT_LT(connId, connections_.size()); connections_[connId] = std::move(connection); } void setError(tensorpipe::Error error) { std::unique_lock lock(errorMutex_); if (error_) { return; } error_ = std::move(error); } private: tensorpipe::Error error_{tensorpipe::Error::kSuccess}; std::mutex errorMutex_; std::vector> connections_; std::promise promise_; }; std::future accept( tensorpipe::transport::Listener& listener, size_t numConnections) { auto promise = std::make_shared(numConnections); for (size_t i = 0; i < numConnections; ++i) { listener.accept( [promise]( const tensorpipe::Error& error, std::shared_ptr connection) { if (error) { promise->setError(std::move(error)); return; } connection->read([promise, connection]( const tensorpipe::Error& error, const void* connIdBuf, size_t length) mutable { if (error) { 
promise->setError(std::move(error)); return; } ASSERT_EQ(sizeof(uint64_t), length); uint64_t connId = *static_cast(connIdBuf); promise->setConnection(connId, std::move(connection)); }); }); } return promise->getFuture(); } std::vector> connect( std::shared_ptr transportCtx, std::string addr, size_t numConnections) { std::vector> connections( numConnections); for (size_t connId = 0; connId < numConnections; ++connId) { connections[connId] = transportCtx->connect(addr); auto connIdBuf = std::make_shared(connId); connections[connId]->write( connIdBuf.get(), sizeof(uint64_t), [connIdBuf](const tensorpipe::Error& error) { EXPECT_FALSE(error) << error.what(); }); } return connections; } public: void run(ChannelTestHelper* helper) override { auto addr = "127.0.0.1"; helper_ = helper; peers_ = helper_->makePeerGroup(); peers_->spawn( [&] { auto transportCtx = tensorpipe::transport::uv::create(); transportCtx->setId("server_harness"); auto ctx = helper_->makeContext("server"); auto listener = transportCtx->listen(addr); auto connectionsFuture = accept(*listener, ctx->numConnectionsNeeded()); peers_->send(PeerGroup::kClient, listener->addr()); tensorpipe::Error connectionsError; std::vector> connections; std::tie(connectionsError, connections) = connectionsFuture.get(); EXPECT_FALSE(connectionsError) << connectionsError.what(); auto channel = ctx->createChannel( std::move(connections), tensorpipe::channel::Endpoint::kListen); server(std::move(channel)); ctx->join(); transportCtx->join(); afterServer(); }, [&] { auto transportCtx = tensorpipe::transport::uv::create(); transportCtx->setId("client_harness"); auto ctx = helper_->makeContext("client"); auto laddr = peers_->recv(PeerGroup::kClient); auto connections = connect(transportCtx, laddr, ctx->numConnectionsNeeded()); auto channel = ctx->createChannel( std::move(connections), tensorpipe::channel::Endpoint::kConnect); client(std::move(channel)); ctx->join(); transportCtx->join(); afterClient(); }); } virtual void server( 
std::shared_ptr /* channel */) {} virtual void client( std::shared_ptr /* channel */) {} virtual void afterServer() {} virtual void afterClient() {} protected: ChannelTestHelper* helper_; std::shared_ptr peers_; }; class ChannelTestSuite : public ::testing::TestWithParam {}; // Register a channel test. #define CHANNEL_TEST(suite, name) \ TEST_P(suite, name) { \ name##Test t; \ t.run(GetParam()); \ } ================================================ FILE: tensorpipe/test/channel/channel_test_cpu.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include using namespace tensorpipe; using namespace tensorpipe::channel; // Call send and recv with a null pointer and a length of 0. class NullPointerTest : public ClientServerChannelTestCase { void server(std::shared_ptr channel) override { // Perform send and wait for completion. std::future sendFuture = sendWithFuture(channel, CpuBuffer{.ptr = nullptr}, 0); Error sendError = sendFuture.get(); EXPECT_FALSE(sendError) << sendError.what(); this->peers_->done(PeerGroup::kServer); this->peers_->join(PeerGroup::kServer); } void client(std::shared_ptr channel) override { // Perform recv and wait for completion. std::future recvFuture = recvWithFuture(channel, CpuBuffer{.ptr = nullptr}, 0); Error recvError = recvFuture.get(); EXPECT_FALSE(recvError) << recvError.what(); this->peers_->done(PeerGroup::kClient); this->peers_->join(PeerGroup::kClient); } }; CHANNEL_TEST(CpuChannelTestSuite, NullPointer); // This test wants to make sure that the "heavy lifting" of copying data isn't // performed inline inside the recv method as that would make the user-facing // read method of the pipe blocking. 
// However, since we can't really check that behavior, we'll check a highly // correlated one: that the recv callback isn't called inline from within the // recv method. We do so by having that behavior cause a deadlock. class CallbacksAreDeferredTest : public ClientServerChannelTestCase { static constexpr auto kDataSize = 256; public: void server(std::shared_ptr channel) override { // Initialize with sequential values. std::vector data(kDataSize); std::iota(data.begin(), data.end(), 0); // Perform send and wait for completion. std::promise sendPromise; auto mutex = std::make_shared(); std::unique_lock callerLock(*mutex); channel->send( CpuBuffer{.ptr = data.data()}, kDataSize, [&sendPromise, mutex](const Error& error) { std::unique_lock calleeLock(*mutex); sendPromise.set_value(error); }); callerLock.unlock(); Error sendError = sendPromise.get_future().get(); EXPECT_FALSE(sendError) << sendError.what(); this->peers_->done(PeerGroup::kServer); this->peers_->join(PeerGroup::kServer); } void client(std::shared_ptr channel) override { // Initialize with zeroes. std::vector data(kDataSize); std::fill(data.begin(), data.end(), 0); // Perform recv and wait for completion. std::promise recvPromise; std::mutex mutex; std::unique_lock callerLock(mutex); channel->recv( CpuBuffer{.ptr = data.data()}, kDataSize, [&recvPromise, &mutex](const Error& error) { std::unique_lock calleeLock(mutex); recvPromise.set_value(error); }); callerLock.unlock(); Error recvError = recvPromise.get_future().get(); EXPECT_FALSE(recvError) << recvError.what(); // Validate contents of vector. for (auto i = 0; i < kDataSize; i++) { EXPECT_EQ(data[i], i); } this->peers_->done(PeerGroup::kClient); this->peers_->join(PeerGroup::kClient); } }; CHANNEL_TEST(CpuChannelTestSuite, CallbacksAreDeferred); ================================================ FILE: tensorpipe/test/channel/channel_test_cpu.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. 
* All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include class CpuDataWrapper : public DataWrapper { public: explicit CpuDataWrapper(size_t length) : vector_(length) {} explicit CpuDataWrapper(std::vector v) : vector_(v) {} tensorpipe::Buffer buffer() const override { return tensorpipe::CpuBuffer{.ptr = const_cast(vector_.data())}; } size_t bufferLength() const override { return vector_.size(); } std::vector unwrap() override { return vector_; } private: std::vector vector_; }; class CpuChannelTestHelper : public ChannelTestHelper { public: std::unique_ptr makeDataWrapper(size_t length) override { return std::make_unique(length); } std::unique_ptr makeDataWrapper( std::vector v) override { return std::make_unique(std::move(v)); } }; class CpuChannelTestSuite : public ::testing::TestWithParam {}; ================================================ FILE: tensorpipe/test/channel/channel_test_cuda.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include using namespace tensorpipe; using namespace tensorpipe::channel; class ReceiverWaitsForStartEventTest : public ClientServerChannelTestCase { static constexpr size_t kSize = 1024; void server(std::shared_ptr channel) override { TP_CUDA_CHECK(cudaSetDevice(0)); cudaStream_t sendStream; TP_CUDA_CHECK( cudaStreamCreateWithFlags(&sendStream, cudaStreamNonBlocking)); void* ptr; TP_CUDA_CHECK(cudaMalloc(&ptr, kSize)); // Delay sendStream with computations on buffer. slowKernel(ptr, kSize, sendStream); // Set buffer to target value. TP_CUDA_CHECK(cudaMemsetAsync(ptr, 0x42, kSize, sendStream)); // Perform send and wait for completion. 
auto sendPromise = std::make_shared>(); auto sendFuture = sendPromise->get_future(); channel->send( CudaBuffer{ .ptr = ptr, .stream = sendStream, }, kSize, [sendPromise{std::move(sendPromise)}](const tensorpipe::Error& error) { sendPromise->set_value(error); }); Error sendError = sendFuture.get(); EXPECT_FALSE(sendError) << sendError.what(); TP_CUDA_CHECK(cudaFree(ptr)); this->peers_->done(PeerGroup::kServer); this->peers_->join(PeerGroup::kServer); } void client(std::shared_ptr channel) override { TP_CUDA_CHECK(cudaSetDevice(0)); cudaStream_t recvStream; TP_CUDA_CHECK( cudaStreamCreateWithFlags(&recvStream, cudaStreamNonBlocking)); void* ptr; TP_CUDA_CHECK(cudaMalloc(&ptr, kSize)); // Perform recv and wait for completion. auto recvPromise = std::make_shared>(); auto recvFuture = recvPromise->get_future(); channel->recv( CudaBuffer{ .ptr = ptr, .stream = recvStream, }, kSize, [recvPromise{std::move(recvPromise)}](const tensorpipe::Error& error) { recvPromise->set_value(error); }); Error recvError = recvFuture.get(); EXPECT_FALSE(recvError) << recvError.what(); std::array data; TP_CUDA_CHECK(cudaStreamSynchronize(recvStream)); TP_CUDA_CHECK(cudaMemcpy(data.data(), ptr, kSize, cudaMemcpyDefault)); EXPECT_THAT(data, ::testing::Each(0x42)); TP_CUDA_CHECK(cudaFree(ptr)); this->peers_->done(PeerGroup::kClient); this->peers_->join(PeerGroup::kClient); } }; CHANNEL_TEST(CudaChannelTestSuite, ReceiverWaitsForStartEvent); class SendOffsetAllocationTest : public ClientServerChannelTestCase { public: static constexpr int kDataSize = 256; static constexpr int kOffset = 128; void server(std::shared_ptr channel) override { // Initialize with sequential values. void* ptr; TP_CUDA_CHECK(cudaMalloc(&ptr, kOffset + kDataSize)); // Set buffer to target value. TP_CUDA_CHECK(cudaMemset(ptr, 0xff, kOffset)); TP_CUDA_CHECK( cudaMemset(static_cast(ptr) + kOffset, 0x42, kDataSize)); // Perform send and wait for completion. 
std::future sendFuture = sendWithFuture( channel, CudaBuffer{.ptr = static_cast(ptr) + kOffset}, kDataSize); Error sendError = sendFuture.get(); EXPECT_FALSE(sendError) << sendError.what(); this->peers_->done(PeerGroup::kServer); this->peers_->join(PeerGroup::kServer); } void client(std::shared_ptr channel) override { std::unique_ptr wrappedData = helper_->makeDataWrapper(kDataSize); // Perform recv and wait for completion. std::future recvFuture = recvWithFuture(channel, *wrappedData); Error recvError = recvFuture.get(); EXPECT_FALSE(recvError) << recvError.what(); // Validate contents of vector. EXPECT_THAT(wrappedData->unwrap(), ::testing::Each(0x42)); this->peers_->done(PeerGroup::kClient); this->peers_->join(PeerGroup::kClient); } }; CHANNEL_TEST(CudaChannelTestSuite, SendOffsetAllocation); ================================================ FILE: tensorpipe/test/channel/channel_test_cuda.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include class CudaDataWrapper : public DataWrapper { public: // Non-copyable. CudaDataWrapper(const CudaDataWrapper&) = delete; CudaDataWrapper& operator=(const CudaDataWrapper&) = delete; // Non-movable. 
CudaDataWrapper(CudaDataWrapper&& other) = delete; CudaDataWrapper& operator=(CudaDataWrapper&& other) = delete; explicit CudaDataWrapper(size_t length) : length_(length) { if (length_ > 0) { TP_CUDA_CHECK(cudaSetDevice(0)); TP_CUDA_CHECK(cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)); TP_CUDA_CHECK(cudaMalloc(&cudaPtr_, length_)); } } explicit CudaDataWrapper(std::vector v) : CudaDataWrapper(v.size()) { if (length_ > 0) { TP_CUDA_CHECK(cudaMemcpyAsync( cudaPtr_, v.data(), length_, cudaMemcpyDefault, stream_)); } } tensorpipe::Buffer buffer() const override { return tensorpipe::CudaBuffer{ .ptr = cudaPtr_, .stream = stream_, }; } size_t bufferLength() const override { return length_; } std::vector unwrap() override { std::vector v(length_); if (length_ > 0) { TP_CUDA_CHECK(cudaStreamSynchronize(stream_)); TP_CUDA_CHECK(cudaMemcpy(v.data(), cudaPtr_, length_, cudaMemcpyDefault)); } return v; } ~CudaDataWrapper() override { if (length_ > 0) { TP_CUDA_CHECK(cudaFree(cudaPtr_)); TP_CUDA_CHECK(cudaStreamDestroy(stream_)); } } private: void* cudaPtr_{nullptr}; size_t length_{0}; cudaStream_t stream_{cudaStreamDefault}; }; class CudaChannelTestHelper : public ChannelTestHelper { public: std::unique_ptr makeDataWrapper(size_t length) override { return std::make_unique(length); } std::unique_ptr makeDataWrapper( std::vector v) override { return std::make_unique(std::move(v)); } }; class CudaChannelTestSuite : public ::testing::TestWithParam {}; class CudaMultiGPUChannelTestSuite : public ::testing::TestWithParam {}; class CudaXDTTChannelTestSuite : public ::testing::TestWithParam {}; ================================================ FILE: tensorpipe/test/channel/channel_test_cuda_multi_gpu.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #include #include using namespace tensorpipe; using namespace tensorpipe::channel; class SendAcrossDevicesTest : public ClientServerChannelTestCase { static constexpr size_t kSize = 1024; public: void run(ChannelTestHelper* helper) override { if (TestEnvironment::numCudaDevices() < 2) { GTEST_SKIP() << "Skipping test requiring >=2 CUDA devices."; } ClientServerChannelTestCase::run(helper); } private: void server(std::shared_ptr channel) override { cudaStream_t sendStream; void* ptr; { // Send happens from device #0. CudaDeviceGuard guard(0); TP_CUDA_CHECK( cudaStreamCreateWithFlags(&sendStream, cudaStreamNonBlocking)); TP_CUDA_CHECK(cudaMalloc(&ptr, kSize)); // Set buffer to target value. TP_CUDA_CHECK(cudaMemsetAsync(ptr, 0x42, kSize, sendStream)); } // Perform send and wait for completion. auto sendPromise = std::make_shared>(); auto sendFuture = sendPromise->get_future(); channel->send( CudaBuffer{ .ptr = ptr, .stream = sendStream, }, kSize, [sendPromise{std::move(sendPromise)}](const tensorpipe::Error& error) { sendPromise->set_value(error); }); Error sendError = sendFuture.get(); EXPECT_FALSE(sendError) << sendError.what(); { CudaDeviceGuard guard(0); TP_CUDA_CHECK(cudaFree(ptr)); TP_CUDA_CHECK(cudaStreamDestroy(sendStream)); } this->peers_->done(PeerGroup::kServer); this->peers_->join(PeerGroup::kServer); } void afterServer() override { if (this->peers_->endpointsInSameProcess()) { EXPECT_TRUE(initializedCudaContexts({0, 1})); } else { EXPECT_TRUE(initializedCudaContexts({0})); } } void client(std::shared_ptr channel) override { cudaStream_t recvStream; void* ptr; { // Recv happens on device #1. CudaDeviceGuard guard(1); TP_CUDA_CHECK( cudaStreamCreateWithFlags(&recvStream, cudaStreamNonBlocking)); TP_CUDA_CHECK(cudaMalloc(&ptr, kSize)); } // Perform recv and wait for completion. 
auto recvPromise = std::make_shared>(); auto recvFuture = recvPromise->get_future(); channel->recv( CudaBuffer{ .ptr = ptr, .stream = recvStream, }, kSize, [recvPromise{std::move(recvPromise)}](const tensorpipe::Error& error) { recvPromise->set_value(error); }); Error recvError = recvFuture.get(); EXPECT_FALSE(recvError) << recvError.what(); { CudaDeviceGuard guard(1); std::array data; TP_CUDA_CHECK(cudaStreamSynchronize(recvStream)); TP_CUDA_CHECK(cudaMemcpy(data.data(), ptr, kSize, cudaMemcpyDefault)); EXPECT_THAT(data, ::testing::Each(0x42)); TP_CUDA_CHECK(cudaFree(ptr)); TP_CUDA_CHECK(cudaStreamDestroy(recvStream)); } this->peers_->done(PeerGroup::kClient); this->peers_->join(PeerGroup::kClient); } void afterClient() override { if (this->peers_->endpointsInSameProcess()) { EXPECT_TRUE(initializedCudaContexts({0, 1})); } else { EXPECT_TRUE(initializedCudaContexts({1})); } } }; CHANNEL_TEST(CudaMultiGPUChannelTestSuite, SendAcrossDevices); class SendReverseAcrossDevicesTest : public ClientServerChannelTestCase { static constexpr size_t kSize = 1024; public: void run(ChannelTestHelper* helper) override { if (TestEnvironment::numCudaDevices() < 2) { GTEST_SKIP() << "Skipping test requiring >=2 CUDA devices."; } ClientServerChannelTestCase::run(helper); } private: void server(std::shared_ptr channel) override { cudaStream_t sendStream; void* ptr; { // Send happens from device #1. CudaDeviceGuard guard(1); TP_CUDA_CHECK( cudaStreamCreateWithFlags(&sendStream, cudaStreamNonBlocking)); TP_CUDA_CHECK(cudaMalloc(&ptr, kSize)); // Set buffer to target value. TP_CUDA_CHECK(cudaMemsetAsync(ptr, 0x42, kSize, sendStream)); } // Perform send and wait for completion. 
auto sendPromise = std::make_shared>(); auto sendFuture = sendPromise->get_future(); channel->send( CudaBuffer{ .ptr = ptr, .stream = sendStream, }, kSize, [sendPromise{std::move(sendPromise)}](const tensorpipe::Error& error) { sendPromise->set_value(error); }); Error sendError = sendFuture.get(); EXPECT_FALSE(sendError) << sendError.what(); { CudaDeviceGuard guard(1); TP_CUDA_CHECK(cudaFree(ptr)); TP_CUDA_CHECK(cudaStreamDestroy(sendStream)); } this->peers_->done(PeerGroup::kServer); this->peers_->join(PeerGroup::kServer); } void afterServer() override { if (this->peers_->endpointsInSameProcess()) { EXPECT_TRUE(initializedCudaContexts({0, 1})); } else { EXPECT_TRUE(initializedCudaContexts({1})); } } void client(std::shared_ptr channel) override { cudaStream_t recvStream; void* ptr; { // Recv happens on device #0. CudaDeviceGuard guard(0); TP_CUDA_CHECK( cudaStreamCreateWithFlags(&recvStream, cudaStreamNonBlocking)); TP_CUDA_CHECK(cudaMalloc(&ptr, kSize)); } // Perform recv and wait for completion. 
auto recvPromise = std::make_shared>(); auto recvFuture = recvPromise->get_future(); channel->recv( CudaBuffer{ .ptr = ptr, .stream = recvStream, }, kSize, [recvPromise{std::move(recvPromise)}](const tensorpipe::Error& error) { recvPromise->set_value(error); }); Error recvError = recvFuture.get(); EXPECT_FALSE(recvError) << recvError.what(); { CudaDeviceGuard guard(0); std::array data; TP_CUDA_CHECK(cudaStreamSynchronize(recvStream)); TP_CUDA_CHECK(cudaMemcpy(data.data(), ptr, kSize, cudaMemcpyDefault)); EXPECT_THAT(data, ::testing::Each(0x42)); TP_CUDA_CHECK(cudaFree(ptr)); TP_CUDA_CHECK(cudaStreamDestroy(recvStream)); } this->peers_->done(PeerGroup::kClient); this->peers_->join(PeerGroup::kClient); } void afterClient() override { if (this->peers_->endpointsInSameProcess()) { EXPECT_TRUE(initializedCudaContexts({0, 1})); } else { EXPECT_TRUE(initializedCudaContexts({0})); } } }; CHANNEL_TEST(CudaMultiGPUChannelTestSuite, SendReverseAcrossDevices); class SendAcrossNonDefaultDevicesTest : public ClientServerChannelTestCase { static constexpr size_t kSize = 1024; public: void run(ChannelTestHelper* helper) override { if (TestEnvironment::numCudaDevices() < 2) { GTEST_SKIP() << "Skipping test requiring >=2 CUDA devices."; } ClientServerChannelTestCase::run(helper); } private: void server(std::shared_ptr channel) override { cudaStream_t sendStream; void* ptr; { // Send happens from device #1. CudaDeviceGuard guard(1); TP_CUDA_CHECK( cudaStreamCreateWithFlags(&sendStream, cudaStreamNonBlocking)); TP_CUDA_CHECK(cudaMalloc(&ptr, kSize)); // Set buffer to target value. TP_CUDA_CHECK(cudaMemsetAsync(ptr, 0x42, kSize, sendStream)); } // Perform send and wait for completion. 
auto sendPromise = std::make_shared>(); auto sendFuture = sendPromise->get_future(); channel->send( CudaBuffer{ .ptr = ptr, .stream = sendStream, }, kSize, [sendPromise{std::move(sendPromise)}](const tensorpipe::Error& error) { sendPromise->set_value(error); }); Error sendError = sendFuture.get(); EXPECT_FALSE(sendError) << sendError.what(); { CudaDeviceGuard guard(1); TP_CUDA_CHECK(cudaFree(ptr)); TP_CUDA_CHECK(cudaStreamDestroy(sendStream)); } this->peers_->done(PeerGroup::kServer); this->peers_->join(PeerGroup::kServer); } void afterServer() override { EXPECT_TRUE(initializedCudaContexts({1})); } void client(std::shared_ptr channel) override { cudaStream_t recvStream; void* ptr; { // Recv happens on device #1. CudaDeviceGuard guard(1); TP_CUDA_CHECK( cudaStreamCreateWithFlags(&recvStream, cudaStreamNonBlocking)); TP_CUDA_CHECK(cudaMalloc(&ptr, kSize)); } // Perform recv and wait for completion. auto recvPromise = std::make_shared>(); auto recvFuture = recvPromise->get_future(); channel->recv( CudaBuffer{ .ptr = ptr, .stream = recvStream, }, kSize, [recvPromise{std::move(recvPromise)}](const tensorpipe::Error& error) { recvPromise->set_value(error); }); Error recvError = recvFuture.get(); EXPECT_FALSE(recvError) << recvError.what(); { CudaDeviceGuard guard(1); std::array data; TP_CUDA_CHECK(cudaStreamSynchronize(recvStream)); TP_CUDA_CHECK(cudaMemcpy(data.data(), ptr, kSize, cudaMemcpyDefault)); EXPECT_THAT(data, ::testing::Each(0x42)); TP_CUDA_CHECK(cudaFree(ptr)); TP_CUDA_CHECK(cudaStreamDestroy(recvStream)); } this->peers_->done(PeerGroup::kClient); this->peers_->join(PeerGroup::kClient); } void afterClient() override { EXPECT_TRUE(initializedCudaContexts({1})); } }; CHANNEL_TEST(CudaMultiGPUChannelTestSuite, SendAcrossNonDefaultDevices); ================================================ FILE: tensorpipe/test/channel/channel_test_cuda_xdtt.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. 
* All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include using namespace tensorpipe; using namespace tensorpipe::channel; class SendFromCpuToGpuTest : public ClientServerChannelTestCase { static constexpr size_t kSize = 1024; void server(std::shared_ptr channel) override { // Perform send and wait for completion. auto sendPromise = std::make_shared>(); auto sendFuture = sendPromise->get_future(); std::vector data(kSize, 0x42); channel->send( CpuBuffer{ .ptr = data.data(), }, kSize, [sendPromise{std::move(sendPromise)}](const tensorpipe::Error& error) { sendPromise->set_value(error); }); Error sendError = sendFuture.get(); EXPECT_FALSE(sendError) << sendError.what(); this->peers_->done(PeerGroup::kServer); this->peers_->join(PeerGroup::kServer); } void client(std::shared_ptr channel) override { TP_CUDA_CHECK(cudaSetDevice(0)); cudaStream_t recvStream; TP_CUDA_CHECK( cudaStreamCreateWithFlags(&recvStream, cudaStreamNonBlocking)); void* ptr; TP_CUDA_CHECK(cudaMalloc(&ptr, kSize)); // Perform recv and wait for completion. 
auto recvPromise = std::make_shared>(); auto recvFuture = recvPromise->get_future(); channel->recv( CudaBuffer{ .ptr = ptr, .stream = recvStream, }, kSize, [recvPromise{std::move(recvPromise)}](const tensorpipe::Error& error) { recvPromise->set_value(error); }); Error recvError = recvFuture.get(); EXPECT_FALSE(recvError) << recvError.what(); std::array data; TP_CUDA_CHECK(cudaStreamSynchronize(recvStream)); TP_CUDA_CHECK(cudaMemcpy(data.data(), ptr, kSize, cudaMemcpyDefault)); EXPECT_THAT(data, ::testing::Each(0x42)); TP_CUDA_CHECK(cudaFree(ptr)); this->peers_->done(PeerGroup::kClient); this->peers_->join(PeerGroup::kClient); } }; CHANNEL_TEST(CudaXDTTChannelTestSuite, SendFromCpuToGpu); class SendFromGpuToCpuTest : public ClientServerChannelTestCase { static constexpr size_t kSize = 1024; void server(std::shared_ptr channel) override { TP_CUDA_CHECK(cudaSetDevice(0)); cudaStream_t sendStream; TP_CUDA_CHECK( cudaStreamCreateWithFlags(&sendStream, cudaStreamNonBlocking)); void* ptr; TP_CUDA_CHECK(cudaMalloc(&ptr, kSize)); // Set buffer to target value. TP_CUDA_CHECK(cudaMemsetAsync(ptr, 0x42, kSize, sendStream)); // Perform send and wait for completion. auto sendPromise = std::make_shared>(); auto sendFuture = sendPromise->get_future(); channel->send( CudaBuffer{ .ptr = ptr, .stream = sendStream, }, kSize, [sendPromise{std::move(sendPromise)}](const tensorpipe::Error& error) { sendPromise->set_value(error); }); Error sendError = sendFuture.get(); EXPECT_FALSE(sendError) << sendError.what(); TP_CUDA_CHECK(cudaFree(ptr)); this->peers_->done(PeerGroup::kServer); this->peers_->join(PeerGroup::kServer); } void client(std::shared_ptr channel) override { // Perform recv and wait for completion. 
auto recvPromise = std::make_shared>(); auto recvFuture = recvPromise->get_future(); std::vector data(kSize); channel->recv( CpuBuffer{ .ptr = data.data(), }, kSize, [recvPromise{std::move(recvPromise)}](const tensorpipe::Error& error) { recvPromise->set_value(error); }); Error recvError = recvFuture.get(); EXPECT_FALSE(recvError) << recvError.what(); EXPECT_THAT(data, ::testing::Each(0x42)); this->peers_->done(PeerGroup::kClient); this->peers_->join(PeerGroup::kClient); } }; CHANNEL_TEST(CudaXDTTChannelTestSuite, SendFromGpuToCpu); class SendFromCpuToCpuTest : public ClientServerChannelTestCase { static constexpr size_t kSize = 1024; void server(std::shared_ptr channel) override { // Perform send and wait for completion. auto sendPromise = std::make_shared>(); auto sendFuture = sendPromise->get_future(); std::vector data(kSize, 0x42); channel->send( CpuBuffer{ .ptr = data.data(), }, kSize, [sendPromise{std::move(sendPromise)}](const tensorpipe::Error& error) { sendPromise->set_value(error); }); Error sendError = sendFuture.get(); EXPECT_FALSE(sendError) << sendError.what(); this->peers_->done(PeerGroup::kServer); this->peers_->join(PeerGroup::kServer); } void client(std::shared_ptr channel) override { // Perform recv and wait for completion. auto recvPromise = std::make_shared>(); auto recvFuture = recvPromise->get_future(); std::vector data(kSize); channel->recv( CpuBuffer{ .ptr = data.data(), }, kSize, [recvPromise{std::move(recvPromise)}](const tensorpipe::Error& error) { recvPromise->set_value(error); }); Error recvError = recvFuture.get(); EXPECT_FALSE(recvError) << recvError.what(); EXPECT_THAT(data, ::testing::Each(0x42)); this->peers_->done(PeerGroup::kClient); this->peers_->join(PeerGroup::kClient); } }; CHANNEL_TEST(CudaXDTTChannelTestSuite, SendFromCpuToCpu); ================================================ FILE: tensorpipe/test/channel/cma/CMakeLists.txt ================================================ # Copyright (c) Meta Platforms, Inc. 
and affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. add_executable(tensorpipe_channel_cma_probe probe.cc ) target_link_libraries(tensorpipe_channel_cma_probe PRIVATE tensorpipe ) ================================================ FILE: tensorpipe/test/channel/cma/cma_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include namespace { class CmaChannelTestHelper : public CpuChannelTestHelper { protected: std::shared_ptr makeContextInternal( std::string id) override { auto context = tensorpipe::channel::cma::create(); context->setId(std::move(id)); return context; } }; CmaChannelTestHelper helper; } // namespace INSTANTIATE_TEST_CASE_P(Cma, ChannelTestSuite, ::testing::Values(&helper)); INSTANTIATE_TEST_CASE_P(Cma, CpuChannelTestSuite, ::testing::Values(&helper)); ================================================ FILE: tensorpipe/test/channel/cma/docker_tests.sh ================================================ #!/usr/bin/env bash # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. # We use a lot of trailing backslashes inside single-quoted string literals when # we pass sub-scripts to sh -c, in order to wrap lines for long commands. # Removing them would be incorrect, hence we just silence the linter warning. # shellcheck disable=SC1004 set -eo pipefail echo "Both endpoints in same vanilla container" # This is not supposed to work, as Docker by default has a seccomp-bpf rule that # blocks the process_vm_readv syscall. 
# See https://jvns.ca/blog/2020/04/29/why-strace-doesnt-work-in-docker/
# and https://docs.docker.com/engine/security/seccomp/
TEMPDIR=$(mktemp --directory)
chmod ugo+rwx "$TEMPDIR"
echo "Using $TEMPDIR for staging data"
docker run \
  --volume "$TEMPDIR:/tmp/report" \
  --volume "$(pwd)/build:/tmp/build" \
  cimg/base:2020.01 \
  sh -c ' \
    TP_VERBOSE_LOGGING=5 \
    /tmp/build/tensorpipe/test/channel/cma/tensorpipe_channel_cma_probe \
    0 /tmp/report/socket \
    > /tmp/report/probe1_report.json & \
    probe1_pid=$!; \
    while [ ! -S /tmp/report/socket ]; do sleep 0.1; done; \
    TP_VERBOSE_LOGGING=5 \
    /tmp/build/tensorpipe/test/channel/cma/tensorpipe_channel_cma_probe \
    1 /tmp/report/socket \
    > /tmp/report/probe2_report.json & \
    probe2_pid=$!; \
    wait $probe1_pid; \
    wait $probe2_pid'
python3 \
  "$(pwd)/tensorpipe/test/channel/cma/probe_report_checker.py" \
  "$TEMPDIR/probe1_report.json" "$TEMPDIR/probe2_report.json" 0

echo "Both endpoints in same container, seccomp-bpf disabled"
# This fixes the above problem, and makes it work.
TEMPDIR=$(mktemp --directory)
chmod ugo+rwx "$TEMPDIR"
echo "Using $TEMPDIR for staging data"
docker run \
  --volume "$TEMPDIR:/tmp/report" \
  --volume "$(pwd)/build:/tmp/build" \
  --security-opt seccomp=unconfined \
  cimg/base:2020.01 \
  sh -c ' \
    TP_VERBOSE_LOGGING=5 \
    /tmp/build/tensorpipe/test/channel/cma/tensorpipe_channel_cma_probe \
    0 /tmp/report/socket \
    > /tmp/report/probe1_report.json & \
    probe1_pid=$!; \
    while [ ! -S /tmp/report/socket ]; do sleep 0.1; done; \
    TP_VERBOSE_LOGGING=5 \
    /tmp/build/tensorpipe/test/channel/cma/tensorpipe_channel_cma_probe \
    1 /tmp/report/socket \
    > /tmp/report/probe2_report.json & \
    probe2_pid=$!; \
    wait $probe1_pid; \
    wait $probe2_pid'
python3 \
  "$(pwd)/tensorpipe/test/channel/cma/probe_report_checker.py" \
  "$TEMPDIR/probe1_report.json" "$TEMPDIR/probe2_report.json" 1

echo "Both endpoints in same container, capability SYS_PTRACE added"
# This should not really matter, but Docker adds a "side effect" to this which
# also re-enables process_vm_readv in seccomp-bpf.
TEMPDIR=$(mktemp --directory)
chmod ugo+rwx "$TEMPDIR"
echo "Using $TEMPDIR for staging data"
docker run \
  --volume "$TEMPDIR:/tmp/report" \
  --volume "$(pwd)/build:/tmp/build" \
  --cap-add SYS_PTRACE \
  cimg/base:2020.01 \
  sh -c ' \
    TP_VERBOSE_LOGGING=5 \
    /tmp/build/tensorpipe/test/channel/cma/tensorpipe_channel_cma_probe \
    0 /tmp/report/socket \
    > /tmp/report/probe1_report.json & \
    probe1_pid=$!; \
    while [ ! -S /tmp/report/socket ]; do sleep 0.1; done; \
    TP_VERBOSE_LOGGING=5 \
    /tmp/build/tensorpipe/test/channel/cma/tensorpipe_channel_cma_probe \
    1 /tmp/report/socket \
    > /tmp/report/probe2_report.json & \
    probe2_pid=$!; \
    wait $probe1_pid; \
    wait $probe2_pid'
python3 \
  "$(pwd)/tensorpipe/test/channel/cma/probe_report_checker.py" \
  "$TEMPDIR/probe1_report.json" "$TEMPDIR/probe2_report.json" 1

echo "Both endpoints in same container, privileged"
# This should not really matter, but Docker adds a "side effect" to this which
# also re-enables process_vm_readv in seccomp-bpf.
TEMPDIR=$(mktemp --directory)
chmod ugo+rwx "$TEMPDIR"
echo "Using $TEMPDIR for staging data"
docker run \
  --volume "$TEMPDIR:/tmp/report" \
  --volume "$(pwd)/build:/tmp/build" \
  --privileged \
  cimg/base:2020.01 \
  sh -c ' \
    TP_VERBOSE_LOGGING=5 \
    /tmp/build/tensorpipe/test/channel/cma/tensorpipe_channel_cma_probe \
    0 /tmp/report/socket \
    > /tmp/report/probe1_report.json & \
    probe1_pid=$!; \
    while [ ! -S /tmp/report/socket ]; do sleep 0.1; done; \
    TP_VERBOSE_LOGGING=5 \
    /tmp/build/tensorpipe/test/channel/cma/tensorpipe_channel_cma_probe \
    1 /tmp/report/socket \
    > /tmp/report/probe2_report.json & \
    probe2_pid=$!; \
    wait $probe1_pid; \
    wait $probe2_pid'
python3 \
  "$(pwd)/tensorpipe/test/channel/cma/probe_report_checker.py" \
  "$TEMPDIR/probe1_report.json" "$TEMPDIR/probe2_report.json" 1

echo "Both endpoints in same container, stronger YAMA limits"
# CMA is able to work under YAMA when the latter is set to levels 0 or 1, as
# in the first case YAMA adds no extra limit and in the second case CMA will
# configure YAMA so that it allows the process to be ptraced by any other one.
# However CMA can't handle YAMA at level 2 or higher.
# We keep disabling seccomp-bpf as otherwise this would be shadowed.
sudo sh -c 'echo 2 > /proc/sys/kernel/yama/ptrace_scope'
TEMPDIR=$(mktemp --directory)
chmod ugo+rwx "$TEMPDIR"
echo "Using $TEMPDIR for staging data"
docker run \
  --volume "$TEMPDIR:/tmp/report" \
  --volume "$(pwd)/build:/tmp/build" \
  --security-opt seccomp=unconfined \
  cimg/base:2020.01 \
  sh -c ' \
    TP_VERBOSE_LOGGING=5 \
    /tmp/build/tensorpipe/test/channel/cma/tensorpipe_channel_cma_probe \
    0 /tmp/report/socket \
    > /tmp/report/probe1_report.json & \
    probe1_pid=$!; \
    while [ ! -S /tmp/report/socket ]; do sleep 0.1; done; \
    TP_VERBOSE_LOGGING=5 \
    /tmp/build/tensorpipe/test/channel/cma/tensorpipe_channel_cma_probe \
    1 /tmp/report/socket \
    > /tmp/report/probe2_report.json & \
    probe2_pid=$!; \
    wait $probe1_pid; \
    wait $probe2_pid'
python3 \
  "$(pwd)/tensorpipe/test/channel/cma/probe_report_checker.py" \
  "$TEMPDIR/probe1_report.json" "$TEMPDIR/probe2_report.json" 0
sudo sh -c 'echo 1 > /proc/sys/kernel/yama/ptrace_scope'

# TODO
# echo "Both endpoints in same container, different users/groups"

# TODO
# echo "Both endpoints in same container, same users/groups but different effective user/group"

echo "Each endpoint in own container, with separate namespace"
# This isn't supposed to work, as each container gets its own user and PID
# namespace, but CMA needs them to match. We disable seccomp-bpf to give this
# test a fighting chance.
TEMPDIR=$(mktemp --directory)
chmod ugo+rwx "$TEMPDIR"
echo "Using $TEMPDIR for staging data"
docker run \
  --volume "$TEMPDIR:/tmp/report" \
  --volume "$(pwd)/build:/tmp/build" \
  --security-opt seccomp=unconfined \
  cimg/base:2020.01 \
  sh -c ' \
    TP_VERBOSE_LOGGING=5 \
    /tmp/build/tensorpipe/test/channel/cma/tensorpipe_channel_cma_probe \
    0 /tmp/report/socket \
    > /tmp/report/probe1_report.json' &
probe1_pid=$!
while [ ! -S "$TEMPDIR/socket" ]; do sleep 0.1; done
docker run \
  --volume "$TEMPDIR:/tmp/report" \
  --volume "$(pwd)/build:/tmp/build" \
  --security-opt seccomp=unconfined \
  cimg/base:2020.01 \
  sh -c ' \
    TP_VERBOSE_LOGGING=5 \
    /tmp/build/tensorpipe/test/channel/cma/tensorpipe_channel_cma_probe \
    1 /tmp/report/socket \
    > /tmp/report/probe2_report.json' &
probe2_pid=$!
wait $probe1_pid
wait $probe2_pid
python3 \
  "$(pwd)/tensorpipe/test/channel/cma/probe_report_checker.py" \
  "$TEMPDIR/probe1_report.json" "$TEMPDIR/probe2_report.json" 0

# Docker allows a container to reuse another one's PID namespace, but doesn't
# allow the same for user namespaces.
echo "Each endpoint in own container, reusing host namespaces" # This should fix the issues of the above. TEMPDIR=$(mktemp --directory) chmod ugo+rwx "$TEMPDIR" echo "Using $TEMPDIR for staging data" docker run \ --volume "$TEMPDIR:/tmp/report" \ --volume "$(pwd)/build:/tmp/build" \ --security-opt seccomp=unconfined \ --pid host \ --userns host \ cimg/base:2020.01 \ sh -c ' \ TP_VERBOSE_LOGGING=5 \ /tmp/build/tensorpipe/test/channel/cma/tensorpipe_channel_cma_probe \ 0 /tmp/report/socket \ > /tmp/report/probe1_report.json' & probe1_pid=$! while [ ! -S "$TEMPDIR/socket" ]; do sleep 0.1; done docker run \ --volume "$TEMPDIR:/tmp/report" \ --volume "$(pwd)/build:/tmp/build" \ --security-opt seccomp=unconfined \ --pid host \ --userns host \ cimg/base:2020.01 \ sh -c ' \ TP_VERBOSE_LOGGING=5 \ /tmp/build/tensorpipe/test/channel/cma/tensorpipe_channel_cma_probe \ 1 /tmp/report/socket \ > /tmp/report/probe2_report.json' & probe2_pid=$! wait $probe1_pid wait $probe2_pid python3 \ "$(pwd)/tensorpipe/test/channel/cma/probe_report_checker.py" \ "$TEMPDIR/probe1_report.json" "$TEMPDIR/probe2_report.json" 1 echo "Each endpoint in own container, privileged, sharing PID namespace" # This should also help. TEMPDIR=$(mktemp --directory) chmod ugo+rwx "$TEMPDIR" echo "Using $TEMPDIR for staging data" docker run \ --cidfile "$TEMPDIR/probe1_container_id" \ --volume "$TEMPDIR:/tmp/report" \ --volume "$(pwd)/build:/tmp/build" \ --privileged \ cimg/base:2020.01 \ sh -c ' \ TP_VERBOSE_LOGGING=5 \ /tmp/build/tensorpipe/test/channel/cma/tensorpipe_channel_cma_probe \ 0 /tmp/report/socket \ > /tmp/report/probe1_report.json' & probe1_pid=$! while [ ! 
-S "$TEMPDIR/socket" ]; do sleep 0.1; done docker run \ --volume "$TEMPDIR:/tmp/report" \ --volume "$(pwd)/build:/tmp/build" \ --pid "container:$(cat "$TEMPDIR/probe1_container_id")" \ --privileged \ cimg/base:2020.01 \ sh -c ' \ TP_VERBOSE_LOGGING=5 \ /tmp/build/tensorpipe/test/channel/cma/tensorpipe_channel_cma_probe \ 1 /tmp/report/socket \ > /tmp/report/probe2_report.json' & probe2_pid=$! wait $probe1_pid wait $probe2_pid python3 \ "$(pwd)/tensorpipe/test/channel/cma/probe_report_checker.py" \ "$TEMPDIR/probe1_report.json" "$TEMPDIR/probe2_report.json" 1 echo "One endpoint on host, other in container, with own namespace" # This isn't supposed to work, as each container gets its own user and PID # namespace, but CMA needs them to match. We disable seccomp-bpf to give this # test a fighting chance. And also AppArmor, as it starts mattering here, # because Docker sets its own profile (docker-default) which is different than # the host's one (unconfined). TEMPDIR=$(mktemp --directory) chmod ugo+rwx "$TEMPDIR" echo "Using $TEMPDIR for staging data" docker run \ --volume "$TEMPDIR:/tmp/report" \ --volume "$(pwd)/build:/tmp/build" \ --security-opt seccomp=unconfined \ --security-opt apparmor=unconfined \ --user "$(id -u):$(id -g)" \ cimg/base:2020.01 \ sh -c ' \ TP_VERBOSE_LOGGING=5 \ /tmp/build/tensorpipe/test/channel/cma/tensorpipe_channel_cma_probe \ 0 /tmp/report/socket \ > /tmp/report/probe1_report.json' & probe1_pid=$! while [ ! -S "$TEMPDIR/socket" ]; do sleep 0.1; done sudo chmod ugo+rwx "$TEMPDIR"/socket TP_VERBOSE_LOGGING=5 \ "$(pwd)/build/tensorpipe/test/channel/cma/tensorpipe_channel_cma_probe" \ 1 "$TEMPDIR/socket" \ > "$TEMPDIR/probe2_report.json" & probe2_pid=$! wait $probe1_pid wait $probe2_pid python3 \ "$(pwd)/tensorpipe/test/channel/cma/probe_report_checker.py" \ "$TEMPDIR/probe1_report.json" "$TEMPDIR/probe2_report.json" 0 echo "One endpoint on host, other in container, reusing host namespace" # This should fix the issues of the above. 
TEMPDIR=$(mktemp --directory) chmod ugo+rwx "$TEMPDIR" echo "Using $TEMPDIR for staging data" docker run \ --volume "$TEMPDIR:/tmp/report" \ --volume "$(pwd)/build:/tmp/build" \ --security-opt seccomp=unconfined \ --security-opt apparmor=unconfined \ --pid host \ --userns host \ --user "$(id -u):$(id -g)" \ cimg/base:2020.01 \ sh -c ' \ TP_VERBOSE_LOGGING=5 \ /tmp/build/tensorpipe/test/channel/cma/tensorpipe_channel_cma_probe \ 0 /tmp/report/socket \ > /tmp/report/probe1_report.json' & probe1_pid=$! while [ ! -S "$TEMPDIR/socket" ]; do sleep 0.1; done sudo chmod ugo+rwx "$TEMPDIR"/socket TP_VERBOSE_LOGGING=5 \ "$(pwd)/build/tensorpipe/test/channel/cma/tensorpipe_channel_cma_probe" \ 1 "$TEMPDIR/socket" \ > "$TEMPDIR/probe2_report.json" & probe2_pid=$! wait $probe1_pid wait $probe2_pid python3 \ "$(pwd)/tensorpipe/test/channel/cma/probe_report_checker.py" \ "$TEMPDIR/probe1_report.json" "$TEMPDIR/probe2_report.json" 1 echo "One endpoint on host, other in container, privileged" # This should also help. TEMPDIR=$(mktemp --directory) chmod ugo+rwx "$TEMPDIR" echo "Using $TEMPDIR for staging data" docker run \ --volume "$TEMPDIR:/tmp/report" \ --volume "$(pwd)/build:/tmp/build" \ --security-opt seccomp=unconfined \ --security-opt apparmor=unconfined \ --pid host \ --user "$(id -u):$(id -g)" \ --privileged \ cimg/base:2020.01 \ sh -c ' \ TP_VERBOSE_LOGGING=5 \ /tmp/build/tensorpipe/test/channel/cma/tensorpipe_channel_cma_probe \ 0 /tmp/report/socket \ > /tmp/report/probe1_report.json' & probe1_pid=$! while [ ! -S "$TEMPDIR/socket" ]; do sleep 0.1; done sudo chmod ugo+rwx "$TEMPDIR"/socket TP_VERBOSE_LOGGING=5 \ "$(pwd)/build/tensorpipe/test/channel/cma/tensorpipe_channel_cma_probe" \ 1 "$TEMPDIR/socket" \ > "$TEMPDIR/probe2_report.json" & probe2_pid=$! 
wait $probe1_pid wait $probe2_pid python3 \ "$(pwd)/tensorpipe/test/channel/cma/probe_report_checker.py" \ "$TEMPDIR/probe1_report.json" "$TEMPDIR/probe2_report.json" 1 echo "Both endpoints on host" # Should be a no-brainer? TEMPDIR=$(mktemp --directory) chmod ugo+rwx "$TEMPDIR" echo "Using $TEMPDIR for staging data" TP_VERBOSE_LOGGING=5 \ "$(pwd)/build/tensorpipe/test/channel/cma/tensorpipe_channel_cma_probe" \ 0 "$TEMPDIR/socket" \ > "$TEMPDIR/probe1_report.json" & probe1_pid=$! while [ ! -S "$TEMPDIR/socket" ]; do sleep 0.1; done TP_VERBOSE_LOGGING=5 \ "$(pwd)/build/tensorpipe/test/channel/cma/tensorpipe_channel_cma_probe" \ 1 "$TEMPDIR/socket" \ > "$TEMPDIR/probe2_report.json" & probe2_pid=$! wait $probe1_pid wait $probe2_pid python3 \ "$(pwd)/tensorpipe/test/channel/cma/probe_report_checker.py" \ "$TEMPDIR/probe1_report.json" "$TEMPDIR/probe2_report.json" 1 ================================================ FILE: tensorpipe/test/channel/cma/probe.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #include #include #include #include #include #include namespace {} int main(int argc, char* argv[]) { TP_THROW_ASSERT_IF(argc < 1); if (argc != 3) { TP_LOG_INFO() << "Usage: " << argv[0] << " [rank] [path to a UNIX domain socket]"; return 0; } TP_LOG_INFO() << "My PID is " << ::getpid(); int rank = std::strtol(argv[1], nullptr, 10); int rv; int fd = ::socket(AF_UNIX, SOCK_STREAM, 0); TP_THROW_SYSTEM_IF(fd < 0, errno); struct sockaddr_un socketAddr; std::memset(&socketAddr, 0, sizeof(struct sockaddr_un)); socketAddr.sun_family = AF_UNIX; std::strcpy(socketAddr.sun_path, argv[2]); if (rank == 0) { rv = ::bind( fd, reinterpret_cast(&socketAddr), sizeof(struct sockaddr_un)); TP_THROW_SYSTEM_IF(rv < 0, errno); rv = ::listen(fd, 0); TP_THROW_SYSTEM_IF(rv < 0, errno); struct sockaddr_storage peerAddr; socklen_t peerAddrlen = sizeof(struct sockaddr_storage); do { rv = ::accept( fd, reinterpret_cast(&peerAddr), &peerAddrlen); TP_THROW_SYSTEM_IF(rv < 0 && errno != EINTR, errno); } while (rv < 0); int otherFd = rv; rv = ::close(fd); TP_THROW_SYSTEM_IF(rv < 0, errno); rv = ::unlink(argv[2]); TP_THROW_SYSTEM_IF(rv < 0, errno); fd = otherFd; } else { do { rv = ::connect( fd, reinterpret_cast(&socketAddr), sizeof(struct sockaddr_un)); TP_THROW_SYSTEM_IF(rv < 0 && errno != EINTR, errno); } while (rv < 0); } struct ucred peerCreds; std::memset(&peerCreds, 0, sizeof(struct ucred)); socklen_t peerCredsLen = sizeof(struct ucred); rv = ::getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &peerCreds, &peerCredsLen); pid_t peerPid = peerCreds.pid; TP_LOG_INFO() << "The peer's PID is " << peerPid; rv = ::prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY, 0, 0, 0); TP_THROW_SYSTEM_IF(rv < 0, errno); uint64_t outbox = 0x0123456789abcdef; void* outboxPtr = &outbox; TP_LOG_INFO() << "My outbox's address is 0x" << std::hex << reinterpret_cast(outboxPtr); rv = ::write(fd, &outboxPtr, sizeof(void*)); TP_THROW_SYSTEM_IF(rv < 0, errno); TP_THROW_ASSERT_IF(rv != sizeof(void*)); void* 
peerOutboxPtr; rv = ::read(fd, &peerOutboxPtr, sizeof(void*)); TP_THROW_SYSTEM_IF(rv < 0, errno); TP_THROW_ASSERT_IF(rv != sizeof(void*)); TP_LOG_INFO() << "The peer's inbox address is 0x" << std::hex << reinterpret_cast(peerOutboxPtr); uint64_t inbox; struct iovec localIov; std::memset(&localIov, 0, sizeof(struct iovec)); localIov.iov_base = &inbox; localIov.iov_len = sizeof(uint64_t); struct iovec remoteIov; std::memset(&remoteIov, 0, sizeof(struct iovec)); remoteIov.iov_base = peerOutboxPtr; remoteIov.iov_len = sizeof(uint64_t); ssize_t result = ::process_vm_readv(peerPid, &localIov, 1, &remoteIov, 1, 0); TP_LOG_INFO() << "Calling process_vm_readv returned " << result << ", errno is set to " << errno << " and my inbox now has value 0x" << std::hex << inbox; bool successful = false; if (result >= 0) { TP_THROW_ASSERT_IF(result != sizeof(uint64_t)); TP_THROW_ASSERT_IF(inbox != 0x0123456789abcdef); successful = true; } uint8_t ack; rv = ::write(fd, &ack, sizeof(uint8_t)); TP_THROW_SYSTEM_IF(rv < 0, errno); TP_THROW_ASSERT_IF(rv != sizeof(uint8_t)); rv = ::read(fd, &ack, sizeof(uint8_t)); TP_THROW_SYSTEM_IF(rv < 0, errno); TP_THROW_ASSERT_IF(rv != sizeof(uint8_t)); rv = ::close(fd); TP_THROW_SYSTEM_IF(rv < 0, errno); auto ctx = tensorpipe::channel::cma::create(); TP_LOG_INFO() << "The CMA context's viability is: " << std::boolalpha << ctx->isViable(); std::string descriptor; if (ctx->isViable()) { auto cpuDevice = tensorpipe::Device{tensorpipe::kCpuDeviceType, 0}; auto deviceDescriptors = ctx->deviceDescriptors(); auto iter = deviceDescriptors.find(cpuDevice); TP_DCHECK(iter != deviceDescriptors.end()); descriptor = iter->second; } TP_LOG_INFO() << "Its descriptor is: " << descriptor; std::cout << "{\"syscall_success\": " << successful << ", \"viability\": " << ctx->isViable() << ", \"device_descriptor\": \"" << descriptor << "\"}" << std::endl; } ================================================ FILE: tensorpipe/test/channel/cma/probe_report_checker.py 
================================================ #!/usr/bin/env python3 # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. import json import sys if __name__ == "__main__": if len(sys.argv) < 1: raise RuntimeError() if len(sys.argv) != 4: print( f"Usage: {sys.argv[0]} [first report] [second report] [supposed to work]", file=sys.stderr, ) sys.exit(0) with open(sys.argv[1], "rb") as f: first_report = json.load(f) with open(sys.argv[2], "rb") as f: second_report = json.load(f) supposed_to_work = int(sys.argv[3]) worked_in_practice = ( first_report["syscall_success"] == 1 and second_report["syscall_success"] == 1 ) if worked_in_practice != supposed_to_work: raise RuntimeError( f"The syscall didn't behave as the test expected it to. It " f"{'succeeded' if worked_in_practice else 'failed'} whereas it was " f"supposed to {'succeed' if supposed_to_work else 'fail'}." ) detected_as_working = ( first_report["viability"] == 1 and second_report["viability"] == 1 and first_report["device_descriptor"] == second_report["device_descriptor"] ) if detected_as_working != worked_in_practice: print( f"The CMA autodetection didn't correctly predict the behavior of the " f"syscall. It determined it would " f"{'succeed' if detected_as_working else 'fail'} whereas it actually " f"{'succeeded' if worked_in_practice else 'failed'}.", file=sys.stderr, ) sys.exit(1) sys.exit(0) ================================================ FILE: tensorpipe/test/channel/cuda_basic/cuda_basic_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #include namespace { class CudaBasicChannelTestHelper : public CudaChannelTestHelper { protected: std::shared_ptr makeContextInternal( std::string id) override { auto cpuContext = tensorpipe::channel::basic::create(); auto context = tensorpipe::channel::cuda_basic::create(std::move(cpuContext)); context->setId(std::move(id)); return context; } public: std::shared_ptr makePeerGroup() override { return std::make_shared(); } }; CudaBasicChannelTestHelper helper; class CudaBasicChannelTestSuite : public ChannelTestSuite {}; } // namespace class CannotCommunicateCpuToCpuTest : public ChannelTestCase { public: void run(ChannelTestHelper* /* unused */) override { ForkedThreadPeerGroup pg; pg.spawn( [&]() { auto cpuContext = tensorpipe::channel::basic::create(); auto ctx = tensorpipe::channel::cuda_basic::create(std::move(cpuContext)); auto deviceDescriptors = ctx->deviceDescriptors(); auto it = deviceDescriptors.find( tensorpipe::Device{tensorpipe::kCpuDeviceType, 0}); EXPECT_FALSE(it == deviceDescriptors.end()); auto descriptor = it->second; EXPECT_FALSE(ctx->canCommunicateWithRemote(descriptor, descriptor)); }, [&]() { // Do nothing. }); } }; CHANNEL_TEST(CudaBasicChannelTestSuite, CannotCommunicateCpuToCpu); INSTANTIATE_TEST_CASE_P( CudaBasic, ChannelTestSuite, ::testing::Values(&helper)); INSTANTIATE_TEST_CASE_P( CudaBasic, CudaChannelTestSuite, ::testing::Values(&helper)); INSTANTIATE_TEST_CASE_P( CudaBasic, CudaMultiGPUChannelTestSuite, ::testing::Values(&helper)); INSTANTIATE_TEST_CASE_P( CudaBasic, CudaXDTTChannelTestSuite, ::testing::Values(&helper)); INSTANTIATE_TEST_CASE_P( CudaBasic, CudaBasicChannelTestSuite, ::testing::Values(&helper)); ================================================ FILE: tensorpipe/test/channel/cuda_gdr/cuda_gdr_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include namespace { class CudaGdrChannelTestHelper : public CudaChannelTestHelper { protected: std::shared_ptr makeContextInternal( std::string id) override { auto context = tensorpipe::channel::cuda_gdr::create(); context->setId(std::move(id)); return context; } public: std::shared_ptr makePeerGroup() override { return std::make_shared(); } }; CudaGdrChannelTestHelper helper; } // namespace INSTANTIATE_TEST_CASE_P(CudaGdr, ChannelTestSuite, ::testing::Values(&helper)); INSTANTIATE_TEST_CASE_P( CudaGdr, CudaChannelTestSuite, ::testing::Values(&helper)); INSTANTIATE_TEST_CASE_P( CudaGdr, CudaMultiGPUChannelTestSuite, ::testing::Values(&helper)); ================================================ FILE: tensorpipe/test/channel/cuda_helpers.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include #include #include #include #include #include #include #include namespace tensorpipe { inline bool isContextOpenOnDevice(const NvmlLib& nvmlLib, nvmlDevice_t device) { unsigned int count = 0; std::vector processInfos; while (true) { nvmlReturn_t res = nvmlLib.deviceGetComputeRunningProcesses( device, &count, processInfos.data()); processInfos.resize(count); if (res == NVML_SUCCESS) { break; } if (res == NVML_ERROR_INSUFFICIENT_SIZE) { continue; } TP_NVML_CHECK(nvmlLib, res); } pid_t myPid = ::getpid(); for (const nvmlProcessInfo_t& processInfo : processInfos) { if (processInfo.pid == myPid) { return true; } } return false; } inline ::testing::AssertionResult initializedCudaContexts( const std::vector& expectedDeviceIndices) { // This check won't work when the test is running in a PID namespace, as NVML // will return the PIDs in the root namespace but it doesn't seem possible for // us to map them back to our namespace. Hence we use an env var to allow to // disable this check in such environments. char* shouldSkip = std::getenv("TP_SKIP_CHECK_OPEN_CUDA_CTXS"); if (shouldSkip != nullptr) { return ::testing::AssertionSuccess(); } Error error; CudaLib cudaLib; std::tie(error, cudaLib) = CudaLib::create(); TP_THROW_ASSERT_IF(error) << error.what(); NvmlLib nvmlLib; std::tie(error, nvmlLib) = NvmlLib::create(); TP_THROW_ASSERT_IF(error) << error.what(); std::vector uuids = getUuidsOfVisibleDevices(cudaLib); for (int deviceIdx = 0; deviceIdx < uuids.size(); deviceIdx++) { // NVML uses a different format for UUIDs. 
std::string nvmlUuid = "GPU-" + uuids[deviceIdx]; nvmlDevice_t nvmlDevice; TP_NVML_CHECK( nvmlLib, nvmlLib.deviceGetHandleByUUID(nvmlUuid.c_str(), &nvmlDevice)); bool actualHasCtx = isContextOpenOnDevice(nvmlLib, nvmlDevice); bool expectedHasCtx = std::find( expectedDeviceIndices.begin(), expectedDeviceIndices.end(), deviceIdx) != expectedDeviceIndices.end(); if (actualHasCtx && !expectedHasCtx) { return ::testing::AssertionFailure() << "a CUDA context was initialized on device #" << deviceIdx << " but that shouldn't have happened"; } if (!actualHasCtx && expectedHasCtx) { return ::testing::AssertionFailure() << "a CUDA context should have been initialized on device #" << deviceIdx << " but that didn't happen"; } } return ::testing::AssertionSuccess(); } } // namespace tensorpipe ================================================ FILE: tensorpipe/test/channel/cuda_ipc/cuda_ipc_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include namespace { class CudaIpcChannelTestHelper : public CudaChannelTestHelper { protected: std::shared_ptr makeContextInternal( std::string id) override { auto context = tensorpipe::channel::cuda_ipc::create(); context->setId(std::move(id)); return context; } public: std::shared_ptr makePeerGroup() override { return std::make_shared(); } }; CudaIpcChannelTestHelper helper; class CudaIpcChannelTestSuite : public ChannelTestSuite {}; } // namespace class CannotCommunicateInSameProcessTest : public ChannelTestCase { public: void run(ChannelTestHelper* /* unused */) override { ForkedThreadPeerGroup pg; pg.spawn( [&]() { auto ctx = tensorpipe::channel::cuda_ipc::create(); auto deviceDescriptors = ctx->deviceDescriptors(); EXPECT_GT(deviceDescriptors.size(), 0); auto descriptor = deviceDescriptors.begin()->second; // From within a given process, the device descriptors will be the // same. EXPECT_FALSE(ctx->canCommunicateWithRemote(descriptor, descriptor)); }, [&]() { // Do nothing. }); } }; CHANNEL_TEST(CudaIpcChannelTestSuite, CannotCommunicateInSameProcess); INSTANTIATE_TEST_CASE_P(CudaIpc, ChannelTestSuite, ::testing::Values(&helper)); INSTANTIATE_TEST_CASE_P( CudaIpc, CudaChannelTestSuite, ::testing::Values(&helper)); INSTANTIATE_TEST_CASE_P( CudaIpc, CudaMultiGPUChannelTestSuite, ::testing::Values(&helper)); INSTANTIATE_TEST_CASE_P( CudaIpc, CudaIpcChannelTestSuite, ::testing::Values(&helper)); ================================================ FILE: tensorpipe/test/channel/cuda_xth/cuda_xth_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include namespace { class CudaXthChannelTestHelper : public CudaChannelTestHelper { protected: std::shared_ptr makeContextInternal( std::string id) override { auto context = tensorpipe::channel::cuda_xth::create(); context->setId(std::move(id)); return context; } public: std::shared_ptr makePeerGroup() override { return std::make_shared(); } }; CudaXthChannelTestHelper helper; } // namespace INSTANTIATE_TEST_CASE_P(CudaXth, ChannelTestSuite, ::testing::Values(&helper)); INSTANTIATE_TEST_CASE_P( CudaXth, CudaChannelTestSuite, ::testing::Values(&helper)); INSTANTIATE_TEST_CASE_P( CudaXth, CudaMultiGPUChannelTestSuite, ::testing::Values(&helper)); ================================================ FILE: tensorpipe/test/channel/kernel.cu ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include __global__ void _slowKernel(char* ptr, int sz) { int idx = blockIdx.x * blockDim.x + threadIdx.x; for (; idx < sz; idx += (gridDim.x * blockDim.x)) { for (int i = 0; i < 100000; ++i) { ptr[idx] += ptr[(idx + 1007) % sz] + i; } } } void slowKernel(void* ptr, int kSize, cudaStream_t stream) { _slowKernel<<<128, 128, 0, stream>>>((char*)ptr, kSize); } ================================================ FILE: tensorpipe/test/channel/kernel.cuh ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include // This kernel takes time and puts garbage data in the buffer. It is used to // test proper synchronization in CUDA channels. 
void slowKernel(void* ptr, int kSize, cudaStream_t stream); ================================================ FILE: tensorpipe/test/channel/mpt/mpt_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include namespace { class MptChannelTestHelper : public CpuChannelTestHelper { protected: std::shared_ptr makeContextInternal( std::string id) override { std::vector> contexts = { tensorpipe::transport::uv::create(), tensorpipe::transport::uv::create(), tensorpipe::transport::uv::create()}; std::vector> listeners = { contexts[0]->listen("127.0.0.1"), contexts[1]->listen("127.0.0.1"), contexts[2]->listen("127.0.0.1")}; auto context = tensorpipe::channel::mpt::create( std::move(contexts), std::move(listeners)); context->setId(std::move(id)); return context; } }; MptChannelTestHelper helper; class MptChannelTestSuite : public ChannelTestSuite {}; } // namespace class ContextIsNotJoinedTest : public ChannelTestCase { // Because it's static we must define it out-of-line (until C++-17, where we // can mark this inline). 
static const std::string kReady; public: void run(ChannelTestHelper* helper) override { auto addr = "127.0.0.1"; helper_ = helper; peers_ = helper_->makePeerGroup(); peers_->spawn( [&] { auto context = tensorpipe::transport::uv::create(); context->setId("server_harness"); auto listener = context->listen(addr); std::promise> connectionProm; listener->accept( [&](const tensorpipe::Error& error, std::shared_ptr connection) { ASSERT_FALSE(error) << error.what(); connectionProm.set_value(std::move(connection)); }); peers_->send(PeerGroup::kClient, listener->addr()); server(connectionProm.get_future().get()); context->join(); }, [&] { auto context = tensorpipe::transport::uv::create(); context->setId("client_harness"); auto laddr = peers_->recv(PeerGroup::kClient); client(context->connect(laddr)); context->join(); }); } void server(std::shared_ptr conn) { std::shared_ptr context = this->helper_->makeContext("server"); this->peers_->send(PeerGroup::kClient, kReady); context->createChannel( {std::move(conn)}, tensorpipe::channel::Endpoint::kListen); } void client(std::shared_ptr conn) { std::shared_ptr context = this->helper_->makeContext("client"); EXPECT_EQ(kReady, this->peers_->recv(PeerGroup::kClient)); context->createChannel( {std::move(conn)}, tensorpipe::channel::Endpoint::kConnect); } protected: ChannelTestHelper* helper_; std::shared_ptr peers_; }; const std::string ContextIsNotJoinedTest::kReady = "ready"; CHANNEL_TEST(MptChannelTestSuite, ContextIsNotJoined); INSTANTIATE_TEST_CASE_P(Mpt, ChannelTestSuite, ::testing::Values(&helper)); INSTANTIATE_TEST_CASE_P(Mpt, CpuChannelTestSuite, ::testing::Values(&helper)); INSTANTIATE_TEST_CASE_P(Mpt, MptChannelTestSuite, ::testing::Values(&helper)); ================================================ FILE: tensorpipe/test/channel/xth/xth_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include namespace { class XthChannelTestHelper : public CpuChannelTestHelper { protected: std::shared_ptr makeContextInternal( std::string id) override { auto context = tensorpipe::channel::xth::create(); context->setId(std::move(id)); return context; } }; XthChannelTestHelper helper; } // namespace INSTANTIATE_TEST_CASE_P(Xth, ChannelTestSuite, ::testing::Values(&helper)); INSTANTIATE_TEST_CASE_P(Xth, CpuChannelTestSuite, ::testing::Values(&helper)); ================================================ FILE: tensorpipe/test/common/cuda_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include #include namespace { tensorpipe::CudaLib getCudaLib() { tensorpipe::Error error; tensorpipe::CudaLib cudaLib; std::tie(error, cudaLib) = tensorpipe::CudaLib::create(); EXPECT_FALSE(error) << error.what(); return cudaLib; } } // namespace // This tests whether we can retrieve the index of the device on which a pointer // resides under "normal" circumstances (in the same context where it was // allocated, or in a "fresh" thread). 
TEST(Cuda, DeviceForPointer) { if (TestEnvironment::numCudaDevices() < 2) { GTEST_SKIP() << "Skipping test requiring >=2 CUDA devices."; } ForkedThreadPeerGroup pg; pg.spawn( [&]() { TP_CUDA_CHECK(cudaSetDevice(1)); void* ptr; TP_CUDA_CHECK(cudaMalloc(&ptr, 1024)); EXPECT_EQ(tensorpipe::cudaDeviceForPointer(getCudaLib(), ptr), 1); std::string ptrStr( reinterpret_cast(&ptr), reinterpret_cast(&ptr) + sizeof(void*)); pg.send(PeerGroup::kClient, ptrStr); }, [&]() { std::string ptrStr = pg.recv(PeerGroup::kClient); void* ptr = *reinterpret_cast(&ptrStr[0]); EXPECT_EQ(tensorpipe::cudaDeviceForPointer(getCudaLib(), ptr), 1); }); } // This tests whether we can retrieve the index of the device on which a pointer // resided after we've explicitly set the current device to an invalid value. // This is known to cause problems in recent versions of CUDA, possibly because // of a bug. TEST(Cuda, DeviceForPointerAfterReset) { if (TestEnvironment::numCudaDevices() < 2) { GTEST_SKIP() << "Skipping test requiring >=2 CUDA devices."; } ForkedThreadPeerGroup pg; pg.spawn( [&]() { TP_CUDA_CHECK(cudaSetDevice(1)); void* ptr; TP_CUDA_CHECK(cudaMalloc(&ptr, 1024)); TP_CUDA_CHECK(cudaSetDevice(0)); EXPECT_EQ(tensorpipe::cudaDeviceForPointer(getCudaLib(), ptr), 1); std::string ptrStr( reinterpret_cast(&ptr), reinterpret_cast(&ptr) + sizeof(void*)); pg.send(PeerGroup::kClient, ptrStr); }, [&]() { std::string ptrStr = pg.recv(PeerGroup::kClient); void* ptr = *reinterpret_cast(&ptrStr[0]); TP_CUDA_CHECK(cudaSetDevice(0)); EXPECT_EQ(tensorpipe::cudaDeviceForPointer(getCudaLib(), ptr), 1); }); } ================================================ FILE: tensorpipe/test/common/defs_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include TEST(Defs, Exception) { EXPECT_THROW(TP_THROW_EINVAL(), std::invalid_argument); EXPECT_THROW(TP_THROW_EINVAL() << "hola", std::invalid_argument); EXPECT_THROW(TP_THROW_EINVAL() << "adioshola", std::invalid_argument); EXPECT_THROW(TP_THROW_SYSTEM(ENODATA) << "adioshola", std::system_error); EXPECT_THROW(TP_THROW_SYSTEM(EBUSY), std::system_error); EXPECT_THROW(TP_THROW_SYSTEM(EBUSY) << "my message", std::system_error); } ================================================ FILE: tensorpipe/test/common/epoll_loop_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include using namespace tensorpipe; namespace { class Handler : public EpollLoop::EventHandler { public: void handleEventsFromLoop(int events) override { std::unique_lock lock(m_); events_.push_back(events); cv_.notify_all(); } int nextEvents() { std::unique_lock lock(m_); cv_.wait(lock, [&]() { return !events_.empty(); }); int events = events_.front(); events_.pop_front(); return events; } private: std::mutex m_; std::condition_variable cv_; std::deque events_; }; // Monitor an fd for events and execute function when triggered. // // The lifetime of an instance dictates when the specified function // may be called. The function is guaranteed to not be called after // the monitor has been destructed. 
// class FunctionEventHandler : public EpollLoop::EventHandler, public std::enable_shared_from_this { public: using TFunction = std::function; FunctionEventHandler( DeferredExecutor& deferredExecutor, EpollLoop& loop, int fd, int event, TFunction fn); ~FunctionEventHandler() override; void start(); void cancel(); void handleEventsFromLoop(int events) override; private: DeferredExecutor& deferredExecutor_; EpollLoop& loop_; const int fd_; const int event_; TFunction fn_; std::mutex mutex_; bool cancelled_{false}; }; FunctionEventHandler::FunctionEventHandler( DeferredExecutor& deferredExecutor, EpollLoop& loop, int fd, int event, TFunction fn) : deferredExecutor_(deferredExecutor), loop_(loop), fd_(fd), event_(event), fn_(std::move(fn)) {} FunctionEventHandler::~FunctionEventHandler() { cancel(); } void FunctionEventHandler::start() { deferredExecutor_.runInLoop( [&]() { loop_.registerDescriptor(fd_, event_, shared_from_this()); }); } void FunctionEventHandler::cancel() { std::unique_lock lock(mutex_); if (!cancelled_) { deferredExecutor_.runInLoop([&]() { loop_.unregisterDescriptor(fd_); cancelled_ = true; }); } } void FunctionEventHandler::handleEventsFromLoop(int events) { if (events & event_) { fn_(*this); } } // Instantiates an event monitor for the specified fd. template std::shared_ptr createMonitor( DeferredExecutor& reactor, EpollLoop& loop, std::shared_ptr shared, int fd, int event, std::function fn) { auto handler = std::make_shared( reactor, loop, fd, event, [weak{std::weak_ptr{shared}}, fn{std::move(fn)}](FunctionEventHandler& handler) { auto shared = weak.lock(); if (shared) { fn(*shared, handler); } }); handler->start(); return handler; } } // namespace TEST(ShmLoop, RegisterUnregister) { OnDemandDeferredExecutor deferredExecutor; EpollLoop loop{deferredExecutor}; auto handler = std::make_shared(); auto efd = Fd(eventfd(0, EFD_NONBLOCK)); { // Test if writable (always). 
deferredExecutor.runInLoop([&]() { loop.registerDescriptor(efd.fd(), EPOLLOUT | EPOLLONESHOT, handler); }); ASSERT_EQ(handler->nextEvents(), EPOLLOUT); efd.writeOrThrow(1337); // Test if readable (only if previously written to). deferredExecutor.runInLoop([&]() { loop.registerDescriptor(efd.fd(), EPOLLIN | EPOLLONESHOT, handler); }); ASSERT_EQ(handler->nextEvents(), EPOLLIN); ASSERT_EQ(efd.readOrThrow(), 1337); // Test if we can unregister the descriptor. deferredExecutor.runInLoop([&]() { loop.unregisterDescriptor(efd.fd()); }); } loop.join(); } TEST(ShmLoop, Monitor) { OnDemandDeferredExecutor deferredExecutor; EpollLoop loop{deferredExecutor}; auto efd = Fd(eventfd(0, EFD_NONBLOCK)); constexpr uint64_t kValue = 1337; { std::mutex mutex; std::condition_variable cv; bool done = false; // Test if writable (always). auto shared = std::make_shared(1338); auto monitor = createMonitor( deferredExecutor, loop, shared, efd.fd(), EPOLLOUT, [&](int& i, FunctionEventHandler& handler) { EXPECT_EQ(i, 1338); efd.writeOrThrow(kValue); handler.cancel(); { std::unique_lock lock(mutex); done = true; cv.notify_all(); } }); // Wait for monitor to trigger and perform a write. std::unique_lock lock(mutex); cv.wait(lock, [&]() { return done; }); } { std::mutex mutex; std::condition_variable cv; bool done = false; uint64_t value = 0; // Test if readable (only if previously written to). auto shared = std::make_shared(1338); auto monitor = createMonitor( deferredExecutor, loop, shared, efd.fd(), EPOLLIN, [&](int& i, FunctionEventHandler& handler) { EXPECT_EQ(i, 1338); value = efd.readOrThrow(); handler.cancel(); { std::unique_lock lock(mutex); done = true; cv.notify_all(); } }); // Wait for monitor to trigger and perform a read. std::unique_lock lock(mutex); cv.wait(lock, [&]() { return done; }); // Verify we read the correct value. 
ASSERT_EQ(value, kValue); } loop.join(); } TEST(ShmLoop, Defer) { OnDemandDeferredExecutor deferredExecutor; auto promise = std::make_shared>(); auto future = promise->get_future(); deferredExecutor.deferToLoop([promise]() { promise->set_value(); }); future.wait(); ASSERT_TRUE(future.valid()); } ================================================ FILE: tensorpipe/test/common/ringbuffer_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include using namespace tensorpipe; struct TestData { uint16_t a; uint16_t b; uint16_t c; bool operator==(const TestData& other) const { return a == other.a && b == other.b && c == other.c; } }; constexpr static int kNumRingbufferRoles = 2; constexpr static int kConsumerRoleIdx = 0; constexpr static int kProducerRoleIdx = 1; using Consumer = RingBufferRole; using Producer = RingBufferRole; // Holds and owns the memory for the ringbuffer's header and data. class RingBufferStorage { public: explicit RingBufferStorage(size_t size) : header_(size) {} RingBuffer getRb() { return {&header_, data_.get()}; } private: RingBufferHeader header_; std::unique_ptr data_ = std::make_unique(header_.kDataPoolByteSize); }; size_t usedSize(RingBuffer& rb) { return rb.getHeader().template readMarker() - rb.getHeader().template readMarker(); } TEST(RingBuffer, WriteCopy) { EXPECT_EQ(sizeof(TestData), 6); // 16 bytes buffer. Fits two full TestData (each 6). size_t size = 1u << 4; RingBufferStorage storage(size); RingBuffer rb = storage.getRb(); // Make a producer. Producer p{rb}; // Make a consumer. 
Consumer c{rb}; EXPECT_EQ(usedSize(rb), 0); TestData d0{.a = 0xBA98, .b = 0x7654, .c = 0xA312}; TestData d1{.a = 0xA987, .b = 0x7777, .c = 0x2812}; TestData d2{.a = 0xFFFF, .b = 0x3333, .c = 0x1212}; { ssize_t ret = p.write(&d0, sizeof(d0)); EXPECT_EQ(ret, sizeof(TestData)); } EXPECT_EQ(usedSize(rb), 6); { ssize_t ret = p.write(&d1, sizeof(d1)); EXPECT_EQ(ret, sizeof(TestData)); } EXPECT_EQ(usedSize(rb), 12); { ssize_t ret = p.write(&d2, sizeof(d2)); EXPECT_EQ(ret, -ENODATA) << "Needs 2 more bytes to write the 6 required, " "because 12 out of 16 are used."; } TestData r; { ssize_t ret = c.read(&r, sizeof(r)); EXPECT_EQ(ret, sizeof(r)); EXPECT_EQ(r, d0); } { ssize_t ret = c.read(&r, sizeof(r)); EXPECT_EQ(ret, sizeof(r)); EXPECT_EQ(r, d1); } // It should be empty by now. EXPECT_EQ(usedSize(rb), 0); { ssize_t ret = p.write(&d2, sizeof(d2)); EXPECT_EQ(ret, sizeof(TestData)); } { ssize_t ret = c.read(&r, sizeof(r)); EXPECT_EQ(ret, sizeof(r)); EXPECT_EQ(r, d2); } // It should be empty by now. EXPECT_EQ(usedSize(rb), 0); } TEST(RingBuffer, ReadMultipleElems) { // 256 bytes buffer. size_t size = 1u << 8u; RingBufferStorage storage(size); RingBuffer rb = storage.getRb(); // Make a producer. Producer p{rb}; // Make a consumer. Consumer c{rb}; EXPECT_EQ(usedSize(rb), 0); uint16_t n = 0xACAC; // fits 128 times { for (int i = 0; i < 128; ++i) { ssize_t ret = p.write(&n, sizeof(n)); EXPECT_EQ(ret, sizeof(n)); } // It must be full by now. EXPECT_EQ(usedSize(rb), 256); ssize_t ret = p.write(&n, sizeof(n)); EXPECT_EQ(ret, -ENODATA); } { uint8_t b = 0xEE; ssize_t ret = p.write(&b, sizeof(b)); EXPECT_EQ(ret, -ENODATA) << "Needs an extra byte"; } { // read the three bytes at once. ssize_t ret; ret = c.startTx(); EXPECT_EQ(ret, 0); std::array r; ret = c.readInTx(r.data(), sizeof(r)); EXPECT_EQ(ret, 3); EXPECT_EQ(r[0], 0xAC); EXPECT_EQ(r[1], 0xAC); EXPECT_EQ(r[2], 0xAC); ret = c.commitTx(); EXPECT_EQ(ret, 0); } { // read 253 bytes at once. 
ssize_t ret; ret = c.startTx(); EXPECT_EQ(ret, 0); std::array r; ret = c.readInTx(r.data(), sizeof(r)); EXPECT_EQ(ret, 253); for (int i = 0; i < 253; ++i) { EXPECT_EQ(r[i], 0xAC); } ret = c.commitTx(); EXPECT_EQ(ret, 0); } { // No more elements ssize_t ret; ret = c.startTx(); EXPECT_EQ(ret, 0); uint8_t ch; ret = c.readInTx(&ch, sizeof(ch)); EXPECT_EQ(ret, -ENODATA); ret = c.cancelTx(); EXPECT_EQ(ret, 0); EXPECT_TRUE(!c.inTx()) << "Canceled transaction should've been canceled"; } } TEST(RingBuffer, CopyWrapping) { // 8 bytes buffer. size_t size = 1u << 3; RingBufferStorage storage(size); RingBuffer rb = storage.getRb(); // Make a producer. Producer p{rb}; // Make a consumer. Consumer c{rb}; EXPECT_EQ(usedSize(rb), 0); uint8_t ch = 0xA7; uint64_t n = 0xFFFFFFFFFFFFFFFF; // Put one byte. EXPECT_EQ(rb.getHeader().template readMarker(), 0); EXPECT_EQ(rb.getHeader().template readMarker(), 0); EXPECT_EQ(usedSize(rb), 0); ssize_t ret = p.write(&ch, sizeof(ch)); EXPECT_EQ(ret, sizeof(ch)); EXPECT_EQ(rb.getHeader().template readMarker(), 1); EXPECT_EQ(rb.getHeader().template readMarker(), 0); EXPECT_EQ(usedSize(rb), 1); // Next 8 bytes won't fit. ret = p.write(&n, sizeof(n)); EXPECT_EQ(ret, -ENODATA) << "Needs an extra byte to write the 8 bytes element. " "Capacity 8, used 1."; // Remove the one byte in, now head is one off. uint8_t cr; uint64_t nr; ret = c.read(&cr, sizeof(cr)); EXPECT_EQ(ret, sizeof(cr)); EXPECT_EQ(cr, ch); EXPECT_EQ(rb.getHeader().template readMarker(), 1); EXPECT_EQ(rb.getHeader().template readMarker(), 1); // Next 8 bytes will fit, but wrap. ret = p.write(&n, sizeof(n)); EXPECT_EQ(ret, sizeof(n)); EXPECT_EQ(rb.getHeader().template readMarker(), 9); EXPECT_EQ(rb.getHeader().template readMarker(), 1); ret = c.read(&nr, sizeof(nr)); EXPECT_EQ(ret, sizeof(nr)); EXPECT_EQ(nr, n); EXPECT_EQ(rb.getHeader().template readMarker(), 9); EXPECT_EQ(rb.getHeader().template readMarker(), 9); } TEST(RingBuffer, ReadTxWrappingOneCons) { // 8 bytes buffer. 
size_t size = 1u << 3; RingBufferStorage storage(size); RingBuffer rb = storage.getRb(); // Make a producer. Producer p{rb}; // Make a consumer. Consumer c1{rb}; EXPECT_EQ(usedSize(rb), 0); uint8_t ch = 0xA7; uint64_t n = 0xFFFFFFFFFFFFFFFF; // Put one byte. { EXPECT_EQ(rb.getHeader().template readMarker(), 0); EXPECT_EQ(rb.getHeader().template readMarker(), 0); EXPECT_EQ(usedSize(rb), 0); ssize_t ret = p.write(&ch, sizeof(ch)); EXPECT_EQ(ret, sizeof(ch)); EXPECT_EQ(rb.getHeader().template readMarker(), 1); EXPECT_EQ(rb.getHeader().template readMarker(), 0); EXPECT_EQ(usedSize(rb), 1); } // Next 8 bytes won't fit. { ssize_t ret = p.write(&n, sizeof(n)); EXPECT_EQ(ret, -ENODATA) << "Needs an extra byte to write the 8 bytes element. " "Capacity 8, used 1."; } // Remove the one byte in, now head is one off. EXPECT_FALSE(c1.inTx()); { // Start c1 read Tx ssize_t ret; ret = c1.startTx(); EXPECT_EQ(ret, 0); uint8_t rch; ret = c1.readInTx(&rch, sizeof(rch)); EXPECT_EQ(ret, sizeof(uint8_t)); EXPECT_EQ(rch, ch); EXPECT_EQ(rb.getHeader().template readMarker(), 1); EXPECT_EQ(rb.getHeader().template readMarker(), 0); EXPECT_TRUE(c1.inTx()); } { // Complete c1's Tx. ssize_t ret = c1.commitTx(); EXPECT_EQ(ret, 0); EXPECT_EQ(rb.getHeader().template readMarker(), 1); EXPECT_EQ(rb.getHeader().template readMarker(), 1); } { // Retrying to commit should fail. ssize_t ret = c1.commitTx(); EXPECT_EQ(ret, -EINVAL); } { // Next 8 bytes will fit, but wrap. ssize_t ret = p.write(&n, sizeof(n)); EXPECT_EQ(ret, sizeof(n)); EXPECT_EQ(rb.getHeader().template readMarker(), 9); EXPECT_EQ(rb.getHeader().template readMarker(), 1); } { // Start c1 read Tx again. ssize_t ret; ret = c1.startTx(); EXPECT_EQ(ret, 0); uint64_t rn; ret = c1.readInTx(&rn, sizeof(rn)); EXPECT_EQ(ret, sizeof(uint64_t)); EXPECT_EQ(rn, n); EXPECT_EQ(rb.getHeader().template readMarker(), 9); EXPECT_EQ(rb.getHeader().template readMarker(), 1); EXPECT_TRUE(c1.inTx()); } { // Complete c1. 
ssize_t ret = c1.commitTx(); EXPECT_EQ(ret, 0); ret = c1.commitTx(); EXPECT_EQ(ret, -EINVAL); } { // Next 8 bytes will fit, but wrap. ssize_t ret = p.write(&n, sizeof(n)); EXPECT_EQ(ret, sizeof(n)); EXPECT_EQ(rb.getHeader().template readMarker(), 17); EXPECT_EQ(rb.getHeader().template readMarker(), 9); } { ssize_t ret; ret = c1.startTx(); EXPECT_EQ(ret, 0); uint64_t rn; ret = c1.readInTx(&rn, sizeof(rn)); EXPECT_EQ(ret, sizeof(uint64_t)); EXPECT_EQ(rn, n); EXPECT_EQ(rb.getHeader().template readMarker(), 17); EXPECT_EQ(rb.getHeader().template readMarker(), 9); } { // Cancel tx, data should be readable again. ssize_t ret = c1.cancelTx(); EXPECT_EQ(ret, 0); } { // Now c1 can read. ssize_t ret; ret = c1.startTx(); EXPECT_EQ(ret, 0); uint64_t rn; ret = c1.readInTx(&rn, sizeof(rn)); EXPECT_EQ(ret, sizeof(uint64_t)); EXPECT_EQ(rn, n); EXPECT_EQ(rb.getHeader().template readMarker(), 17); EXPECT_EQ(rb.getHeader().template readMarker(), 9); } { // Commit succeds. ssize_t ret = c1.commitTx(); EXPECT_EQ(ret, 0); EXPECT_FALSE(c1.inTx()); } } TEST(RingBuffer, ReadTxWrapping) { // 8 bytes buffer. size_t size = 1u << 3; RingBufferStorage storage(size); RingBuffer rb = storage.getRb(); // Make a producer. Producer p{rb}; // Make consumers. Consumer c1{rb}; Consumer c2{rb}; EXPECT_EQ(usedSize(rb), 0); uint8_t ch = 0xA7; uint64_t n = 0x3333333333333333; // Put one byte. { EXPECT_EQ(rb.getHeader().template readMarker(), 0); EXPECT_EQ(rb.getHeader().template readMarker(), 0); EXPECT_EQ(usedSize(rb), 0); ssize_t ret = p.write(&ch, sizeof(ch)); EXPECT_EQ(ret, sizeof(ch)); EXPECT_EQ(rb.getHeader().template readMarker(), 1); EXPECT_EQ(rb.getHeader().template readMarker(), 0); EXPECT_EQ(usedSize(rb), 1); } // Next 8 bytes won't fit. { ssize_t ret = p.write(&n, sizeof(n)); EXPECT_EQ(ret, -ENODATA) << "Needs an extra byte to write the 8 bytes element. " "Capacity 8, used 1."; } // Remove the one byte in, now head is one off. 
EXPECT_FALSE(c1.inTx()); EXPECT_FALSE(c2.inTx()); { // Start c1 read Tx ssize_t ret; ret = c1.startTx(); EXPECT_EQ(ret, 0); uint8_t rch; ret = c1.readInTx(&rch, sizeof(rch)); EXPECT_EQ(ret, sizeof(uint8_t)); EXPECT_EQ(rch, ch); EXPECT_EQ(rb.getHeader().template readMarker(), 1); EXPECT_EQ(rb.getHeader().template readMarker(), 0); EXPECT_TRUE(c1.inTx()); } { // Complete c1's Tx. ssize_t ret = c1.commitTx(); EXPECT_EQ(ret, 0); EXPECT_EQ(rb.getHeader().template readMarker(), 1); EXPECT_EQ(rb.getHeader().template readMarker(), 1); } { // Retrying to commit should fail. ssize_t ret = c1.commitTx(); EXPECT_EQ(ret, -EINVAL); } { // Next 8 bytes will fit, but wrap. ssize_t ret = p.write(&n, sizeof(n)); EXPECT_EQ(ret, sizeof(n)); EXPECT_EQ(rb.getHeader().template readMarker(), 9); EXPECT_EQ(rb.getHeader().template readMarker(), 1); } { // Start c1 read Tx again. ssize_t ret; ret = c1.startTx(); EXPECT_EQ(ret, 0); uint64_t rn; ret = c1.readInTx(&rn, sizeof(rn)); EXPECT_EQ(ret, sizeof(uint64_t)); EXPECT_EQ(rn, n); EXPECT_EQ(rb.getHeader().template readMarker(), 9); EXPECT_EQ(rb.getHeader().template readMarker(), 1); EXPECT_TRUE(c1.inTx()); } { // Try to start read tx before c1 completing and get -EAGAIN. ssize_t ret; ret = c2.startTx(); EXPECT_EQ(ret, -EAGAIN); } { // Complete c1. ssize_t ret = c1.commitTx(); EXPECT_EQ(ret, 0); ret = c1.commitTx(); EXPECT_EQ(ret, -EINVAL); } { // Next 8 bytes will fit, but wrap. ssize_t ret = p.write(&n, sizeof(n)); EXPECT_EQ(ret, sizeof(n)); EXPECT_EQ(rb.getHeader().template readMarker(), 17); EXPECT_EQ(rb.getHeader().template readMarker(), 9); } { ssize_t ret; ret = c2.startTx(); EXPECT_EQ(ret, 0); uint64_t rn; ret = c2.readInTx(&rn, sizeof(rn)); EXPECT_EQ(ret, sizeof(uint64_t)); EXPECT_EQ(rn, n); EXPECT_EQ(rb.getHeader().template readMarker(), 17); EXPECT_EQ(rb.getHeader().template readMarker(), 9); } { // Cancel tx, data should be readable again. ssize_t ret = c2.cancelTx(); EXPECT_EQ(ret, 0); } { // Now c1 can read. 
ssize_t ret; ret = c1.startTx(); EXPECT_EQ(ret, 0); uint64_t rn; ret = c1.readInTx(&rn, sizeof(rn)); EXPECT_EQ(ret, sizeof(uint64_t)); EXPECT_EQ(rn, n); EXPECT_EQ(rb.getHeader().template readMarker(), 17); EXPECT_EQ(rb.getHeader().template readMarker(), 9); } { // Commit succeds. ssize_t ret = c1.commitTx(); EXPECT_EQ(ret, 0); EXPECT_FALSE(c1.inTx()); EXPECT_FALSE(c2.inTx()); } } TEST(RingBuffer, accessContiguousInTx) { // 256 bytes buffer. size_t size = 1u << 8u; RingBufferStorage storage(size); RingBuffer rb = storage.getRb(); // Make a producer. Producer p{rb}; // Make a consumer. Consumer c{rb}; EXPECT_EQ(usedSize(rb), 0); // Use different values for the three writing passes to tell them apart. uint16_t value1 = 0xACAC; // fits 128 times uint16_t value2 = 0xDCDC; // fits 128 times uint16_t value3 = 0xEFEF; // fits 128 times { for (int i = 0; i < 128; ++i) { ssize_t ret = p.write(&value1, sizeof(value1)); EXPECT_EQ(ret, sizeof(value1)); } // It must be full by now. EXPECT_EQ(usedSize(rb), 256); uint8_t b = 0xEE; ssize_t ret = p.write(&b, sizeof(b)); EXPECT_EQ(ret, -ENODATA); } { // Read a 128-byte buffer that is left-aligned with the start. ssize_t ret; ret = c.startTx(); EXPECT_EQ(ret, 0); std::array buffers; std::tie(ret, buffers) = c.accessContiguousInTx(128); EXPECT_EQ(ret, 1); EXPECT_EQ(buffers[0].len, 128); for (int i = 0; i < 128; ++i) { EXPECT_EQ(buffers[0].ptr[i], 0xAC); } ret = c.commitTx(); EXPECT_EQ(ret, 0); EXPECT_EQ(usedSize(rb), 128); } { for (int i = 0; i < 64; ++i) { ssize_t ret = p.write(&value2, sizeof(value2)); EXPECT_EQ(ret, sizeof(value2)); } // It must be full again by now. EXPECT_EQ(usedSize(rb), 256); } { // Read a 256-byte buffer that wraps around halfway through. 
ssize_t ret; ret = c.startTx(); EXPECT_EQ(ret, 0); std::array buffers; std::tie(ret, buffers) = c.accessContiguousInTx(256); EXPECT_EQ(ret, 2); EXPECT_EQ(buffers[0].len, 128); for (int i = 0; i < 128; ++i) { EXPECT_EQ(buffers[0].ptr[i], 0xAC); } EXPECT_EQ(buffers[1].len, 128); for (int i = 0; i < 128; ++i) { EXPECT_EQ(buffers[1].ptr[i], 0xDC); } ret = c.commitTx(); EXPECT_EQ(ret, 0); EXPECT_EQ(usedSize(rb), 0); } { for (int i = 0; i < 64; ++i) { ssize_t ret = p.write(&value2, sizeof(value2)); EXPECT_EQ(ret, sizeof(value2)); } for (int i = 0; i < 64; ++i) { ssize_t ret = p.write(&value3, sizeof(value3)); EXPECT_EQ(ret, sizeof(value3)); } // It must be full again by now. EXPECT_EQ(usedSize(rb), 256); } { // Read a 128-byte buffer that is right-aligned with the end. ssize_t ret; ret = c.startTx(); EXPECT_EQ(ret, 0); std::array buffers; std::tie(ret, buffers) = c.accessContiguousInTx(128); EXPECT_EQ(ret, 1); EXPECT_EQ(buffers[0].len, 128); for (int i = 0; i < 128; ++i) { EXPECT_EQ(buffers[0].ptr[i], 0xDC); } ret = c.commitTx(); EXPECT_EQ(ret, 0); EXPECT_EQ(usedSize(rb), 128); } { for (int i = 0; i < 64; ++i) { ssize_t ret = p.write(&value3, sizeof(value3)); EXPECT_EQ(ret, sizeof(value3)); } // It must be full again by now. EXPECT_EQ(usedSize(rb), 256); } { // Reading the whole 256 bytes. ssize_t ret; ret = c.startTx(); EXPECT_EQ(ret, 0); std::array buffers; std::tie(ret, buffers) = c.accessContiguousInTx(256); EXPECT_EQ(ret, 1); EXPECT_EQ(buffers[0].len, 256); for (int i = 0; i < 256; ++i) { EXPECT_EQ(buffers[0].ptr[i], 0xEF); } ret = c.commitTx(); EXPECT_EQ(ret, 0); EXPECT_EQ(usedSize(rb), 0); } { // Attempt reading from empty buffer. 
ssize_t ret; ret = c.startTx(); EXPECT_EQ(ret, 0); std::array buffers; std::tie(ret, buffers) = c.accessContiguousInTx(200); EXPECT_EQ(ret, 0); ret = c.commitTx(); EXPECT_EQ(ret, 0); EXPECT_EQ(usedSize(rb), 0); } } ================================================ FILE: tensorpipe/test/common/shm_ringbuffer_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include #include #include #include #include #include using namespace tensorpipe; constexpr static int kNumRingbufferRoles = 2; using Consumer = RingBufferRole; using Producer = RingBufferRole; // Same process produces and consumes share memory through different mappings. TEST(ShmRingBuffer, SameProducerConsumer) { Fd headerFd; Fd dataFd; { // Producer part. // Buffer large enough to fit all data and persistent // (needs to be unlinked up manually). Error error; ShmSegment headerSegment; ShmSegment dataSegment; RingBuffer rb; std::tie(error, headerSegment, dataSegment, rb) = createShmRingBuffer(256 * 1024); Producer prod{rb}; // Producer loop. It all fits in buffer. int i = 0; while (i < 2000) { ssize_t ret = prod.write(&i, sizeof(i)); EXPECT_EQ(ret, sizeof(i)); ++i; } // Duplicate the file descriptors so that the shared memory remains alive // when the original fds are closed by the segments' destructors. headerFd = Fd(::dup(headerSegment.getFd())); dataFd = Fd(::dup(dataSegment.getFd())); } { // Consumer part. // Map file again (to a different address) and consume it. 
Error error; ShmSegment headerSegment; ShmSegment dataSegment; RingBuffer rb; std::tie(error, headerSegment, dataSegment, rb) = loadShmRingBuffer( std::move(headerFd), std::move(dataFd)); Consumer cons{rb}; int i = 0; while (i < 2000) { int value; ssize_t ret = cons.read(&value, sizeof(value)); EXPECT_EQ(ret, sizeof(value)); EXPECT_EQ(value, i); ++i; } } }; TEST(ShmRingBuffer, SingleProducer_SingleConsumer) { int sockFds[2]; { int rv = socketpair(AF_UNIX, SOCK_STREAM, 0, sockFds); if (rv != 0) { TP_THROW_SYSTEM(errno) << "Failed to create socket pair"; } } int eventFd = eventfd(0, 0); if (eventFd < 0) { TP_THROW_SYSTEM(errno) << "Failed to create event fd"; } int pid = fork(); if (pid < 0) { TP_THROW_SYSTEM(errno) << "Failed to fork"; } if (pid == 0) { // child, the producer // Make a scope so segments are destroyed even on exit(0). { Error error; ShmSegment headerSegment; ShmSegment dataSegment; RingBuffer rb; std::tie(error, headerSegment, dataSegment, rb) = createShmRingBuffer(1024); Producer prod{rb}; { auto err = sendFdsToSocket( sockFds[0], headerSegment.getFd(), dataSegment.getFd()); if (err) { TP_THROW_ASSERT() << err.what(); } } int i = 0; while (i < 2000) { ssize_t ret = prod.write(&i, sizeof(i)); if (ret == -ENODATA) { std::this_thread::yield(); continue; } EXPECT_EQ(ret, sizeof(i)); ++i; } // Because of buffer size smaller than amount of data written, // producer cannot have completed the loop before consumer // started consuming the data. { uint64_t c; ::read(eventFd, &c, sizeof(uint64_t)); } } // Child exits. Careful when calling exit() directly, because // it does not call destructors. We ensured shared_ptrs were // destroyed before by calling exit(0). exit(0); } // parent, the consumer // Wait for other process to create buffer. 
Fd headerFd; Fd dataFd; { auto err = recvFdsFromSocket(sockFds[1], headerFd, dataFd); if (err) { TP_THROW_ASSERT() << err.what(); } } Error error; ShmSegment headerSegment; ShmSegment dataSegment; RingBuffer rb; std::tie(error, headerSegment, dataSegment, rb) = loadShmRingBuffer( std::move(headerFd), std::move(dataFd)); Consumer cons{rb}; int i = 0; while (i < 2000) { int value; ssize_t ret = cons.read(&value, sizeof(value)); if (ret == -ENODATA) { std::this_thread::yield(); continue; } EXPECT_EQ(ret, sizeof(value)); EXPECT_EQ(value, i); ++i; } { uint64_t c = 1; ::write(eventFd, &c, sizeof(uint64_t)); } ::close(eventFd); ::close(sockFds[0]); ::close(sockFds[1]); // Wait for child to make gtest happy. ::wait(nullptr); }; ================================================ FILE: tensorpipe/test/common/shm_segment_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include #include using namespace tensorpipe; // Same process produces and consumes share memory through different mappings. TEST(ShmSegment, SameProducerConsumer_Scalar) { // Set affinity of producer to CPU zero so that consumer only has to read from // that one CPU's buffer. cpu_set_t cpuset; CPU_ZERO(&cpuset); CPU_SET(0, &cpuset); sched_setaffinity(0, sizeof(cpu_set_t), &cpuset); // This must stay alive for the file descriptor to remain open. Fd fd; { // Producer part. Error error; ShmSegment segment; int* myIntPtr; std::tie(error, segment, myIntPtr) = ShmSegment::create(); ASSERT_FALSE(error) << error.what(); int& myInt = *myIntPtr; myInt = 1000; // Duplicate the file descriptor so that the shared memory remains alive // when the original fd is closed by the segment's destructor. fd = Fd(::dup(segment.getFd())); } { // Consumer part. 
// Map file again (to a different address) and consume it. Error error; ShmSegment segment; int* myIntPtr; std::tie(error, segment, myIntPtr) = ShmSegment::load(std::move(fd)); ASSERT_FALSE(error) << error.what(); EXPECT_EQ(segment.getSize(), sizeof(int)); EXPECT_EQ(*myIntPtr, 1000); } }; TEST(ShmSegment, SingleProducer_SingleConsumer_Array) { size_t numFloats = 330000; int sockFds[2]; { int rv = socketpair(AF_UNIX, SOCK_STREAM, 0, sockFds); if (rv != 0) { TP_THROW_SYSTEM(errno) << "Failed to create socket pair"; } } int eventFd = eventfd(0, 0); if (eventFd < 0) { TP_THROW_SYSTEM(errno) << "Failed to create event fd"; } int pid = fork(); if (pid < 0) { TP_THROW_SYSTEM(errno) << "Failed to fork"; } if (pid == 0) { // child, the producer // Make a scope so shared_ptr's are released even on exit(0). { // use huge pages in creation and not in loading. This should only affects // TLB overhead. Error error; ShmSegment segment; float* myFloats; std::tie(error, segment, myFloats) = ShmSegment::create(numFloats); ASSERT_FALSE(error) << error.what(); for (int i = 0; i < numFloats; ++i) { myFloats[i] = i; } { auto err = sendFdsToSocket(sockFds[0], segment.getFd()); if (err) { TP_THROW_ASSERT() << err.what(); } } { uint64_t c; ::read(eventFd, &c, sizeof(uint64_t)); } } // Child exits. Careful when calling exit() directly, because // it does not call destructors. We ensured shared_ptrs were // destroyed before by calling exit(0). 
exit(0); } // parent, the consumer Fd segmentFd; { auto err = recvFdsFromSocket(sockFds[1], segmentFd); if (err) { TP_THROW_ASSERT() << err.what(); } } Error error; ShmSegment segment; float* myFloats; std::tie(error, segment, myFloats) = ShmSegment::load(std::move(segmentFd)); ASSERT_FALSE(error) << error.what(); EXPECT_EQ(numFloats * sizeof(float), segment.getSize()); for (int i = 0; i < numFloats; ++i) { EXPECT_EQ(myFloats[i], i); } { uint64_t c = 1; ::write(eventFd, &c, sizeof(uint64_t)); } ::close(eventFd); ::close(sockFds[0]); ::close(sockFds[1]); // Wait for child to make gtest happy. ::wait(nullptr); }; ================================================ FILE: tensorpipe/test/common/system_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include using namespace tensorpipe; TEST(Pow2, isPow2) { for (uint64_t i = 0; i < 63; ++i) { EXPECT_TRUE(isPow2(1ull << i)); } EXPECT_FALSE(isPow2(3)); EXPECT_FALSE(isPow2(5)); EXPECT_FALSE(isPow2(10)); EXPECT_FALSE(isPow2(15)); EXPECT_TRUE(isPow2(16)); EXPECT_FALSE(isPow2(17)); EXPECT_FALSE(isPow2(18)); EXPECT_FALSE(isPow2(25)); EXPECT_FALSE(isPow2(1028)); } TEST(Pow2, nextPow2) { for (uint64_t i = 0; i < 63; ++i) { uint64_t p2 = 1ull << i; uint64_t nextP2 = 1ull << (i + 1); EXPECT_EQ(nextPow2(p2), p2); EXPECT_EQ(nextPow2(p2 + 1), nextP2); } } ================================================ FILE: tensorpipe/test/core/context_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #include #include #include #include #include #if TP_USE_CUDA #include #include #endif // TP_USE_CUDA using namespace tensorpipe; namespace { ::testing::AssertionResult buffersAreEqual( const void* ptr1, const size_t len1, const void* ptr2, const size_t len2) { if (ptr1 == nullptr && ptr2 == nullptr) { if (len1 == 0 && len2 == 0) { return ::testing::AssertionSuccess(); } if (len1 != 0) { return ::testing::AssertionFailure() << "first pointer is null but length isn't 0"; } if (len1 != 0) { return ::testing::AssertionFailure() << "second pointer is null but length isn't 0"; } } if (ptr1 == nullptr) { return ::testing::AssertionFailure() << "first pointer is null but second one isn't"; } if (ptr2 == nullptr) { return ::testing::AssertionFailure() << "second pointer is null but first one isn't"; } if (len1 != len2) { return ::testing::AssertionFailure() << "first length is " << len1 << " but second one is " << len2; } if (std::memcmp(ptr1, ptr2, len1) != 0) { return ::testing::AssertionFailure() << "buffer contents aren't equal"; } return ::testing::AssertionSuccess(); } #if TP_USE_CUDA std::vector unwrapCudaBuffer(CudaBuffer b, size_t length) { std::vector result(length); TP_CUDA_CHECK(cudaStreamSynchronize(b.stream)); TP_CUDA_CHECK(cudaMemcpy(result.data(), b.ptr, length, cudaMemcpyDefault)); return result; } #endif // TP_USE_CUDA ::testing::AssertionResult descriptorAndAllocationMatchMessage( const Descriptor& descriptor, const Allocation& allocation, const Message& message) { EXPECT_EQ(descriptor.payloads.size(), allocation.payloads.size()); if (descriptor.payloads.size() != message.payloads.size()) { return ::testing::AssertionFailure() << "descriptor has " << descriptor.payloads.size() << " payloads but message has " << message.payloads.size(); } for (size_t idx = 0; idx < descriptor.payloads.size(); idx++) { EXPECT_TRUE(buffersAreEqual( allocation.payloads[idx].data, descriptor.payloads[idx].length, message.payloads[idx].data, 
message.payloads[idx].length)); } EXPECT_EQ(descriptor.tensors.size(), allocation.tensors.size()); if (descriptor.tensors.size() != message.tensors.size()) { return ::testing::AssertionFailure() << "descriptor has " << descriptor.tensors.size() << " tensors but message has " << message.tensors.size(); } for (size_t idx = 0; idx < descriptor.tensors.size(); idx++) { EXPECT_EQ( allocation.tensors[idx].buffer.device(), message.tensors[idx].buffer.device()); const std::string& deviceType = allocation.tensors[idx].buffer.device().type; if (deviceType == kCpuDeviceType) { EXPECT_TRUE(buffersAreEqual( allocation.tensors[idx].buffer.unwrap().ptr, descriptor.tensors[idx].length, message.tensors[idx].buffer.unwrap().ptr, message.tensors[idx].length)); #if TP_USE_CUDA } else if (deviceType == kCudaDeviceType) { std::vector buffer1 = unwrapCudaBuffer( allocation.tensors[idx].buffer.unwrap(), descriptor.tensors[idx].length); std::vector buffer2 = unwrapCudaBuffer( message.tensors[idx].buffer.unwrap(), message.tensors[idx].length); EXPECT_TRUE(buffersAreEqual( buffer1.data(), buffer1.size(), buffer2.data(), buffer2.size())); #endif // TP_USE_CUDA } else { ADD_FAILURE() << "Unexpected device type: " << deviceType; } } return ::testing::AssertionSuccess(); } #if TP_USE_CUDA struct CudaPointerDeleter { void operator()(void* ptr) { TP_CUDA_CHECK(cudaFree(ptr)); } }; std::unique_ptr makeCudaPointer(size_t length) { void* cudaPtr; TP_CUDA_CHECK(cudaMalloc(&cudaPtr, length)); return std::unique_ptr(cudaPtr); } #endif // TP_USE_CUDA // Having 4 payloads per message is arbitrary. constexpr int kNumPayloads = 4; // Having 4 tensors per message ensures there are 2 CPU tensors and 2 CUDA // tensors. 
constexpr int kNumTensors = 4;

// Inline data used to fill every payload and every CPU tensor.
std::string kPayloadData = "I'm a payload";
std::string kTensorData = "And I'm a tensor";

#if TP_USE_CUDA
const int kCudaTensorLength = 32;
const uint8_t kCudaTensorFillValue = 0x42;
#endif // TP_USE_CUDA

// Builds one tensor of a test message. With CUDA enabled, odd indices get a
// CUDA tensor backed by a shared device buffer filled with
// kCudaTensorFillValue; even indices — and all tensors in CPU-only builds —
// point at kTensorData.
// NOTE(review): several template argument lists in this block were stripped
// by the text extraction (e.g. on std::unique_ptr, reinterpret_cast,
// std::vector); tokens are preserved exactly as found — restore from
// upstream.
Message::Tensor makeTensor(int index) {
#if TP_USE_CUDA
  // One device buffer, created on first use and shared by all CUDA tensors.
  static std::unique_ptr kCudaTensorData = []() {
    auto cudaPtr = makeCudaPointer(kCudaTensorLength);
    TP_CUDA_CHECK(
        cudaMemset(cudaPtr.get(), kCudaTensorFillValue, kCudaTensorLength));
    return cudaPtr;
  }();
  if (index % 2 == 1) {
    return {
        .buffer =
            CudaBuffer{
                .ptr = kCudaTensorData.get(),
                .stream = cudaStreamDefault,
            }, // FIXME: Use non-blocking stream.
        .length = kCudaTensorLength,
    };
  }
#endif // TP_USE_CUDA
  return {
      .buffer =
          CpuBuffer{
              .ptr = reinterpret_cast(
                  const_cast(kTensorData.data())),
          },
      .length = kTensorData.length(),
  };
}

// Builds a message with the given number of payloads (all kPayloadData) and
// tensors (see makeTensor for the CPU/CUDA split).
Message makeMessage(int numPayloads, int numTensors) {
  Message message;
  for (int i = 0; i < numPayloads; i++) {
    Message::Payload payload;
    payload.data =
        reinterpret_cast(const_cast(kPayloadData.data()));
    payload.length = kPayloadData.length();
    message.payloads.push_back(std::move(payload));
  }
  for (int i = 0; i < numTensors; i++) {
    message.tensors.push_back(makeTensor(i));
  }
  return message;
}

// Allocates destination buffers (host or device, per each tensor's
// sourceDevice) for an incoming descriptor. Ownership of the allocations is
// parked in `buffers` so they outlive the read; the returned Allocation only
// holds raw pointers into them.
Allocation allocateForDescriptor(
    const Descriptor& descriptor,
    std::vector>& buffers) {
  Allocation allocation;
  for (const auto& payload : descriptor.payloads) {
    // FIXME: Changing this to a make_shared causes havoc.
    auto payloadData = std::unique_ptr>(
        new uint8_t[payload.length]);
    allocation.payloads.push_back({.data = payloadData.get()});
    buffers.push_back(std::move(payloadData));
  }
  for (const auto& tensor : descriptor.tensors) {
    if (tensor.sourceDevice.type == kCpuDeviceType) {
      auto tensorData = std::unique_ptr>(
          new uint8_t[tensor.length]);
      allocation.tensors.push_back({
          .buffer = CpuBuffer{.ptr = tensorData.get()},
      });
      buffers.push_back(std::move(tensorData));
#if TP_USE_CUDA
    } else if (tensor.sourceDevice.type == kCudaDeviceType) {
      auto tensorData = makeCudaPointer(tensor.length);
      allocation.tensors.push_back({
          .buffer =
              CudaBuffer{
                  .ptr = tensorData.get(),
                  // FIXME: Use non-blocking streams.
                  .stream = cudaStreamDefault,
              },
      });
      buffers.push_back(std::move(tensorData));
#endif // TP_USE_CUDA
    } else {
      ADD_FAILURE() << "Unrecognized device type: " << tensor.sourceDevice.type;
    }
  }
  return allocation;
}

// Reassembles a Message (metadata and lengths from the descriptor, data
// pointers from the allocation) so a received message can be written back
// verbatim.
Message messageFromAllocation(
    const Descriptor& descriptor,
    const Allocation& allocation) {
  Message message;
  message.metadata = descriptor.metadata;
  for (int payloadIdx = 0; payloadIdx < descriptor.payloads.size();
       ++payloadIdx) {
    message.payloads.emplace_back();
    Message::Payload& payload = message.payloads.back();
    payload.metadata = descriptor.payloads[payloadIdx].metadata;
    payload.length = descriptor.payloads[payloadIdx].length;
    payload.data = allocation.payloads[payloadIdx].data;
  }
  for (int tensorIdx = 0; tensorIdx < descriptor.tensors.size(); ++tensorIdx) {
    message.tensors.emplace_back();
    Message::Tensor& tensor = message.tensors.back();
    tensor.metadata = descriptor.tensors[tensorIdx].metadata;
    tensor.length = descriptor.tensors[tensorIdx].length;
    tensor.buffer = allocation.tensors[tensorIdx].buffer;
  }
  return message;
}

// Listen URLs to exercise: shm (when built in) plus loopback TCP via libuv.
std::vector genUrls() {
  std::vector res;
#if TENSORPIPE_HAS_SHM_TRANSPORT
  res.push_back("shm://");
#endif // TENSORPIPE_HAS_SHM_TRANSPORT
  res.push_back("uv://127.0.0.1");
  return res;
}

// Creates a Context with every transport/channel available in this build
// registered (the numeric argument is the registration priority).
std::shared_ptr makeContext() {
  auto context = std::make_shared();
context->registerTransport(0, "uv", transport::uv::create()); #if TENSORPIPE_HAS_SHM_TRANSPORT context->registerTransport(1, "shm", transport::shm::create()); #endif // TENSORPIPE_HAS_SHM_TRANSPORT context->registerChannel(0, "basic", channel::basic::create()); #if TENSORPIPE_HAS_CMA_CHANNEL context->registerChannel(1, "cma", channel::cma::create()); #endif // TENSORPIPE_HAS_CMA_CHANNEL #if TP_USE_CUDA context->registerChannel( 10, "cuda_basic", channel::cuda_basic::create(channel::basic::create())); #if TENSORPIPE_HAS_CUDA_IPC_CHANNEL context->registerChannel(11, "cuda_ipc", channel::cuda_ipc::create()); #endif // TENSORPIPE_HAS_CUDA_IPC_CHANNEL context->registerChannel(12, "cuda_xth", channel::cuda_xth::create()); #endif // TP_USE_CUDA return context; } } // namespace TEST(Context, ClientPingSerial) { ForkedThreadPeerGroup pg; pg.spawn( [&]() { std::vector> buffers; std::promise> serverPipePromise; std::promise readDescriptorPromise; std::promise readMessagePromise; auto context = makeContext(); auto listener = context->listen(genUrls()); pg.send(PeerGroup::kClient, listener->url("uv")); listener->accept([&](const Error& error, std::shared_ptr pipe) { if (error) { serverPipePromise.set_exception( std::make_exception_ptr(std::runtime_error(error.what()))); } else { serverPipePromise.set_value(std::move(pipe)); } }); std::shared_ptr serverPipe = serverPipePromise.get_future().get(); serverPipe->readDescriptor( [&readDescriptorPromise]( const Error& error, Descriptor descriptor) { if (error) { readDescriptorPromise.set_exception( std::make_exception_ptr(std::runtime_error(error.what()))); } else { readDescriptorPromise.set_value(std::move(descriptor)); } }); Descriptor descriptor = readDescriptorPromise.get_future().get(); Allocation allocation = allocateForDescriptor(descriptor, buffers); serverPipe->read(allocation, [&readMessagePromise](const Error& error) { if (error) { readMessagePromise.set_exception( std::make_exception_ptr(std::runtime_error(error.what()))); 
} else { readMessagePromise.set_value(); } }); readMessagePromise.get_future().get(); EXPECT_TRUE(descriptorAndAllocationMatchMessage( descriptor, allocation, makeMessage(kNumPayloads, kNumTensors))); pg.done(PeerGroup::kServer); pg.join(PeerGroup::kServer); context->join(); }, [&]() { std::promise writtenMessagePromise; auto context = makeContext(); auto url = pg.recv(PeerGroup::kClient); auto clientPipe = context->connect(url); clientPipe->write( makeMessage(kNumPayloads, kNumTensors), [&writtenMessagePromise](const Error& error) { if (error) { writtenMessagePromise.set_exception( std::make_exception_ptr(std::runtime_error(error.what()))); } else { writtenMessagePromise.set_value(); } }); writtenMessagePromise.get_future().get(); pg.done(PeerGroup::kClient); pg.join(PeerGroup::kClient); context->join(); }); } TEST(Context, ClientPingInline) { ForkedThreadPeerGroup pg; pg.spawn( [&]() { std::vector> buffers; std::promise> serverPipePromise; std::promise readCompletedProm; auto context = makeContext(); auto listener = context->listen(genUrls()); pg.send(PeerGroup::kClient, listener->url("uv")); listener->accept([&](const Error& error, std::shared_ptr pipe) { if (error) { serverPipePromise.set_exception( std::make_exception_ptr(std::runtime_error(error.what()))); } else { serverPipePromise.set_value(std::move(pipe)); } }); std::shared_ptr serverPipe = serverPipePromise.get_future().get(); serverPipe->readDescriptor([&serverPipe, &readCompletedProm, &buffers]( const Error& error, Descriptor descriptor) { if (error) { ADD_FAILURE() << error.what(); readCompletedProm.set_value(); return; } Allocation allocation = allocateForDescriptor(descriptor, buffers); serverPipe->read( allocation, [&readCompletedProm, descriptor{std::move(descriptor)}, allocation](const Error& error) { if (error) { readCompletedProm.set_exception(std::make_exception_ptr( std::runtime_error(error.what()))); } else { EXPECT_TRUE(descriptorAndAllocationMatchMessage( descriptor, allocation, 
makeMessage(kNumPayloads, kNumTensors))); readCompletedProm.set_value(); } }); }); readCompletedProm.get_future().get(); pg.done(PeerGroup::kServer); pg.join(PeerGroup::kServer); context->join(); }, [&]() { std::promise writeCompletedProm; auto context = makeContext(); auto url = pg.recv(PeerGroup::kClient); auto clientPipe = context->connect(url); clientPipe->write( makeMessage(kNumPayloads, kNumTensors), [&writeCompletedProm](const Error& error) { if (error) { writeCompletedProm.set_exception( std::make_exception_ptr(std::runtime_error(error.what()))); } else { writeCompletedProm.set_value(); } }); writeCompletedProm.get_future().get(); pg.done(PeerGroup::kClient); pg.join(PeerGroup::kClient); context->join(); }); } TEST(Context, ServerPingPongTwice) { ForkedThreadPeerGroup pg; pg.spawn( [&]() { std::vector> buffers; std::promise> serverPipePromise; std::promise pingCompletedProm; auto context = makeContext(); auto listener = context->listen(genUrls()); pg.send(PeerGroup::kClient, listener->url("uv")); listener->accept([&](const Error& error, std::shared_ptr pipe) { if (error) { serverPipePromise.set_exception( std::make_exception_ptr(std::runtime_error(error.what()))); } else { serverPipePromise.set_value(std::move(pipe)); } }); std::shared_ptr serverPipe = serverPipePromise.get_future().get(); int numPingsGoneThrough = 0; for (int i = 0; i < 2; i++) { serverPipe->write( makeMessage(kNumPayloads, kNumTensors), [&serverPipe, &pingCompletedProm, &buffers, &numPingsGoneThrough, i](const Error& error) { if (error) { ADD_FAILURE() << error.what(); pingCompletedProm.set_value(); return; } serverPipe->readDescriptor( [&serverPipe, &pingCompletedProm, &buffers, &numPingsGoneThrough, i](const Error& error, Descriptor descriptor) { if (error) { ADD_FAILURE() << error.what(); pingCompletedProm.set_value(); return; } Allocation allocation = allocateForDescriptor(descriptor, buffers); serverPipe->read( allocation, [&pingCompletedProm, &numPingsGoneThrough, 
descriptor{std::move(descriptor)}, allocation, i](const Error& error) { if (error) { ADD_FAILURE() << error.what(); pingCompletedProm.set_value(); return; } EXPECT_TRUE(descriptorAndAllocationMatchMessage( descriptor, allocation, makeMessage(kNumPayloads, kNumTensors))); EXPECT_EQ(numPingsGoneThrough, i); numPingsGoneThrough++; if (numPingsGoneThrough == 2) { pingCompletedProm.set_value(); } }); }); }); } pingCompletedProm.get_future().get(); pg.done(PeerGroup::kServer); pg.join(PeerGroup::kServer); context->join(); }, [&]() { std::vector> buffers; std::promise pongCompletedProm; auto context = makeContext(); auto url = pg.recv(PeerGroup::kClient); auto clientPipe = context->connect(url); int numPongsGoneThrough = 0; for (int i = 0; i < 2; i++) { clientPipe->readDescriptor([&clientPipe, &pongCompletedProm, &buffers, &numPongsGoneThrough, i](const Error& error, Descriptor descriptor) { if (error) { ADD_FAILURE() << error.what(); pongCompletedProm.set_value(); return; } Allocation allocation = allocateForDescriptor(descriptor, buffers); clientPipe->read( allocation, [&clientPipe, &pongCompletedProm, &numPongsGoneThrough, descriptor{std::move(descriptor)}, allocation, i](const Error& error) { if (error) { ADD_FAILURE() << error.what(); pongCompletedProm.set_value(); return; } // Copy received message to send it back. 
Message message = messageFromAllocation(descriptor, allocation); clientPipe->write( std::move(message), [&pongCompletedProm, &numPongsGoneThrough, i]( const Error& error) { if (error) { ADD_FAILURE() << error.what(); pongCompletedProm.set_value(); return; } EXPECT_EQ(numPongsGoneThrough, i); numPongsGoneThrough++; if (numPongsGoneThrough == 2) { pongCompletedProm.set_value(); } }); }); }); } pongCompletedProm.get_future().get(); pg.done(PeerGroup::kClient); pg.join(PeerGroup::kClient); context->join(); }); } static void pipeRead( std::shared_ptr& pipe, std::vector>& buffers, std::function fn) { pipe->readDescriptor([&pipe, &buffers, fn{std::move(fn)}]( const Error& error, Descriptor descriptor) mutable { ASSERT_FALSE(error); Allocation allocation = allocateForDescriptor(descriptor, buffers); pipe->read( allocation, [fn{std::move(fn)}, descriptor{std::move(descriptor)}, allocation]( const Error& error) mutable { fn(error, std::move(descriptor), std::move(allocation)); }); }); } TEST(Context, MixedTensorMessage) { constexpr int kNumMessages = 2; ForkedThreadPeerGroup pg; pg.spawn( [&]() { std::vector> buffers; std::promise> serverPipePromise; std::promise readCompletedProm; auto context = makeContext(); auto listener = context->listen(genUrls()); pg.send(PeerGroup::kClient, listener->url("uv")); listener->accept([&](const Error& error, std::shared_ptr pipe) { if (error) { serverPipePromise.set_exception( std::make_exception_ptr(std::runtime_error(error.what()))); } else { serverPipePromise.set_value(std::move(pipe)); } }); std::shared_ptr serverPipe = serverPipePromise.get_future().get(); std::atomic readNum(kNumMessages); pipeRead( serverPipe, buffers, [&readNum, &readCompletedProm]( const Error& error, Descriptor descriptor, Allocation allocation) { ASSERT_FALSE(error); EXPECT_TRUE(descriptorAndAllocationMatchMessage( descriptor, allocation, makeMessage(kNumPayloads, kNumTensors))); if (--readNum == 0) { readCompletedProm.set_value(); } }); pipeRead( serverPipe, 
buffers, [&readNum, &readCompletedProm]( const Error& error, Descriptor descriptor, Allocation allocation) { ASSERT_FALSE(error); EXPECT_TRUE(descriptorAndAllocationMatchMessage( descriptor, allocation, makeMessage(0, 0))); if (--readNum == 0) { readCompletedProm.set_value(); } }); readCompletedProm.get_future().get(); pg.done(PeerGroup::kServer); pg.join(PeerGroup::kServer); context->join(); }, [&]() { std::promise writeCompletedProm; auto context = makeContext(); auto url = pg.recv(PeerGroup::kClient); auto clientPipe = context->connect(url); std::atomic writeNum(kNumMessages); clientPipe->write( makeMessage(kNumPayloads, kNumTensors), [&writeNum, &writeCompletedProm](const Error& error) { ASSERT_FALSE(error) << error.what(); if (--writeNum == 0) { writeCompletedProm.set_value(); } }); clientPipe->write( makeMessage(0, 0), [&writeNum, &writeCompletedProm](const Error& error) { ASSERT_FALSE(error) << error.what(); if (--writeNum == 0) { writeCompletedProm.set_value(); } }); writeCompletedProm.get_future().get(); pg.done(PeerGroup::kClient); pg.join(PeerGroup::kClient); context->join(); }); } ================================================ FILE: tensorpipe/test/core/listener_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #include #include #include #include using namespace tensorpipe; TEST(Listener, ClosingAbortsOperations) { auto context = std::make_shared(); context->registerTransport(0, "uv", transport::uv::create()); context->registerChannel(0, "basic", channel::basic::create()); { auto listener = context->listen({"uv://127.0.0.1"}); std::promise donePromise; listener->accept( [&](const Error& error, std::shared_ptr /* unused */) { EXPECT_TRUE(error); donePromise.set_value(); }); listener->close(); donePromise.get_future().get(); } context->join(); } ================================================ FILE: tensorpipe/test/core/pipe_cuda_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include using namespace tensorpipe; class CudaSimpleWriteReadWithAllTargetDevicesTest : public ClientServerPipeTestCase { InlineMessage imessage_ = { .payloads = { {.data = "payload #1", .metadata = "payload metadata #1"}, {.data = "payload #2", .metadata = "payload metadata #2"}, {.data = "payload #3", .metadata = "payload metadata #3"}, }, .tensors = { { .data = "tensor #1", .metadata = "tensor metadata #1", .device = Device{kCudaDeviceType, 0}, .targetDevice = Device{kCudaDeviceType, 0}, }, { .data = "tensor #2", .metadata = "tensor metadata #2", .device = Device{kCpuDeviceType, 0}, .targetDevice = Device{kCudaDeviceType, 0}, }, { .data = "tensor #3", .metadata = "tensor metadata #3", .device = Device{kCudaDeviceType, 0}, .targetDevice = Device{kCpuDeviceType, 0}, }, { .data = "tensor #4", .metadata = "tensor metadata #4", .device = Device{kCpuDeviceType, 0}, .targetDevice = Device{kCpuDeviceType, 0}, }, }, .metadata = "pipe metadata", }; public: void server(Pipe& pipe) override { Message message; Storage storage; std::tie(message, storage) = 
makeMessage(imessage_); auto future = pipeWriteWithFuture(pipe, message); future.get(); } void client(Pipe& pipe) override { Descriptor descriptor; Storage storage; auto future = pipeReadWithFuture( pipe, /*targetDevices=*/ { Device{kCudaDeviceType, 0}, Device{kCudaDeviceType, 0}, Device{kCpuDeviceType, 0}, Device{kCpuDeviceType, 0}, }); std::tie(descriptor, storage) = future.get(); expectDescriptorAndStorageMatchMessage(descriptor, storage, imessage_); } }; TEST(Pipe, CudaSimpleWriteReadWithAllTargetDevices) { CudaSimpleWriteReadWithAllTargetDevicesTest test; test.run(); } class CudaSimpleWriteReadWithSomeTargetDevicesTest : public ClientServerPipeTestCase { InlineMessage imessage_ = { .payloads = { {.data = "payload #1", .metadata = "payload metadata #1"}, {.data = "payload #2", .metadata = "payload metadata #2"}, {.data = "payload #3", .metadata = "payload metadata #3"}, }, .tensors = { { .data = "tensor #1", .metadata = "tensor metadata #1", .device = Device{kCudaDeviceType, 0}, .targetDevice = Device{kCudaDeviceType, 0}, }, { .data = "tensor #2", .metadata = "tensor metadata #2", .device = Device{kCudaDeviceType, 0}, }, { .data = "tensor #3", .metadata = "tensor metadata #3", .device = Device{kCpuDeviceType, 0}, }, }, .metadata = "pipe metadata", }; public: void server(Pipe& pipe) override { Message message; Storage storage; std::tie(message, storage) = makeMessage(imessage_); auto future = pipeWriteWithFuture(pipe, message); future.get(); } void client(Pipe& pipe) override { Descriptor descriptor; Storage storage; auto future = pipeReadWithFuture( pipe, /*targetDevices=*/ { Device{kCudaDeviceType, 0}, Device{kCpuDeviceType, 0}, Device{kCpuDeviceType, 0}, }); std::tie(descriptor, storage) = future.get(); expectDescriptorAndStorageMatchMessage(descriptor, storage, imessage_); } }; TEST(Pipe, CudaSimpleWriteReadWithSomeTargetDevices) { CudaSimpleWriteReadWithSomeTargetDevicesTest test; test.run(); } class CudaSimpleWriteReadWithoutTargetDeviceTest : public 
ClientServerPipeTestCase { InlineMessage imessage_ = { .payloads = { {.data = "payload #1", .metadata = "payload metadata #1"}, {.data = "payload #2", .metadata = "payload metadata #2"}, {.data = "payload #3", .metadata = "payload metadata #3"}, }, .tensors = { { .data = "tensor #1", .metadata = "tensor metadata #1", .device = Device{kCpuDeviceType, 0}, }, { .data = "tensor #2", .metadata = "tensor metadata #2", .device = Device{kCpuDeviceType, 0}, }, { .data = "tensor #3", .metadata = "tensor metadata #3", .device = Device{kCudaDeviceType, 0}, }, { .data = "tensor #4", .metadata = "tensor metadata #4", .device = Device{kCudaDeviceType, 0}, }, }, .metadata = "pipe metadata", }; public: void server(Pipe& pipe) override { Message message; Storage storage; std::tie(message, storage) = makeMessage(imessage_); auto future = pipeWriteWithFuture(pipe, message); future.get(); } void client(Pipe& pipe) override { Descriptor descriptor; Storage storage; auto future = pipeReadWithFuture( pipe, /*targetDevices=*/ { Device{kCpuDeviceType, 0}, Device{kCudaDeviceType, 0}, Device{kCpuDeviceType, 0}, Device{kCudaDeviceType, 0}, }); std::tie(descriptor, storage) = future.get(); expectDescriptorAndStorageMatchMessage(descriptor, storage, imessage_); } }; TEST(Pipe, CudaSimpleWriteReadWithoutTargetDevice) { CudaSimpleWriteReadWithoutTargetDeviceTest test; test.run(); } ================================================ FILE: tensorpipe/test/core/pipe_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include using namespace tensorpipe; class SimpleWriteReadTest : public ClientServerPipeTestCase { InlineMessage imessage_ = { .payloads = { {.data = "payload #1", .metadata = "payload metadata #1"}, {.data = "payload #2", .metadata = "payload metadata #2"}, {.data = "payload #3", .metadata = "payload metadata #3"}, }, .tensors = { { .data = "tensor #1", .metadata = "tensor metadata #1", .device = Device{kCpuDeviceType, 0}, }, { .data = "tensor #2", .metadata = "tensor metadata #2", .device = Device{kCpuDeviceType, 0}, }, { .data = "tensor #3", .metadata = "tensor metadata #3", .device = Device{kCpuDeviceType, 0}, }, }, .metadata = "pipe metadata", }; public: void server(Pipe& pipe) override { Message message; Storage storage; std::tie(message, storage) = makeMessage(imessage_); auto future = pipeWriteWithFuture(pipe, message); future.get(); } void client(Pipe& pipe) override { Descriptor descriptor; Storage storage; auto future = pipeReadWithFuture( pipe, /*targetDevices=*/ { Device{kCpuDeviceType, 0}, Device{kCpuDeviceType, 0}, Device{kCpuDeviceType, 0}, }); std::tie(descriptor, storage) = future.get(); expectDescriptorAndStorageMatchMessage(descriptor, storage, imessage_); } }; TEST(Pipe, SimpleWriteRead) { SimpleWriteReadTest test; test.run(); } class SimpleWriteReadPayloadsOnlyTest : public ClientServerPipeTestCase { InlineMessage imessage_ = { .payloads = { {.data = "payload #1", .metadata = "payload metadata #1"}, {.data = "payload #2", .metadata = "payload metadata #2"}, {.data = "payload #3", .metadata = "payload metadata #3"}, }, .tensors = {}, .metadata = "pipe metadata", }; public: void server(Pipe& pipe) override { Message message; Storage storage; std::tie(message, storage) = makeMessage(imessage_); auto future = pipeWriteWithFuture(pipe, message); future.get(); } void client(Pipe& pipe) override { Descriptor descriptor; Storage storage; auto future = pipeReadWithFuture(pipe, /*targetDevices=*/{}); std::tie(descriptor, storage) = future.get(); 
expectDescriptorAndStorageMatchMessage(descriptor, storage, imessage_); } }; TEST(Pipe, SimpleWriteReadPayloadsOnly) { SimpleWriteReadPayloadsOnlyTest test; test.run(); } class SimpleWriteReadTensorsOnlyTest : public ClientServerPipeTestCase { InlineMessage imessage_ = { .payloads = {}, .tensors = { { .data = "tensor #1", .metadata = "tensor metadata #1", .device = Device{kCpuDeviceType, 0}, }, { .data = "tensor #2", .metadata = "tensor metadata #2", .device = Device{kCpuDeviceType, 0}, }, { .data = "tensor #3", .metadata = "tensor metadata #3", .device = Device{kCpuDeviceType, 0}, }, }, .metadata = "pipe metadata", }; public: void server(Pipe& pipe) override { Message message; Storage storage; std::tie(message, storage) = makeMessage(imessage_); auto future = pipeWriteWithFuture(pipe, message); future.get(); } void client(Pipe& pipe) override { Descriptor descriptor; Storage storage; auto future = pipeReadWithFuture( pipe, /*targetDevices=*/ { Device{kCpuDeviceType, 0}, Device{kCpuDeviceType, 0}, Device{kCpuDeviceType, 0}, }); std::tie(descriptor, storage) = future.get(); expectDescriptorAndStorageMatchMessage(descriptor, storage, imessage_); } }; TEST(Pipe, SimpleWriteReadTensorsOnly) { SimpleWriteReadTensorsOnlyTest test; test.run(); } class SimpleWriteReadWithAllTargetDevicesTest : public ClientServerPipeTestCase { InlineMessage imessage_ = { .payloads = { {.data = "payload #1", .metadata = "payload metadata #1"}, {.data = "payload #2", .metadata = "payload metadata #2"}, {.data = "payload #3", .metadata = "payload metadata #3"}, }, .tensors = { { .data = "tensor #1", .metadata = "tensor metadata #1", .device = Device{kCpuDeviceType, 0}, .targetDevice = Device{kCpuDeviceType, 0}, }, { .data = "tensor #2", .metadata = "tensor metadata #2", .device = Device{kCpuDeviceType, 0}, .targetDevice = Device{kCpuDeviceType, 0}, }, { .data = "tensor #3", .metadata = "tensor metadata #3", .device = Device{kCpuDeviceType, 0}, .targetDevice = Device{kCpuDeviceType, 0}, }, }, 
.metadata = "pipe metadata", }; public: void server(Pipe& pipe) override { Message message; Storage storage; std::tie(message, storage) = makeMessage(imessage_); auto future = pipeWriteWithFuture(pipe, message); future.get(); } void client(Pipe& pipe) override { Descriptor descriptor; Storage storage; auto future = pipeReadWithFuture( pipe, /*targetDevices=*/ { Device{kCpuDeviceType, 0}, Device{kCpuDeviceType, 0}, Device{kCpuDeviceType, 0}, }); std::tie(descriptor, storage) = future.get(); expectDescriptorAndStorageMatchMessage(descriptor, storage, imessage_); } }; TEST(Pipe, SimpleWriteReadWithAllTargetDevices) { SimpleWriteReadWithAllTargetDevicesTest test; test.run(); } class SimpleWriteReadWithSomeTargetDevicesTest : public ClientServerPipeTestCase { InlineMessage imessage_ = { .payloads = { {.data = "payload #1", .metadata = "payload metadata #1"}, {.data = "payload #2", .metadata = "payload metadata #2"}, {.data = "payload #3", .metadata = "payload metadata #3"}, }, .tensors = { { .data = "tensor #1", .metadata = "tensor metadata #1", .device = Device{kCpuDeviceType, 0}, .targetDevice = Device{kCpuDeviceType, 0}, }, { .data = "tensor #2", .metadata = "tensor metadata #2", .device = Device{kCpuDeviceType, 0}, }, { .data = "tensor #3", .metadata = "tensor metadata #3", .device = Device{kCpuDeviceType, 0}, .targetDevice = Device{kCpuDeviceType, 0}, }, }, .metadata = "pipe metadata", }; public: void server(Pipe& pipe) override { Message message; Storage storage; std::tie(message, storage) = makeMessage(imessage_); auto future = pipeWriteWithFuture(pipe, message); future.get(); } void client(Pipe& pipe) override { Descriptor descriptor; Storage storage; auto future = pipeReadWithFuture( pipe, /*targetDevices=*/ { Device{kCpuDeviceType, 0}, Device{kCpuDeviceType, 0}, Device{kCpuDeviceType, 0}, }); std::tie(descriptor, storage) = future.get(); expectDescriptorAndStorageMatchMessage(descriptor, storage, imessage_); } }; TEST(Pipe, 
SimpleWriteReadWithSomeTargetDevices) { SimpleWriteReadWithSomeTargetDevicesTest test; test.run(); } class MultipleWriteReadTest : public ClientServerPipeTestCase { InlineMessage imessage1_ = { .payloads = { {.data = "payload #1.1", .metadata = "payload metadata #1.1"}, }, .tensors = { { .data = "tensor #1.1", .metadata = "tensor metadata #1.1", .device = Device{kCpuDeviceType, 0}, }, }, .metadata = "message metadata", }; InlineMessage imessage2_ = { .payloads = { {.data = "payload #2.1", .metadata = "payload metadata #2.1"}, }, .tensors = { { .data = "tensor #2.1", .metadata = "tensor metadata #2.1", .device = Device{kCpuDeviceType, 0}, }, }, .metadata = "message metadata", }; public: void server(Pipe& pipe) override { Message message1; Storage storage1; std::tie(message1, storage1) = makeMessage(imessage1_); auto future1 = pipeWriteWithFuture(pipe, message1); Message message2; Storage storage2; std::tie(message2, storage2) = makeMessage(imessage2_); auto future2 = pipeWriteWithFuture(pipe, message2); future1.get(); future2.get(); } void client(Pipe& pipe) override { auto future1 = pipeReadWithFuture( pipe, /*targetDevices=*/ { Device{kCpuDeviceType, 0}, }); auto future2 = pipeReadWithFuture( pipe, /*targetDevices=*/ { Device{kCpuDeviceType, 0}, }); Descriptor descriptor1; Storage storage1; std::tie(descriptor1, storage1) = future1.get(); expectDescriptorAndStorageMatchMessage(descriptor1, storage1, imessage1_); Descriptor descriptor2; Storage storage2; std::tie(descriptor2, storage2) = future2.get(); expectDescriptorAndStorageMatchMessage(descriptor2, storage2, imessage2_); } }; TEST(Pipe, MultipleWriteRead) { MultipleWriteReadTest test; test.run(); } class MultipleWriteReadWithSomeTargetDevicesTest : public ClientServerPipeTestCase { InlineMessage imessage1_ = { .payloads = { {.data = "payload #1.1", .metadata = "payload metadata #1.1"}, }, .tensors = { { .data = "tensor #1.1", .metadata = "tensor metadata #1.1", .device = Device{kCpuDeviceType, 0}, }, }, 
.metadata = "message metadata", }; InlineMessage imessage2_ = { .payloads = { {.data = "payload #2.1", .metadata = "payload metadata #2.1"}, }, .tensors = { { .data = "tensor #2.1", .metadata = "tensor metadata #2.1", .device = Device{kCpuDeviceType, 0}, .targetDevice = Device{kCpuDeviceType, 0}, }, }, .metadata = "message metadata", }; public: void server(Pipe& pipe) override { Message message1; Storage storage1; std::tie(message1, storage1) = makeMessage(imessage1_); auto future1 = pipeWriteWithFuture(pipe, message1); Message message2; Storage storage2; std::tie(message2, storage2) = makeMessage(imessage2_); auto future2 = pipeWriteWithFuture(pipe, message2); future1.get(); future2.get(); } void client(Pipe& pipe) override { auto future1 = pipeReadWithFuture( pipe, /*targetDevices=*/ { Device{kCpuDeviceType, 0}, }); auto future2 = pipeReadWithFuture( pipe, /*targetDevices=*/ { Device{kCpuDeviceType, 0}, }); Descriptor descriptor1; Storage storage1; std::tie(descriptor1, storage1) = future1.get(); expectDescriptorAndStorageMatchMessage(descriptor1, storage1, imessage1_); Descriptor descriptor2; Storage storage2; std::tie(descriptor2, storage2) = future2.get(); expectDescriptorAndStorageMatchMessage(descriptor2, storage2, imessage2_); } }; TEST(Pipe, MultipleWriteReadWithSomeTargetDevices) { MultipleWriteReadWithSomeTargetDevicesTest test; test.run(); } class WriteFromBothThenReadTest : public ClientServerPipeTestCase { InlineMessage imessage1_ = { .payloads = { {.data = "payload #1.1", .metadata = "payload metadata #1.1"}, }, .tensors = { { .data = "tensor #1.1", .metadata = "tensor metadata #1.1", .device = Device{kCpuDeviceType, 0}, }, }, .metadata = "message metadata", }; InlineMessage imessage2_ = { .payloads = { {.data = "payload #2.1", .metadata = "payload metadata #2.1"}, }, .tensors = { { .data = "tensor #2.1", .metadata = "tensor metadata #2.1", .device = Device{kCpuDeviceType, 0}, }, }, .metadata = "message metadata", }; public: void server(Pipe& pipe) 
override { Message message; Storage writeStorage; std::tie(message, writeStorage) = makeMessage(imessage1_); auto writeFuture = pipeWriteWithFuture(pipe, message); auto readFuture = pipeReadWithFuture( pipe, /*targetDevices=*/ { Device{kCpuDeviceType, 0}, }); writeFuture.get(); Descriptor descriptor; Storage readStorage; std::tie(descriptor, readStorage) = readFuture.get(); expectDescriptorAndStorageMatchMessage(descriptor, readStorage, imessage2_); } void client(Pipe& pipe) override { Message message; Storage writeStorage; std::tie(message, writeStorage) = makeMessage(imessage2_); auto writeFuture = pipeWriteWithFuture(pipe, message); auto readFuture = pipeReadWithFuture( pipe, /*targetDevices=*/ { Device{kCpuDeviceType, 0}, }); writeFuture.get(); Descriptor descriptor; Storage readStorage; std::tie(descriptor, readStorage) = readFuture.get(); expectDescriptorAndStorageMatchMessage(descriptor, readStorage, imessage1_); } }; TEST(Pipe, WriteFromBothThenRead) { WriteFromBothThenReadTest test; test.run(); } ================================================ FILE: tensorpipe/test/core/pipe_test.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include #include #include #include #include #include #if TP_USE_CUDA #include #include #endif // TP_USE_CUDA struct Storage { std::vector> payloads; std::vector, tensorpipe::Buffer>> tensors; }; struct InlineMessage { struct Payload { std::string data; std::string metadata; }; struct Tensor { std::string data; std::string metadata; tensorpipe::Device device; tensorpipe::optional targetDevice; }; std::vector payloads; std::vector tensors; std::string metadata; }; inline std::pair makeMessage( InlineMessage imessage) { tensorpipe::Message message; Storage storage; for (auto& payload : imessage.payloads) { size_t length = payload.data.length(); auto data = std::unique_ptr>( new uint8_t[length]); std::memcpy(data.get(), &payload.data[0], length); message.payloads.push_back({ .data = data.get(), .length = length, .metadata = payload.metadata, }); storage.payloads.push_back(std::move(data)); } for (auto& tensor : imessage.tensors) { size_t length = tensor.data.length(); tensorpipe::Buffer buffer; std::shared_ptr data; if (tensor.device.type == tensorpipe::kCpuDeviceType) { data = std::unique_ptr>( new uint8_t[length]); std::memcpy(data.get(), &tensor.data[0], length); buffer = tensorpipe::CpuBuffer{.ptr = data.get()}; #if TP_USE_CUDA } else if (tensor.device.type == tensorpipe::kCudaDeviceType) { void* cudaPtr; TP_CUDA_CHECK(cudaSetDevice(tensor.device.index)); TP_CUDA_CHECK(cudaMalloc(&cudaPtr, length)); data = std::unique_ptr>( cudaPtr, [](void* ptr) { TP_CUDA_CHECK(cudaFree(ptr)); }); // TODO: Properly dispose of stream when done. 
cudaStream_t stream; TP_CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); buffer = tensorpipe::CudaBuffer{ .ptr = data.get(), .stream = stream, }; TP_CUDA_CHECK(cudaMemcpyAsync( cudaPtr, &tensor.data[0], length, cudaMemcpyDefault, stream)); #endif // TP_USE_CUDA } else { ADD_FAILURE() << "Unexpected source device: " << tensor.device.toString(); } message.tensors.push_back({ .buffer = buffer, .length = length, .targetDevice = tensor.targetDevice, .metadata = tensor.metadata, }); storage.tensors.push_back({std::move(data), std::move(buffer)}); } message.metadata = imessage.metadata; return {std::move(message), std::move(storage)}; } inline std::pair makeAllocation( const tensorpipe::Descriptor& descriptor, const std::vector& devices) { tensorpipe::Allocation allocation; Storage storage; for (const auto& payload : descriptor.payloads) { auto data = std::unique_ptr>( new uint8_t[payload.length]); allocation.payloads.push_back({.data = data.get()}); storage.payloads.push_back(std::move(data)); } TP_DCHECK(devices.size() == descriptor.tensors.size()); for (size_t tensorIdx = 0; tensorIdx < descriptor.tensors.size(); ++tensorIdx) { const auto& tensor = descriptor.tensors[tensorIdx]; tensorpipe::Device targetDevice = devices[tensorIdx]; if (tensor.targetDevice.has_value()) { TP_DCHECK(targetDevice == *tensor.targetDevice); } if (targetDevice.type == tensorpipe::kCpuDeviceType) { auto data = std::unique_ptr>( new uint8_t[tensor.length]); tensorpipe::Buffer buffer = tensorpipe::CpuBuffer{.ptr = data.get()}; allocation.tensors.push_back({.buffer = buffer}); storage.tensors.push_back({std::move(data), std::move(buffer)}); #if TP_USE_CUDA } else if (targetDevice.type == tensorpipe::kCudaDeviceType) { void* cudaPtr; TP_CUDA_CHECK(cudaSetDevice(targetDevice.index)); TP_CUDA_CHECK(cudaMalloc(&cudaPtr, tensor.length)); auto data = std::unique_ptr>( cudaPtr, [](void* ptr) { TP_CUDA_CHECK(cudaFree(ptr)); }); // TODO: Properly dispose of stream when done. 
cudaStream_t stream; TP_CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); tensorpipe::Buffer buffer = tensorpipe::CudaBuffer{ .ptr = data.get(), .stream = stream, }; allocation.tensors.push_back({.buffer = buffer}); storage.tensors.push_back({std::move(data), std::move(buffer)}); #endif // TP_USE_CUDA } else { ADD_FAILURE() << "Unexpected target device: " << targetDevice.toString(); } } return {std::move(allocation), std::move(storage)}; } inline std::future pipeWriteWithFuture( tensorpipe::Pipe& pipe, tensorpipe::Message message) { auto promise = std::make_shared>(); auto future = promise->get_future(); pipe.write( std::move(message), [promise{std::move(promise)}](const tensorpipe::Error& error) { if (error) { promise->set_exception( std::make_exception_ptr(std::runtime_error(error.what()))); return; } promise->set_value(); }); return future; } inline std::future> pipeReadWithFuture( tensorpipe::Pipe& pipe, std::vector targetDevices) { auto promise = std::make_shared< std::promise>>(); auto future = promise->get_future(); pipe.readDescriptor([&pipe, promise{std::move(promise)}, targetDevices{std::move(targetDevices)}]( const tensorpipe::Error& error, tensorpipe::Descriptor descriptor) mutable { if (error) { promise->set_exception( std::make_exception_ptr(std::runtime_error(error.what()))); return; } tensorpipe::Allocation allocation; Storage storage; std::tie(allocation, storage) = makeAllocation(descriptor, targetDevices); pipe.read( std::move(allocation), [promise{std::move(promise)}, descriptor{std::move(descriptor)}, storage{std::move(storage)}](const tensorpipe::Error& error) mutable { if (error) { promise->set_exception( std::make_exception_ptr(std::runtime_error(error.what()))); return; } promise->set_value(std::make_tuple( std::move(descriptor), std::move(storage))); }); }); return future; } inline void expectDescriptorAndStorageMatchMessage( tensorpipe::Descriptor descriptor, Storage storage, InlineMessage imessage) { 
EXPECT_EQ(imessage.metadata, descriptor.metadata); EXPECT_EQ(descriptor.payloads.size(), storage.payloads.size()); EXPECT_EQ(imessage.payloads.size(), storage.payloads.size()); for (size_t idx = 0; idx < imessage.payloads.size(); ++idx) { EXPECT_EQ( imessage.payloads[idx].metadata, descriptor.payloads[idx].metadata); EXPECT_EQ( imessage.payloads[idx].data.length(), descriptor.payloads[idx].length); EXPECT_EQ( imessage.payloads[idx].data, std::string( static_cast(storage.payloads[idx].get()), descriptor.payloads[idx].length)); } EXPECT_EQ(descriptor.tensors.size(), storage.tensors.size()); EXPECT_EQ(imessage.tensors.size(), storage.tensors.size()); for (size_t idx = 0; idx < imessage.tensors.size(); ++idx) { EXPECT_TRUE( imessage.tensors[idx].device == descriptor.tensors[idx].sourceDevice); EXPECT_EQ(imessage.tensors[idx].metadata, descriptor.tensors[idx].metadata); EXPECT_EQ( imessage.tensors[idx].targetDevice, descriptor.tensors[idx].targetDevice); const tensorpipe::Device& device = storage.tensors[idx].second.device(); EXPECT_TRUE( !imessage.tensors[idx].targetDevice || imessage.tensors[idx].targetDevice == device); size_t length = descriptor.tensors[idx].length; EXPECT_EQ(imessage.tensors[idx].data.length(), length); if (device.type == tensorpipe::kCpuDeviceType) { const tensorpipe::CpuBuffer& buffer = storage.tensors[idx].second.unwrap(); EXPECT_EQ( imessage.tensors[idx].data, std::string(static_cast(buffer.ptr), length)); #if TP_USE_CUDA } else if (device.type == tensorpipe::kCudaDeviceType) { const tensorpipe::CudaBuffer& buffer = storage.tensors[idx].second.unwrap(); std::string data(length, 0x0); TP_CUDA_CHECK(cudaStreamSynchronize(buffer.stream)); TP_CUDA_CHECK( cudaMemcpy(&data[0], buffer.ptr, length, cudaMemcpyDefault)); EXPECT_EQ(imessage.tensors[idx].data, data.data()); #endif // TP_USE_CUDA } else { ADD_FAILURE() << "Unexpected target device: " << device.toString(); } } } inline std::vector genUrls() { std::vector res; #if TENSORPIPE_HAS_SHM_TRANSPORT 
res.push_back("shm://"); #endif // TENSORPIPE_HAS_SHM_TRANSPORT res.push_back("uv://127.0.0.1"); return res; } inline std::shared_ptr makeContext() { auto context = std::make_shared(); context->registerTransport(0, "uv", tensorpipe::transport::uv::create()); #if TENSORPIPE_HAS_SHM_TRANSPORT context->registerTransport(1, "shm", tensorpipe::transport::shm::create()); #endif // TENSORPIPE_HAS_SHM_TRANSPORT context->registerChannel(100, "basic", tensorpipe::channel::basic::create()); #if TENSORPIPE_HAS_CMA_CHANNEL context->registerChannel(101, "cma", tensorpipe::channel::cma::create()); #endif // TENSORPIPE_HAS_CMA_CHANNEL #if TP_USE_CUDA context->registerChannel( 10, "cuda_basic", tensorpipe::channel::cuda_basic::create( tensorpipe::channel::basic::create())); #if TENSORPIPE_HAS_CUDA_IPC_CHANNEL context->registerChannel( 11, "cuda_ipc", tensorpipe::channel::cuda_ipc::create()); #endif // TENSORPIPE_HAS_CUDA_IPC_CHANNEL context->registerChannel( 12, "cuda_xth", tensorpipe::channel::cuda_xth::create()); #endif // TP_USE_CUDA return context; } class ClientServerPipeTestCase { ForkedThreadPeerGroup pg_; public: void run() { pg_.spawn( [&]() { auto context = makeContext(); auto listener = context->listen(genUrls()); pg_.send(PeerGroup::kClient, listener->url("uv")); std::promise> promise; listener->accept([&](const tensorpipe::Error& error, std::shared_ptr pipe) { if (error) { promise.set_exception( std::make_exception_ptr(std::runtime_error(error.what()))); } else { promise.set_value(std::move(pipe)); } }); std::shared_ptr pipe = promise.get_future().get(); server(*pipe); pg_.done(PeerGroup::kServer); pg_.join(PeerGroup::kServer); context->join(); }, [&]() { auto context = makeContext(); auto url = pg_.recv(PeerGroup::kClient); auto pipe = context->connect(url); client(*pipe); pg_.done(PeerGroup::kClient); pg_.join(PeerGroup::kClient); context->join(); }); } virtual void client(tensorpipe::Pipe& pipe) = 0; virtual void server(tensorpipe::Pipe& pipe) = 0; virtual 
~ClientServerPipeTestCase() = default; }; ================================================ FILE: tensorpipe/test/peer_group.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include class PeerGroup { public: static constexpr int kNumPeers = 2; static constexpr int kServer = 0; static constexpr int kClient = 1; virtual ~PeerGroup() = default; // Send message to given peer. virtual void send(int receiverId, const std::string&) = 0; // Read next message for given peer. This method is blocking. virtual std::string recv(int receiverId) = 0; // Spawn two peers each running one of the provided functions. virtual void spawn(std::function, std::function) = 0; // Whether the two endpoints are two threads in the same process (as opposed // to two separate processes). virtual bool endpointsInSameProcess() const = 0; // Signal other peers that this peer is done. void done(int selfId) { send(1 - selfId, doneString_); std::unique_lock lock(m_); done_[selfId] = true; condVar_[selfId].notify_one(); } // Wait for all peers (including this one) to be done. void join(int selfId) { EXPECT_EQ(doneString_, recv(selfId)); std::unique_lock lock(m_); condVar_[selfId].wait(lock, [&] { return done_[selfId]; }); } private: // This should be static but then we need to define it out-of-line (or mark it // as inline once we can use C++-17). 
const std::string doneString_ = "done"; std::mutex m_; std::array done_{{false, false}}; std::array condVar_; }; class ThreadPeerGroup : public PeerGroup { public: void send(int receiverId, const std::string& str) override { q_[receiverId].push(str); } std::string recv(int receiverId) override { return q_[receiverId].pop(); } void spawn(std::function f1, std::function f2) override { std::array, kNumPeers> fns = { std::move(f1), std::move(f2)}; std::array ts; for (int peerId = 0; peerId < kNumPeers; ++peerId) { ts[peerId] = std::thread(fns[peerId]); } for (auto& t : ts) { t.join(); } } bool endpointsInSameProcess() const override { return true; } private: std::array, kNumPeers> q_; }; class ForkedThreadPeerGroup : public ThreadPeerGroup { public: void spawn(std::function f1, std::function f2) override { // Some tests modify the global state of the process (such as initializing // the CUDA context), which would cause other tests running as sub-processes // to fail. Here, we run all thread-based tests in a sub-process to avoid // this issue. pid_t pid = fork(); TP_THROW_SYSTEM_IF(pid < 0, errno) << "Failed to fork"; if (pid == 0) { ThreadPeerGroup::spawn(f1, f2); std::exit(((testing::Test::HasFailure()) ? 
1 : 0)); } int status; TP_THROW_SYSTEM_IF(waitpid(pid, &status, 0) < 0, errno) << "Failed to wait for child test process"; EXPECT_TRUE(WIFEXITED(status)); if (WIFSIGNALED(status)) { TP_LOG_WARNING() << "Test process terminated with signal " << WTERMSIG(status); } const int exitStatus = WEXITSTATUS(status); EXPECT_EQ(0, exitStatus); } }; class ProcessPeerGroup : public PeerGroup { public: void send(int receiverId, const std::string& str) override { uint64_t len = str.length(); int ret; ret = write(pipefd_[receiverId][kWriteEnd], &len, sizeof(len)); TP_THROW_SYSTEM_IF(ret < 0, errno) << "Failed to write to pipe"; EXPECT_EQ(sizeof(len), ret); ret = write(pipefd_[receiverId][kWriteEnd], str.data(), len); TP_THROW_SYSTEM_IF(ret < 0, errno) << "Failed to write to pipe"; EXPECT_EQ(len, ret); } std::string recv(int receiverId) override { int ret; uint64_t len; ret = read(pipefd_[receiverId][kReadEnd], &len, sizeof(len)); TP_THROW_SYSTEM_IF(ret < 0, errno) << "Failed to read from pipe"; EXPECT_EQ(sizeof(len), ret); std::string str(len, 0); ret = read(pipefd_[receiverId][kReadEnd], &str[0], len); TP_THROW_SYSTEM_IF(ret < 0, errno) << "Failed to read from pipe"; EXPECT_EQ(len, ret); return str; } void spawn(std::function f1, std::function f2) override { std::array, kNumPeers> fns = { std::move(f1), std::move(f2)}; std::array pids = {-1, -1}; for (int peerId = 0; peerId < kNumPeers; ++peerId) { TP_THROW_SYSTEM_IF(pipe(pipefd_[peerId].data()) < 0, errno) << "Failed to create pipe"; } for (int peerId = 0; peerId < kNumPeers; ++peerId) { pids[peerId] = fork(); TP_THROW_SYSTEM_IF(pids[peerId] < 0, errno) << "Failed to fork"; if (pids[peerId] == 0) { try { // Close writing end of our pipe. TP_THROW_SYSTEM_IF(close(pipefd_[peerId][kWriteEnd]) < 0, errno) << "Failed to close fd"; // Close reading end of other pipe. 
TP_THROW_SYSTEM_IF(close(pipefd_[1 - peerId][kReadEnd]) < 0, errno) << "Failed to close fd"; fns[peerId](); } catch (const std::exception& e) { TP_LOG_ERROR() << "Child #" << peerId << " (PID " << getpid() << ") encountered exception " << e.what(); std::exit(2); } catch (...) { std::exit(3); } std::exit(((testing::Test::HasFailure()) ? 1 : 0)); } } // Close all pipes in parent process. for (int peerId = 0; peerId < kNumPeers; ++peerId) { for (int pipeEnd = 0; pipeEnd < 2; ++pipeEnd) { TP_THROW_SYSTEM_IF(close(pipefd_[peerId][pipeEnd]) < 0, errno) << "Failed to close fd"; } } for (int peerId = 0; peerId < kNumPeers; ++peerId) { int status; TP_THROW_SYSTEM_IF(waitpid(-1, &status, 0) < 0, errno) << "Failed to wait for child process"; EXPECT_TRUE(WIFEXITED(status)); if (WIFSIGNALED(status)) { TP_LOG_WARNING() << "Peer process terminated with signal " << WTERMSIG(status); } const int exitStatus = WEXITSTATUS(status); EXPECT_EQ(0, exitStatus); } } bool endpointsInSameProcess() const override { return false; } private: static constexpr int kReadEnd = 0; static constexpr int kWriteEnd = 1; std::array, kNumPeers> pipefd_; }; ================================================ FILE: tensorpipe/test/python/tensorpipe.py ================================================ #!/usr/bin/env python3 # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
import threading import unittest import pytensorpipe as tp class TestTensorpipe(unittest.TestCase): def test_read_write(self): context = tp.Context() context.register_transport(0, "tcp", tp.create_uv_transport()) create_shm_transport = getattr(tp, "create_shm_transport", None) if create_shm_transport is not None: context.register_transport(-1, "shm", create_shm_transport()) context.register_channel(0, "basic", tp.create_basic_channel()) create_cma_channel = getattr(tp, "create_cma_channel", None) if create_cma_channel is not None: context.register_channel(-1, "cma", create_cma_channel()) # We must keep a reference to it, or it will be destroyed early. server_pipe = None listener: tp.Listener = context.listen(["tcp://127.0.0.1"]) write_completed = threading.Event() def on_connection(pipe: tp.Pipe) -> None: global server_pipe payload = tp.OutgoingPayload(b"Hello ", b"a greeting") tensor = tp.OutgoingTensor(b"World!", b"a place") message = tp.OutgoingMessage(b"metadata", [payload], [tensor]) pipe.write(message, on_write) server_pipe = pipe def on_write() -> None: write_completed.set() listener.listen(on_connection) client_pipe: tp.Pipe = context.connect(listener.get_url("tcp")) received_payloads = None received_tensors = None read_completed = threading.Event() def on_read_descriptor(message: tp.IncomingMessage) -> None: nonlocal received_payloads, received_tensors self.assertEqual(message.metadata, bytearray(b"metadata")) received_payloads = [] for payload in message.payloads: self.assertEqual(payload.metadata, bytearray(b"a greeting")) received_payloads.append(bytearray(payload.length)) payload.buffer = received_payloads[-1] received_tensors = [] for tensor in message.tensors: self.assertEqual(tensor.metadata, bytearray(b"a place")) received_tensors.append(bytearray(tensor.length)) tensor.buffer = received_tensors[-1] client_pipe.read(message, on_read) def on_read() -> None: read_completed.set() client_pipe.read_descriptor(on_read_descriptor) write_completed.wait() 
read_completed.wait() self.assertEqual(received_payloads, [bytearray(b"Hello ")]) self.assertEqual(received_tensors, [bytearray(b"World!")]) # Due to a current limitation we're not releasing the GIL when calling # the context's destructor, which implicitly calls join, which may fire # some callbacks that also try to acquire the GIL and thus deadlock. # So, for now, we must explicitly call join. # See https://github.com/pybind/pybind11/issues/1446. context.join() if __name__ == "__main__": unittest.main() ================================================ FILE: tensorpipe/test/test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include // One-time init to use EPIPE errors instead of SIGPIPE namespace { struct Initializer { explicit Initializer() { signal(SIGPIPE, SIG_IGN); } }; Initializer initializer; } // namespace ================================================ FILE: tensorpipe/test/test_environment.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #if TP_USE_CUDA #include #include #include #include #include #include #endif // TP_USE_CUDA int TestEnvironment::numCudaDevices() { static int count = -1; if (count == -1) { #if TP_USE_CUDA pid_t pid = fork(); TP_THROW_SYSTEM_IF(pid < 0, errno) << "Failed to fork"; if (pid == 0) { int res; TP_CUDA_CHECK(cudaGetDeviceCount(&res)); std::exit(res); } else { int status; TP_THROW_SYSTEM_IF(waitpid(pid, &status, 0) < 0, errno) << "Failed to wait for child process"; TP_THROW_ASSERT_IF(!WIFEXITED(status)); count = WEXITSTATUS(status); } #else // TP_USE_CUDA count = 0; #endif // TP_USE_CUDA } return count; } ================================================ FILE: tensorpipe/test/test_environment.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once class TestEnvironment { public: static int numCudaDevices(); }; ================================================ FILE: tensorpipe/test/transport/connection_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #include using namespace tensorpipe; using namespace tensorpipe::transport; TEST_P(TransportTest, Connection_Initialization) { constexpr size_t numBytes = 13; std::array garbage; testConnection( [&](std::shared_ptr conn) { doRead( conn, [&](const Error& error, const void* /* unused */, size_t len) { ASSERT_FALSE(error) << error.what(); ASSERT_EQ(len, garbage.size()); peers_->done(PeerGroup::kServer); }); peers_->join(PeerGroup::kServer); }, [&](std::shared_ptr conn) { doWrite(conn, garbage.data(), garbage.size(), [&](const Error& error) { ASSERT_FALSE(error) << error.what(); peers_->done(PeerGroup::kClient); }); peers_->join(PeerGroup::kClient); }); } TEST_P(TransportTest, Connection_InitializationError) { int numRequests = 10; testConnection( [&](std::shared_ptr /* unused */) { // Closes connection }, [&](std::shared_ptr conn) { for (int i = 0; i < numRequests; i++) { std::promise readCompletedProm; doRead( conn, [&, conn]( const Error& error, const void* /* unused */, size_t /* unused */) { ASSERT_TRUE(error); readCompletedProm.set_value(); }); readCompletedProm.get_future().wait(); } }); } // Disabled because no one really knows what this test was meant to check. TEST_P(TransportTest, DISABLED_Connection_DestroyConnectionFromCallback) { testConnection( [&](std::shared_ptr /* unused */) { // Closes connection }, [&](std::shared_ptr conn) { // This should be the only connection instance. EXPECT_EQ(conn.use_count(), 1); // Move connection instance to lambda scope, so we can destroy // the only instance we have from the callback itself. This // tests that the transport keeps the connection alive as long // as it's executing a callback. doRead( conn, [conn]( const Error& /* unused */, const void* /* unused */, size_t /* unused */) mutable { // Destroy connection from within callback. 
EXPECT_GT(conn.use_count(), 1); conn.reset(); }); }); } namespace { struct MyNopType { uint32_t myIntField; NOP_STRUCTURE(MyNopType, myIntField); }; } // namespace TEST_P(TransportTest, Connection_NopWrite) { constexpr size_t kSize = 0x42; testConnection( [&](std::shared_ptr conn) { auto holder = std::make_shared>(); MyNopType& object = holder->getObject(); conn->read(*holder, [&, conn, holder](const Error& error) { ASSERT_FALSE(error) << error.what(); ASSERT_EQ(object.myIntField, kSize); peers_->done(PeerGroup::kServer); }); peers_->join(PeerGroup::kServer); }, [&](std::shared_ptr conn) { auto holder = std::make_shared>(); MyNopType& object = holder->getObject(); object.myIntField = kSize; conn->write(*holder, [&, conn, holder](const Error& error) { ASSERT_FALSE(error) << error.what(); peers_->done(PeerGroup::kClient); }); peers_->join(PeerGroup::kClient); }); } TEST_P(TransportTest, Connection_QueueWritesBeforeReads) { constexpr int kMsgSize = 16 * 1024; constexpr int numMsg = 10; const std::string kReady = "ready"; std::string msg[numMsg]; for (int i = 0; i < numMsg; i++) { msg[i] = std::string(kMsgSize, static_cast(i)); } testConnection( [&](std::shared_ptr conn) { for (int i = 0; i < numMsg; i++) { doWrite( conn, msg[i].c_str(), msg[i].length(), [&, conn, i](const Error& error) { ASSERT_FALSE(error) << error.what(); if (i == numMsg - 1) { peers_->send(PeerGroup::kClient, kReady); peers_->done(PeerGroup::kServer); } }); } peers_->join(PeerGroup::kServer); }, [&](std::shared_ptr conn) { ASSERT_EQ(kReady, peers_->recv(PeerGroup::kClient)); for (int i = 0; i < numMsg; i++) { doRead( conn, [&, conn, i](const Error& error, const void* data, size_t len) { ASSERT_FALSE(error) << error.what(); ASSERT_EQ(len, msg[i].length()); const char* cdata = (const char*)data; for (int j = 0; j < len; ++j) { const char c = cdata[j]; ASSERT_EQ(c, msg[i][j]) << "Wrong value at position " << j << " of " << msg[i].length(); } if (i == numMsg - 1) { peers_->done(PeerGroup::kClient); } 
}); } peers_->join(PeerGroup::kClient); }); } // TODO: Enable this test when uv transport could handle TEST_P(TransportTest, DISABLED_Connection_EmptyBuffer) { constexpr size_t numBytes = 13; std::array garbage; int ioNum = 100; testConnection( [&](std::shared_ptr conn) { std::atomic n(ioNum); for (int i = 0; i < ioNum; i++) { if (i % 2 == 0) { // Empty buffer doRead( conn, nullptr, 0, [&, conn](const Error& error, const void* ptr, size_t len) { ASSERT_FALSE(error) << error.what(); ASSERT_EQ(len, 0); ASSERT_EQ(ptr, nullptr); if (--n == 0) { peers_->done(PeerGroup::kServer); } }); } else { // Garbage buffer doRead( conn, [&, conn]( const Error& error, const void* /* unused */, size_t len) { ASSERT_FALSE(error) << error.what(); ASSERT_EQ(len, garbage.size()); if (--n == 0) { peers_->done(PeerGroup::kServer); } }); } } peers_->join(PeerGroup::kServer); }, [&](std::shared_ptr conn) { std::atomic n(ioNum); for (int i = 0; i < ioNum; i++) { if ((i & 1) == 0) { // Empty buffer doWrite(conn, nullptr, 0, [&, conn](const Error& error) { ASSERT_FALSE(error) << error.what(); if (--n == 0) { peers_->done(PeerGroup::kClient); } }); } else { // Garbage buffer doWrite( conn, garbage.data(), garbage.size(), [&, conn](const Error& error) { ASSERT_FALSE(error) << error.what(); if (--n == 0) { peers_->done(PeerGroup::kClient); } }); } } peers_->join(PeerGroup::kClient); }); } ================================================ FILE: tensorpipe/test/transport/context_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include using namespace tensorpipe; using namespace tensorpipe::transport; TEST_P(TransportTest, Context_Basics) { auto context = GetParam()->getContext(); auto addr = GetParam()->defaultAddr(); { std::mutex mutex; std::condition_variable cv; std::vector> connections; // Listener runs callback for every new connection. auto listener = context->listen(addr); listener->accept( [&](const Error& error, std::shared_ptr connection) { ASSERT_FALSE(error) << error.what(); std::lock_guard lock(mutex); connections.push_back(std::move(connection)); cv.notify_one(); }); // Connect to listener. auto conn = context->connect(listener->addr()); // Wait for new connection { std::unique_lock lock(mutex); while (connections.empty()) { cv.wait(lock); } } } context->join(); } TEST_P(TransportTest, Context_DomainDescriptor) { auto context = GetParam()->getContext(); { const auto& domainDescriptor = context->domainDescriptor(); EXPECT_FALSE(domainDescriptor.empty()); } context->join(); } ================================================ FILE: tensorpipe/test/transport/ibv/connection_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include using namespace tensorpipe; using namespace tensorpipe::transport; namespace { class IbvTransportTest : public TransportTest {}; IbvTransportTestHelper helper; // This value is defined in tensorpipe/transport/ibv/connection.h static constexpr auto kBufferSize = 2 * 1024 * 1024; } // namespace TEST_P(IbvTransportTest, Chunking) { // This is larger than the default ring buffer size. 
const int kMsgSize = 5 * kBufferSize; std::string srcBuf(kMsgSize, 0x42); auto dstBuf = std::make_unique(kMsgSize); testConnection( [&](std::shared_ptr conn) { doRead( conn, dstBuf.get(), kMsgSize, [&, conn](const Error& error, const void* ptr, size_t len) { ASSERT_FALSE(error) << error.what(); ASSERT_EQ(len, kMsgSize); ASSERT_EQ(ptr, dstBuf.get()); for (int i = 0; i < kMsgSize; ++i) { ASSERT_EQ(dstBuf[i], srcBuf[i]); } peers_->done(PeerGroup::kServer); }); peers_->join(PeerGroup::kServer); }, [&](std::shared_ptr conn) { doWrite( conn, srcBuf.c_str(), srcBuf.length(), [&, conn](const Error& error) { ASSERT_FALSE(error) << error.what(); peers_->done(PeerGroup::kClient); }); peers_->join(PeerGroup::kClient); }); } TEST_P(IbvTransportTest, ChunkingImplicitRead) { // This is larger than the default ring buffer size. const size_t kMsgSize = 5 * kBufferSize; std::string msg(kMsgSize, 0x42); testConnection( [&](std::shared_ptr conn) { doRead( conn, [&, conn](const Error& error, const void* ptr, size_t len) { ASSERT_FALSE(error) << error.what(); ASSERT_EQ(len, kMsgSize); for (int i = 0; i < kMsgSize; ++i) { ASSERT_EQ(static_cast(ptr)[i], msg[i]); } peers_->done(PeerGroup::kServer); }); peers_->join(PeerGroup::kServer); }, [&](std::shared_ptr conn) { doWrite(conn, msg.c_str(), msg.length(), [&, conn](const Error& error) { ASSERT_FALSE(error) << error.what(); peers_->done(PeerGroup::kClient); }); peers_->join(PeerGroup::kClient); }); } TEST_P(IbvTransportTest, QueueWrites) { // This is large enough that two of those will not fit in the ring buffer at // the same time. 
constexpr int numMsg = 2; constexpr size_t numBytes = (3 * kBufferSize) / 4; const std::string kReady = "ready"; std::array garbage; testConnection( [&](std::shared_ptr conn) { // Wait for peer to queue up writes before attempting to read EXPECT_EQ(kReady, peers_->recv(PeerGroup::kServer)); for (int i = 0; i < numMsg; ++i) { doRead( conn, [&, conn, i](const Error& error, const void* ptr, size_t len) { ASSERT_FALSE(error) << error.what(); ASSERT_EQ(len, numBytes); if (i == numMsg - 1) { peers_->done(PeerGroup::kServer); } }); } peers_->join(PeerGroup::kServer); }, [&](std::shared_ptr conn) { for (int i = 0; i < numMsg; ++i) { doWrite( conn, garbage.data(), garbage.size(), [&, conn, i](const Error& error) { ASSERT_FALSE(error) << error.what(); if (i == numMsg - 1) { peers_->done(PeerGroup::kClient); } }); } peers_->send(PeerGroup::kServer, kReady); peers_->join(PeerGroup::kClient); }); } namespace { struct MyNopType { std::string myStringField; NOP_STRUCTURE(MyNopType, myStringField); }; } // namespace TEST_P(IbvTransportTest, NopWriteWrapAround) { constexpr int numMsg = 2; constexpr size_t kSize = (3 * kBufferSize) / 4; testConnection( [&](std::shared_ptr conn) { for (int i = 0; i < numMsg; ++i) { auto holder = std::make_shared>(); conn->read(*holder, [&, conn, holder, i](const Error& error) { ASSERT_FALSE(error) << error.what(); ASSERT_EQ(holder->getObject().myStringField.length(), kSize); if (i == numMsg - 1) { peers_->done(PeerGroup::kServer); } }); } peers_->join(PeerGroup::kServer); }, [&](std::shared_ptr conn) { for (int i = 0; i < numMsg; ++i) { auto holder = std::make_shared>(); holder->getObject().myStringField = std::string(kSize, 'B'); conn->write(*holder, [&, conn, holder, i](const Error& error) { ASSERT_FALSE(error) << error.what(); if (i == numMsg - 1) { peers_->done(PeerGroup::kClient); } }); } peers_->join(PeerGroup::kClient); }); } INSTANTIATE_TEST_CASE_P(Ibv, IbvTransportTest, ::testing::Values(&helper)); 
================================================ FILE: tensorpipe/test/transport/ibv/context_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include namespace { class IbvTransportContextTest : public TransportTest {}; IbvTransportTestHelper helper; } // namespace using namespace tensorpipe; // Linux-only because OSX machines on CircleCI cannot resolve their hostname #ifdef __linux__ TEST_P(IbvTransportContextTest, LookupHostnameAddress) { Error error; std::string addr; std::tie(error, addr) = transport::ibv::lookupAddrForHostname(); EXPECT_FALSE(error) << error.what(); EXPECT_NE(addr, ""); } #endif // Interface name conventions change based on platform. Linux uses "lo", OSX // uses lo0, Windows uses integers. #ifdef __linux__ #define LOOPBACK_INTERFACE "lo" #elif __APPLE__ #define LOOPBACK_INTERFACE "lo0" #endif #ifdef LOOPBACK_INTERFACE TEST_P(IbvTransportContextTest, LookupInterfaceAddress) { Error error; std::string addr; std::tie(error, addr) = transport::ibv::lookupAddrForIface(LOOPBACK_INTERFACE); EXPECT_FALSE(error) << error.what(); EXPECT_NE(addr, ""); } #endif INSTANTIATE_TEST_CASE_P( Ibv, IbvTransportContextTest, ::testing::Values(&helper)); ================================================ FILE: tensorpipe/test/transport/ibv/ibv_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include namespace { IbvTransportTestHelper helper; } // namespace INSTANTIATE_TEST_CASE_P(Ibv, TransportTest, ::testing::Values(&helper)); ================================================ FILE: tensorpipe/test/transport/ibv/ibv_test.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include class IbvTransportTestHelper : public TransportTestHelper { protected: std::shared_ptr getContextInternal() override { return tensorpipe::transport::ibv::create(); } public: std::string defaultAddr() override { return "127.0.0.1"; } }; ================================================ FILE: tensorpipe/test/transport/ibv/sockaddr_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include using namespace tensorpipe::transport; namespace { int family(const ibv::Sockaddr& addr) { auto sockaddr = addr.addr(); return sockaddr->sa_family; } int port(const ibv::Sockaddr& addr) { auto sockaddr = addr.addr(); if (sockaddr->sa_family == AF_INET) { auto in = reinterpret_cast(sockaddr); return in->sin_port; } if (sockaddr->sa_family == AF_INET6) { auto in6 = reinterpret_cast(sockaddr); return in6->sin6_port; } return -1; } } // namespace TEST(IbvSockaddr, InetBadPort) { ASSERT_THROW( ibv::Sockaddr::createInetSockAddr("1.2.3.4:-1"), std::invalid_argument); ASSERT_THROW( ibv::Sockaddr::createInetSockAddr("1.2.3.4:65536"), std::invalid_argument); } TEST(IbvSockaddr, Inet) { { auto sa = ibv::Sockaddr::createInetSockAddr("1.2.3.4:5"); ASSERT_EQ(family(sa), AF_INET); ASSERT_EQ(port(sa), ntohs(5)); ASSERT_EQ(sa.str(), "1.2.3.4:5"); } { auto sa = ibv::Sockaddr::createInetSockAddr("1.2.3.4:0"); ASSERT_EQ(family(sa), AF_INET); ASSERT_EQ(port(sa), 0); ASSERT_EQ(sa.str(), "1.2.3.4:0"); } { auto sa = ibv::Sockaddr::createInetSockAddr("1.2.3.4"); ASSERT_EQ(family(sa), AF_INET); ASSERT_EQ(port(sa), 0); ASSERT_EQ(sa.str(), "1.2.3.4:0"); } } TEST(IbvSockaddr, Inet6BadPort) { ASSERT_THROW( ibv::Sockaddr::createInetSockAddr("[::1]:-1"), std::invalid_argument); ASSERT_THROW( ibv::Sockaddr::createInetSockAddr("[::1]:65536"), std::invalid_argument); ASSERT_THROW( ibv::Sockaddr::createInetSockAddr("]::1["), std::invalid_argument); } // Interface name conventions change based on platform. Linux uses "lo", OSX // uses lo0, Windows uses integers. 
#ifdef __linux__ #define LOOPBACK_INTERFACE "lo" #elif __APPLE__ #define LOOPBACK_INTERFACE "lo0" #endif TEST(IbvSockaddr, Inet6) { { auto sa = ibv::Sockaddr::createInetSockAddr("[::1]:5"); ASSERT_EQ(family(sa), AF_INET6); ASSERT_EQ(port(sa), ntohs(5)); ASSERT_EQ(sa.str(), "[::1]:5"); } { auto sa = ibv::Sockaddr::createInetSockAddr("[::1]:0"); ASSERT_EQ(family(sa), AF_INET6); ASSERT_EQ(port(sa), 0); ASSERT_EQ(sa.str(), "[::1]:0"); } { auto sa = ibv::Sockaddr::createInetSockAddr("::1"); ASSERT_EQ(family(sa), AF_INET6); ASSERT_EQ(port(sa), 0); ASSERT_EQ(sa.str(), "[::1]:0"); } #ifdef LOOPBACK_INTERFACE { auto sa = ibv::Sockaddr::createInetSockAddr("::1%" LOOPBACK_INTERFACE); ASSERT_EQ(family(sa), AF_INET6); ASSERT_EQ(port(sa), 0); ASSERT_EQ(sa.str(), "[::1%" LOOPBACK_INTERFACE "]:0"); } { sockaddr_in6 sa; std::memset(&sa, 0, sizeof(sa)); sa.sin6_family = AF_INET6; sa.sin6_port = ntohs(42); sa.sin6_flowinfo = 0; sa.sin6_addr.s6_addr[15] = 1; // Implicitly assuming that the loopback interface is the first one. sa.sin6_scope_id = 1; ibv::Sockaddr tpSa(reinterpret_cast(&sa), sizeof(sa)); ASSERT_EQ(tpSa.str(), "[::1%" LOOPBACK_INTERFACE "]:42"); } #endif } ================================================ FILE: tensorpipe/test/transport/listener_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include using namespace tensorpipe; using namespace tensorpipe::transport; TEST_P(TransportTest, Listener_Basics) { auto context = GetParam()->getContext(); auto addr = GetParam()->defaultAddr(); { std::mutex mutex; std::condition_variable cv; std::vector> connections; // Listener runs callback for every new connection. 
auto listener = context->listen(addr); listener->accept( [&](const Error& error, std::shared_ptr connection) { ASSERT_FALSE(error) << error.what(); std::lock_guard lock(mutex); connections.push_back(std::move(connection)); cv.notify_one(); }); // Connect to listener. auto connection = context->connect(listener->addr()); // Wait for new connection { std::unique_lock lock(mutex); while (connections.empty()) { cv.wait(lock); } } } context->join(); } TEST_P(TransportTest, Listener_AcceptCallbacksAreQueued) { auto context = GetParam()->getContext(); auto addr = GetParam()->defaultAddr(); { auto listener = context->listen(addr); int numAccepts = 0; std::promise donePromise; for (int i = 0; i < 10; ++i) { listener->accept( [&, i](const Error& error, std::shared_ptr /*unused*/) { if (error) { donePromise.set_exception( std::make_exception_ptr(std::runtime_error(error.what()))); } else { EXPECT_EQ(i, numAccepts); numAccepts++; if (numAccepts == 10) { donePromise.set_value(); } } }); } // Avoid connections to be destroyed before being established. std::vector> conns; for (int i = 0; i < 10; ++i) { auto c = context->connect(listener->addr()); conns.push_back(std::move(c)); } donePromise.get_future().get(); } context->join(); } TEST_P(TransportTest, Listener_IncomingConnectionsAreQueued) { auto context = GetParam()->getContext(); auto addr = GetParam()->defaultAddr(); { auto listener = context->listen(addr); int numAccepts = 0; std::promise donePromise; // Avoid connections to be destroyed before being established. 
std::vector> conns; for (int i = 0; i < 10; ++i) { auto c = context->connect(listener->addr()); conns.push_back(std::move(c)); } for (int i = 0; i < 10; ++i) { listener->accept( [&, i](const Error& error, std::shared_ptr /*unused*/) { if (error) { donePromise.set_exception( std::make_exception_ptr(std::runtime_error(error.what()))); } else { EXPECT_EQ(i, numAccepts); numAccepts++; if (numAccepts == 10) { donePromise.set_value(); } } }); } donePromise.get_future().get(); } context->join(); } TEST_P(TransportTest, Listener_CreateThenCloseAndThenGetAddress) { auto context = GetParam()->getContext(); auto listener = context->listen(GetParam()->defaultAddr()); listener->close(); auto addr = listener->addr(); std::promise acceptPromise; listener->accept( [&](const Error& error, std::shared_ptr /*unused*/) { if (error) { acceptPromise.set_exception( std::make_exception_ptr(std::runtime_error(error.what()))); } else { acceptPromise.set_value(); } }); auto connection = context->connect(addr); std::promise writePromise; connection->write(nullptr, 0, [&](const Error& error) { if (error) { writePromise.set_exception( std::make_exception_ptr(std::runtime_error(error.what()))); } else { writePromise.set_value(); } }); try { acceptPromise.get_future().get(); } catch (const std::runtime_error&) { // Expected } try { writePromise.get_future().get(); } catch (const std::runtime_error&) { // Expected } context->join(); } TEST_P(TransportTest, Listener_CreateAfterClosingContextAndThenGetAddress) { auto context = GetParam()->getContext(); // This means the listener will be created in an already-closed state. 
context->close(); auto listener = context->listen(GetParam()->defaultAddr()); auto addr = listener->addr(); std::promise acceptPromise; listener->accept( [&](const Error& error, std::shared_ptr /*unused*/) { if (error) { acceptPromise.set_exception( std::make_exception_ptr(std::runtime_error(error.what()))); } else { acceptPromise.set_value(); } }); auto connection = context->connect(addr); std::promise writePromise; connection->write(nullptr, 0, [&](const Error& error) { if (error) { writePromise.set_exception( std::make_exception_ptr(std::runtime_error(error.what()))); } else { writePromise.set_value(); } }); try { acceptPromise.get_future().get(); } catch (const std::runtime_error&) { // Expected } try { writePromise.get_future().get(); } catch (const std::runtime_error&) { // Expected } context->join(); } ================================================ FILE: tensorpipe/test/transport/shm/connection_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include using namespace tensorpipe; using namespace tensorpipe::transport; namespace { class ShmTransportTest : public TransportTest {}; SHMTransportTestHelper helper; // This value is defined in tensorpipe/transport/shm/connection.h static constexpr auto kBufferSize = 2 * 1024 * 1024; } // namespace TEST_P(ShmTransportTest, Chunking) { // This is larger than the default ring buffer size. 
const int kMsgSize = 5 * kBufferSize; std::string srcBuf(kMsgSize, 0x42); auto dstBuf = std::make_unique(kMsgSize); testConnection( [&](std::shared_ptr conn) { doRead( conn, dstBuf.get(), kMsgSize, [&, conn](const Error& error, const void* ptr, size_t len) { ASSERT_FALSE(error) << error.what(); ASSERT_EQ(len, kMsgSize); ASSERT_EQ(ptr, dstBuf.get()); for (int i = 0; i < kMsgSize; ++i) { ASSERT_EQ(dstBuf[i], srcBuf[i]); } peers_->done(PeerGroup::kServer); }); peers_->join(PeerGroup::kServer); }, [&](std::shared_ptr conn) { doWrite( conn, srcBuf.c_str(), srcBuf.length(), [&, conn](const Error& error) { ASSERT_FALSE(error) << error.what(); peers_->done(PeerGroup::kClient); }); peers_->join(PeerGroup::kClient); }); } TEST_P(ShmTransportTest, ChunkingImplicitRead) { // This is larger than the default ring buffer size. const size_t kMsgSize = 5 * kBufferSize; std::string msg(kMsgSize, 0x42); testConnection( [&](std::shared_ptr conn) { doRead( conn, [&, conn](const Error& error, const void* ptr, size_t len) { ASSERT_FALSE(error) << error.what(); ASSERT_EQ(len, kMsgSize); for (int i = 0; i < kMsgSize; ++i) { ASSERT_EQ(static_cast(ptr)[i], msg[i]); } peers_->done(PeerGroup::kServer); }); peers_->join(PeerGroup::kServer); }, [&](std::shared_ptr conn) { doWrite(conn, msg.c_str(), msg.length(), [&, conn](const Error& error) { ASSERT_FALSE(error) << error.what(); peers_->done(PeerGroup::kClient); }); peers_->join(PeerGroup::kClient); }); } TEST_P(ShmTransportTest, QueueWrites) { // This is large enough that two of those will not fit in the ring buffer at // the same time. 
constexpr int numMsg = 2; constexpr size_t numBytes = (3 * kBufferSize) / 4; const std::string kReady = "ready"; std::array garbage; testConnection( [&](std::shared_ptr conn) { // Wait for peer to queue up writes before attempting to read EXPECT_EQ(kReady, peers_->recv(PeerGroup::kServer)); for (int i = 0; i < numMsg; ++i) { doRead( conn, [&, conn, i](const Error& error, const void* ptr, size_t len) { ASSERT_FALSE(error) << error.what(); ASSERT_EQ(len, numBytes); if (i == numMsg - 1) { peers_->done(PeerGroup::kServer); } }); } peers_->join(PeerGroup::kServer); }, [&](std::shared_ptr conn) { for (int i = 0; i < numMsg; ++i) { doWrite( conn, garbage.data(), garbage.size(), [&, conn, i](const Error& error) { ASSERT_FALSE(error) << error.what(); if (i == numMsg - 1) { peers_->done(PeerGroup::kClient); } }); } peers_->send(PeerGroup::kServer, kReady); peers_->join(PeerGroup::kClient); }); } namespace { struct MyNopType { std::string myStringField; NOP_STRUCTURE(MyNopType, myStringField); }; } // namespace TEST_P(ShmTransportTest, NopWriteWrapAround) { constexpr int numMsg = 2; constexpr size_t kSize = (3 * kBufferSize) / 4; testConnection( [&](std::shared_ptr conn) { for (int i = 0; i < numMsg; ++i) { auto holder = std::make_shared>(); conn->read(*holder, [&, conn, holder, i](const Error& error) { ASSERT_FALSE(error) << error.what(); ASSERT_EQ(holder->getObject().myStringField.length(), kSize); if (i == numMsg - 1) { peers_->done(PeerGroup::kServer); } }); } peers_->join(PeerGroup::kServer); }, [&](std::shared_ptr conn) { for (int i = 0; i < numMsg; ++i) { auto holder = std::make_shared>(); holder->getObject().myStringField = std::string(kSize, 'B'); conn->write(*holder, [&, conn, holder, i](const Error& error) { ASSERT_FALSE(error) << error.what(); if (i == numMsg - 1) { peers_->done(PeerGroup::kClient); } }); } peers_->join(PeerGroup::kClient); }); } INSTANTIATE_TEST_CASE_P(Shm, ShmTransportTest, ::testing::Values(&helper)); 
================================================ FILE: tensorpipe/test/transport/shm/listener_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include #include #include using namespace tensorpipe; using namespace tensorpipe::transport; namespace { class ShmListenerTest : public TransportTest {}; SHMTransportTestHelper helper; std::string generateUniqueAddr() { const ::testing::TestInfo* const testInfo = ::testing::UnitTest::GetInstance()->current_test_info(); std::ostringstream ss; ss << "tensorpipe_test_" << testInfo->test_suite_name() << "." << testInfo->name() << "_" << ::getpid(); return ss.str(); } } // namespace TEST_P(ShmListenerTest, ExplicitAbstractSocketName) { std::string expectedAddr = generateUniqueAddr(); std::shared_ptr ctx = GetParam()->getContext(); std::shared_ptr listener = ctx->listen(expectedAddr); std::string actualAddr = listener->addr(); ASSERT_EQ(actualAddr, expectedAddr); std::shared_ptr outgoingConnection = ctx->connect(actualAddr); std::promise prom; listener->accept( [&](const Error& error, std::shared_ptr /* unused */) { EXPECT_FALSE(error) << error.what(); prom.set_value(); }); std::future_status res = prom.get_future().wait_for(std::chrono::seconds(1)); ASSERT_NE(res, std::future_status::timeout); } TEST_P(ShmListenerTest, AutobindAbstractSocketName) { std::shared_ptr ctx = GetParam()->getContext(); std::shared_ptr listener = ctx->listen(""); std::string addr = listener->addr(); ASSERT_NE(addr, ""); // Since Linux 2.3.15 (Aug 1999) the address is in this format, see unix(7). 
ASSERT_THAT(addr, ::testing::MatchesRegex("[0-9a-f]{5}")); std::shared_ptr outgoingConnection = ctx->connect(addr); std::promise prom; listener->accept( [&](const Error& error, std::shared_ptr /* unused */) { EXPECT_FALSE(error) << error.what(); prom.set_value(); }); std::future_status res = prom.get_future().wait_for(std::chrono::seconds(1)); ASSERT_NE(res, std::future_status::timeout); } INSTANTIATE_TEST_CASE_P(Shm, ShmListenerTest, ::testing::Values(&helper)); ================================================ FILE: tensorpipe/test/transport/shm/reactor_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include #include #include using namespace tensorpipe; using namespace tensorpipe::transport::shm; namespace { void run(std::function fn1, std::function fn2) { int fds[2]; { auto rv = socketpair(AF_UNIX, SOCK_STREAM, 0, fds); if (rv != 0) { TP_THROW_SYSTEM(errno) << "Failed to create socket pair"; } } { auto pid = fork(); TP_DCHECK_GE(pid, 0); if (pid == 0) { close(fds[0]); fn2(fds[1]); close(fds[1]); exit(0); } } close(fds[1]); fn1(fds[0]); close(fds[0]); wait(nullptr); } } // namespace TEST(ShmReactor, Basic) { run( [](int fd) { tensorpipe::Queue queue; auto reactor = std::make_shared(); auto token1 = reactor->add([&] { queue.push(1); }); auto token2 = reactor->add([&] { queue.push(2); }); // Share reactor fds and token with other process. { auto socket = Socket(fd); auto fds = reactor->fds(); auto error = socket.sendPayloadAndFds( token1, token2, std::get<0>(fds), std::get<1>(fds)); ASSERT_FALSE(error) << error.what(); } // Wait for other process to run trigger. 
ASSERT_EQ(queue.pop(), 1); ASSERT_EQ(queue.pop(), 2); reactor->remove(token1); reactor->remove(token2); }, [](int fd) { Reactor::TToken token1; Reactor::TToken token2; Fd header; Fd data; // Wait for other process to share reactor fds and token. { auto socket = Socket(fd); auto error = socket.recvPayloadAndFds(token1, token2, header, data); ASSERT_FALSE(error) << error.what(); } // Create and run trigger. This should wake up the other // process and run the registered function. Reactor::Trigger trigger(std::move(header), std::move(data)); trigger.run(token1); trigger.run(token2); }); } TEST(ShmReactor, TokenReuse) { tensorpipe::Queue queue(3); auto reactor = std::make_shared(); auto t1 = reactor->add([&] { queue.push(1); }); auto t2 = reactor->add([&] { queue.push(2); }); auto t3 = reactor->add([&] { queue.push(3); }); // Check that they're monotonically increasing. ASSERT_GT(t2, t1); ASSERT_GT(t3, t2); // Remove token and check that it is reused. reactor->remove(t1); auto t4 = reactor->add([&] { queue.push(4); }); ASSERT_EQ(t4, t1); // Remove multiple tokens and check that they're reused in order. reactor->remove(t2); reactor->remove(t3); auto t5 = reactor->add([&] { queue.push(5); }); auto t6 = reactor->add([&] { queue.push(6); }); ASSERT_EQ(t5, t2); ASSERT_EQ(t6, t3); reactor->remove(t4); reactor->remove(t5); reactor->remove(t6); } ================================================ FILE: tensorpipe/test/transport/shm/shm_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include namespace { SHMTransportTestHelper helper; } // namespace INSTANTIATE_TEST_CASE_P(Shm, TransportTest, ::testing::Values(&helper)); ================================================ FILE: tensorpipe/test/transport/shm/shm_test.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include class SHMTransportTestHelper : public TransportTestHelper { protected: std::shared_ptr getContextInternal() override { return tensorpipe::transport::shm::create(); } public: std::string defaultAddr() override { return ""; } }; ================================================ FILE: tensorpipe/test/transport/shm/sockaddr_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include using namespace tensorpipe::transport; TEST(ShmSockaddr, FromToString) { auto addr = shm::Sockaddr::createAbstractUnixAddr("foo"); ASSERT_EQ(addr.str(), std::string("foo")); } ================================================ FILE: tensorpipe/test/transport/transport_test.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include #include #include #include #include #include class TransportTestHelper { public: std::shared_ptr getContext( bool skipViabilityCheck = false) { std::shared_ptr ctx = getContextInternal(); if (!skipViabilityCheck) { EXPECT_TRUE(ctx->isViable()); } return ctx; } virtual std::string defaultAddr() = 0; virtual std::unique_ptr makePeerGroup() { return std::make_unique(); } virtual ~TransportTestHelper() = default; protected: virtual std::shared_ptr getContextInternal() = 0; }; class TransportTest : public ::testing::TestWithParam { protected: std::unique_ptr peers_; public: TransportTest() : peers_(GetParam()->makePeerGroup()) {} void testConnection( std::function)> listeningFn, std::function)> connectingFn) { using namespace tensorpipe::transport; peers_->spawn( [&] { auto ctx = GetParam()->getContext(); ctx->setId("server"); auto addr = GetParam()->defaultAddr(); auto listener = ctx->listen(addr); std::promise> connectionProm; listener->accept([&](const tensorpipe::Error& error, std::shared_ptr conn) { ASSERT_FALSE(error) << error.what(); connectionProm.set_value(std::move(conn)); }); peers_->send(PeerGroup::kClient, listener->addr()); listeningFn(connectionProm.get_future().get()); ctx->join(); }, [&] { auto ctx = GetParam()->getContext(); ctx->setId("client"); auto listenerAddr = peers_->recv(PeerGroup::kClient); connectingFn(ctx->connect(listenerAddr)); ctx->join(); }); } // Add to a closure to check the callback is called before being destroyed class Bomb { public: Bomb() = default; Bomb(const Bomb&) = delete; Bomb(Bomb&& b) { defused_ = b.defused_; b.defused_ = false; } Bomb& operator=(const Bomb&) = delete; Bomb& operator=(Bomb&&) = delete; void defuse() { defused_ = true; } ~Bomb() { EXPECT_TRUE(defused_); } private: bool defused_ = false; }; std::shared_ptr armBomb() { return std::make_shared(); } void doRead( std::shared_ptr conn, tensorpipe::transport::Connection::read_callback_fn fn) { auto mutex = std::make_shared(); 
std::lock_guard outerLock(*mutex); // We acquire the same mutex while calling read and inside its callback so // that we deadlock if the callback is invoked inline. conn->read( [fn{std::move(fn)}, mutex, bomb{armBomb()}]( const tensorpipe::Error& error, const void* ptr, size_t len) { std::lock_guard innerLock(*mutex); bomb->defuse(); fn(error, ptr, len); }); } void doRead( std::shared_ptr conn, void* ptr, size_t length, tensorpipe::transport::Connection::read_callback_fn fn) { auto mutex = std::make_shared(); std::lock_guard outerLock(*mutex); // We acquire the same mutex while calling read and inside its callback so // that we deadlock if the callback is invoked inline. conn->read( ptr, length, [fn{std::move(fn)}, mutex, bomb{armBomb()}]( const tensorpipe::Error& error, const void* ptr, size_t len) { std::lock_guard innerLock(*mutex); bomb->defuse(); fn(error, ptr, len); }); } void doWrite( std::shared_ptr conn, const void* ptr, size_t length, tensorpipe::transport::Connection::write_callback_fn fn) { auto mutex = std::make_shared(); // We acquire the same mutex while calling write and inside its callback // so that we deadlock if the callback is invoked inline. std::lock_guard outerLock(*mutex); conn->write( ptr, length, [fn{std::move(fn)}, mutex, bomb{armBomb()}]( const tensorpipe::Error& error) { std::lock_guard innerLock(*mutex); bomb->defuse(); fn(error); }); } }; ================================================ FILE: tensorpipe/test/transport/uv/connection_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include namespace { class UVTransportConnectionTest : public TransportTest {}; UVTransportTestHelper helper; } // namespace using namespace tensorpipe; using namespace tensorpipe::transport; TEST_P(UVTransportConnectionTest, LargeWrite) { constexpr int kMsgSize = 16 * 1024 * 1024; std::string msg(kMsgSize, 0x42); testConnection( [&](std::shared_ptr conn) { doWrite(conn, msg.c_str(), msg.length(), [&, conn](const Error& error) { ASSERT_FALSE(error) << error.what(); peers_->done(PeerGroup::kServer); }); peers_->join(PeerGroup::kServer); }, [&](std::shared_ptr conn) { doRead( conn, [&, conn](const Error& error, const void* data, size_t len) { ASSERT_FALSE(error) << error.what(); ASSERT_EQ(len, msg.length()); const char* cdata = (const char*)data; for (int i = 0; i < len; ++i) { const char c = cdata[i]; ASSERT_EQ(c, msg[i]) << "Wrong value at position " << i << " of " << msg.length(); } peers_->done(PeerGroup::kClient); }); peers_->join(PeerGroup::kClient); }); } INSTANTIATE_TEST_CASE_P( Uv, UVTransportConnectionTest, ::testing::Values(&helper)); ================================================ FILE: tensorpipe/test/transport/uv/context_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include namespace { class UVTransportContextTest : public TransportTest {}; UVTransportTestHelper helper; } // namespace using namespace tensorpipe; // Linux-only because OSX machines on CircleCI cannot resolve their hostname #ifdef __linux__ TEST_P(UVTransportContextTest, LookupHostnameAddress) { Error error; std::string addr; std::tie(error, addr) = transport::uv::lookupAddrForHostname(); EXPECT_FALSE(error) << error.what(); EXPECT_NE(addr, ""); } #endif // Interface name conventions change based on platform. 
Linux uses "lo", OSX // uses lo0, Windows uses integers. #ifdef __linux__ #define LOOPBACK_INTERFACE "lo" #elif __APPLE__ #define LOOPBACK_INTERFACE "lo0" #endif #ifdef LOOPBACK_INTERFACE TEST_P(UVTransportContextTest, LookupInterfaceAddress) { Error error; std::string addr; std::tie(error, addr) = transport::uv::lookupAddrForIface(LOOPBACK_INTERFACE); EXPECT_FALSE(error) << error.what(); EXPECT_NE(addr, ""); } #endif TEST_P(UVTransportContextTest, LookupAddressLikeNccl) { Error error; std::string addr; std::tie(error, addr) = transport::uv::lookupAddrLikeNccl(); EXPECT_FALSE(error) << error.what(); EXPECT_NE(addr, ""); } INSTANTIATE_TEST_CASE_P(Uv, UVTransportContextTest, ::testing::Values(&helper)); ================================================ FILE: tensorpipe/test/transport/uv/loop_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include using namespace tensorpipe::transport::uv; namespace test { namespace transport { namespace uv { TEST(UvLoop, Defer) { Loop loop; { // Defer function on event loop thread. std::promise prom; loop.deferToLoop([&] { prom.set_value(std::this_thread::get_id()); }); ASSERT_NE(std::this_thread::get_id(), prom.get_future().get()); } loop.join(); } } // namespace uv } // namespace transport } // namespace test ================================================ FILE: tensorpipe/test/transport/uv/sockaddr_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include using namespace tensorpipe::transport; namespace { int family(const uv::Sockaddr& addr) { auto sockaddr = addr.addr(); return sockaddr->sa_family; } int port(const uv::Sockaddr& addr) { auto sockaddr = addr.addr(); if (sockaddr->sa_family == AF_INET) { auto in = reinterpret_cast(sockaddr); return in->sin_port; } if (sockaddr->sa_family == AF_INET6) { auto in6 = reinterpret_cast(sockaddr); return in6->sin6_port; } return -1; } } // namespace TEST(UvSockaddr, InetBadPort) { ASSERT_THROW( uv::Sockaddr::createInetSockAddr("1.2.3.4:-1"), std::invalid_argument); ASSERT_THROW( uv::Sockaddr::createInetSockAddr("1.2.3.4:65536"), std::invalid_argument); } TEST(UvSockaddr, Inet) { { auto sa = uv::Sockaddr::createInetSockAddr("1.2.3.4:5"); ASSERT_EQ(family(sa), AF_INET); ASSERT_EQ(port(sa), ntohs(5)); ASSERT_EQ(sa.str(), "1.2.3.4:5"); } { auto sa = uv::Sockaddr::createInetSockAddr("1.2.3.4:0"); ASSERT_EQ(family(sa), AF_INET); ASSERT_EQ(port(sa), 0); ASSERT_EQ(sa.str(), "1.2.3.4:0"); } { auto sa = uv::Sockaddr::createInetSockAddr("1.2.3.4"); ASSERT_EQ(family(sa), AF_INET); ASSERT_EQ(port(sa), 0); ASSERT_EQ(sa.str(), "1.2.3.4:0"); } } TEST(UvSockaddr, Inet6BadPort) { ASSERT_THROW( uv::Sockaddr::createInetSockAddr("[::1]:-1"), std::invalid_argument); ASSERT_THROW( uv::Sockaddr::createInetSockAddr("[::1]:65536"), std::invalid_argument); ASSERT_THROW( uv::Sockaddr::createInetSockAddr("]::1["), std::invalid_argument); } // Interface name conventions change based on platform. Linux uses "lo", OSX // uses lo0, Windows uses integers. 
#ifdef __linux__ #define LOOPBACK_INTERFACE "lo" #elif __APPLE__ #define LOOPBACK_INTERFACE "lo0" #endif TEST(UvSockaddr, Inet6) { { auto sa = uv::Sockaddr::createInetSockAddr("[::1]:5"); ASSERT_EQ(family(sa), AF_INET6); ASSERT_EQ(port(sa), ntohs(5)); ASSERT_EQ(sa.str(), "[::1]:5"); } { auto sa = uv::Sockaddr::createInetSockAddr("[::1]:0"); ASSERT_EQ(family(sa), AF_INET6); ASSERT_EQ(port(sa), 0); ASSERT_EQ(sa.str(), "[::1]:0"); } { auto sa = uv::Sockaddr::createInetSockAddr("::1"); ASSERT_EQ(family(sa), AF_INET6); ASSERT_EQ(port(sa), 0); ASSERT_EQ(sa.str(), "[::1]:0"); } #ifdef LOOPBACK_INTERFACE { auto sa = uv::Sockaddr::createInetSockAddr("::1%" LOOPBACK_INTERFACE); ASSERT_EQ(family(sa), AF_INET6); ASSERT_EQ(port(sa), 0); ASSERT_EQ(sa.str(), "[::1%" LOOPBACK_INTERFACE "]:0"); } { sockaddr_in6 sa; std::memset(&sa, 0, sizeof(sa)); sa.sin6_family = AF_INET6; sa.sin6_port = ntohs(42); sa.sin6_flowinfo = 0; sa.sin6_addr.s6_addr[15] = 1; // Implicitly assuming that the loopback interface is the first one. sa.sin6_scope_id = 1; uv::Sockaddr tpSa(reinterpret_cast(&sa), sizeof(sa)); ASSERT_EQ(tpSa.str(), "[::1%" LOOPBACK_INTERFACE "]:42"); } #endif } ================================================ FILE: tensorpipe/test/transport/uv/uv_test.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include namespace { UVTransportTestHelper helper; } // namespace INSTANTIATE_TEST_CASE_P(Uv, TransportTest, ::testing::Values(&helper)); ================================================ FILE: tensorpipe/test/transport/uv/uv_test.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include class UVTransportTestHelper : public TransportTestHelper { protected: std::shared_ptr getContextInternal() override { return tensorpipe::transport::uv::create(); } public: std::string defaultAddr() override { return "127.0.0.1"; } }; ================================================ FILE: tensorpipe/transport/connection.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include namespace tensorpipe { namespace transport { class Connection { public: using read_callback_fn = std::function; virtual void read(read_callback_fn fn) = 0; virtual void read(void* ptr, size_t length, read_callback_fn fn) = 0; using write_callback_fn = std::function; virtual void write(const void* ptr, size_t length, write_callback_fn fn) = 0; // // Helper functions for reading/writing nop objects. // // Read and parse a nop object. // // This function may be overridden by a subclass. // // For example, the shm transport may be able to bypass reading into a // temporary buffer and instead instead read directly from its peer's // ring buffer. This saves an allocation and a memory copy. // using read_nop_callback_fn = std::function; virtual void read(AbstractNopHolder& object, read_nop_callback_fn fn) = 0; // Serialize and write nop object. // // This function may be overridden by a subclass. // // For example, the shm transport may be able to bypass serialization // into a temporary buffer and instead instead serialize directly into // its peer's ring buffer. This saves an allocation and a memory copy. 
// virtual void write(const AbstractNopHolder& object, write_callback_fn fn) = 0; // Tell the connection what its identifier is. // // This is only supposed to be called from the high-level pipe or from // channels. It will only used for logging and debugging purposes. virtual void setId(std::string id) = 0; virtual void close() = 0; virtual ~Connection() = default; }; } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/connection_boilerplate.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include namespace tensorpipe { namespace transport { template class ConnectionBoilerplate : public Connection { public: template ConnectionBoilerplate( typename ConnectionImplBoilerplate::ConstructorToken token, std::shared_ptr context, std::string id, Args... args); explicit ConnectionBoilerplate(std::shared_ptr connection); ConnectionBoilerplate(const ConnectionBoilerplate&) = delete; ConnectionBoilerplate(ConnectionBoilerplate&&) = delete; ConnectionBoilerplate& operator=(const ConnectionBoilerplate&) = delete; ConnectionBoilerplate& operator=(ConnectionBoilerplate&&) = delete; // Queue a read operation. void read(read_callback_fn fn) override; void read(AbstractNopHolder& object, read_nop_callback_fn fn) override; void read(void* ptr, size_t length, read_callback_fn fn) override; // Perform a write operation. void write(const void* ptr, size_t length, write_callback_fn fn) override; void write(const AbstractNopHolder& object, write_callback_fn fn) override; // Tell the connection what its identifier is. void setId(std::string id) override; // Shut down the connection and its resources. 
void close() override; ~ConnectionBoilerplate() override; protected: // Using a shared_ptr allows us to detach the lifetime of the implementation // from the public object's one and perform the destruction asynchronously. const std::shared_ptr impl_; }; template template ConnectionBoilerplate::ConnectionBoilerplate( typename ConnectionImplBoilerplate::ConstructorToken token, std::shared_ptr context, std::string id, Args... args) : impl_(std::make_shared( token, std::move(context), std::move(id), std::forward(args)...)) { static_assert( std::is_base_of, TConn>:: value, ""); impl_->init(); } template ConnectionBoilerplate::ConnectionBoilerplate( std::shared_ptr connection) : impl_(std::move(connection)) { static_assert( std::is_base_of, TConn>:: value, ""); } template void ConnectionBoilerplate::read(read_callback_fn fn) { if (unlikely(!impl_)) { // FIXME In C++-17 perhaps a global static inline variable would be better? static Error error = TP_CREATE_ERROR(ContextNotViableError); fn(error, nullptr, 0); return; } impl_->read(std::move(fn)); } template void ConnectionBoilerplate::read( AbstractNopHolder& object, read_nop_callback_fn fn) { if (unlikely(!impl_)) { // FIXME In C++-17 perhaps a global static inline variable would be better? static Error error = TP_CREATE_ERROR(ContextNotViableError); fn(error); return; } impl_->read(object, std::move(fn)); } template void ConnectionBoilerplate::read( void* ptr, size_t length, read_callback_fn fn) { if (unlikely(!impl_)) { // FIXME In C++-17 perhaps a global static inline variable would be better? static Error error = TP_CREATE_ERROR(ContextNotViableError); fn(error, ptr, length); return; } impl_->read(ptr, length, std::move(fn)); } template void ConnectionBoilerplate::write( const void* ptr, size_t length, write_callback_fn fn) { if (unlikely(!impl_)) { // FIXME In C++-17 perhaps a global static inline variable would be better? 
static Error error = TP_CREATE_ERROR(ContextNotViableError); fn(error); return; } impl_->write(ptr, length, std::move(fn)); } template void ConnectionBoilerplate::write( const AbstractNopHolder& object, write_callback_fn fn) { if (unlikely(!impl_)) { // FIXME In C++-17 perhaps a global static inline variable would be better? static Error error = TP_CREATE_ERROR(ContextNotViableError); fn(error); return; } impl_->write(object, std::move(fn)); } template void ConnectionBoilerplate::setId(std::string id) { if (unlikely(!impl_)) { return; } impl_->setId(std::move(id)); } template void ConnectionBoilerplate::close() { if (unlikely(!impl_)) { return; } impl_->close(); } template ConnectionBoilerplate::~ConnectionBoilerplate() { close(); } } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/connection_impl_boilerplate.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace transport { template class ContextImplBoilerplate; template class ListenerImplBoilerplate; template class ConnectionImplBoilerplate : public std::enable_shared_from_this { public: class ConstructorToken { public: ConstructorToken(const ConstructorToken&) = default; private: explicit ConstructorToken() {} friend ContextImplBoilerplate; friend ListenerImplBoilerplate; }; ConnectionImplBoilerplate( ConstructorToken token, std::shared_ptr context, std::string id); ConnectionImplBoilerplate(const ConnectionImplBoilerplate&) = delete; ConnectionImplBoilerplate(ConnectionImplBoilerplate&&) = delete; ConnectionImplBoilerplate& operator=(const ConnectionImplBoilerplate&) = delete; ConnectionImplBoilerplate& operator=(ConnectionImplBoilerplate&&) = delete; // Initialize member fields that need `shared_from_this`. void init(); // Queue a read operation. using read_callback_fn = Connection::read_callback_fn; using read_nop_callback_fn = Connection::read_nop_callback_fn; void read(read_callback_fn fn); void read(AbstractNopHolder& object, read_nop_callback_fn fn); void read(void* ptr, size_t length, read_callback_fn fn); // Perform a write operation. using write_callback_fn = Connection::write_callback_fn; void write(const void* ptr, size_t length, write_callback_fn fn); void write(const AbstractNopHolder& object, write_callback_fn fn); // Tell the connection what its identifier is. void setId(std::string id); // Shut down the connection and its resources. 
void close(); virtual ~ConnectionImplBoilerplate() = default; protected: virtual void initImplFromLoop() = 0; virtual void readImplFromLoop(read_callback_fn fn) = 0; virtual void readImplFromLoop( AbstractNopHolder& object, read_nop_callback_fn fn); virtual void readImplFromLoop( void* ptr, size_t length, read_callback_fn fn) = 0; virtual void writeImplFromLoop( const void* ptr, size_t length, write_callback_fn fn) = 0; virtual void writeImplFromLoop( const AbstractNopHolder& object, write_callback_fn fn); virtual void handleErrorImpl() = 0; void setError(Error error); const std::shared_ptr context_; Error error_{Error::kSuccess}; // An identifier for the connection, composed of the identifier for the // context or listener, combined with an increasing sequence number. It will // only be used for logging and debugging purposes. std::string id_; private: // Initialize member fields that need `shared_from_this`. void initFromLoop(); // Queue a read operation. void readFromLoop(read_callback_fn fn); void readFromLoop(AbstractNopHolder& object, read_nop_callback_fn fn); void readFromLoop(void* ptr, size_t length, read_callback_fn fn); // Perform a write operation. void writeFromLoop(const void* ptr, size_t length, write_callback_fn fn); void writeFromLoop(const AbstractNopHolder& object, write_callback_fn fn); void setIdFromLoop(std::string id); // Shut down the connection and its resources. void closeFromLoop(); // Deal with an error. void handleError(); // A sequence number for the calls to read and write. uint64_t nextBufferBeingRead_{0}; uint64_t nextBufferBeingWritten_{0}; // Contexts and listeners do sometimes need to call directly into initFromLoop // and closeFromLoop, in order to make sure that some of their operations can // happen "atomically" on the connection, without possibly other operations // occurring in between (e.g., an error). 
friend ContextImplBoilerplate; friend ListenerImplBoilerplate; }; template ConnectionImplBoilerplate::ConnectionImplBoilerplate( ConstructorToken /* unused */, std::shared_ptr context, std::string id) : context_(std::move(context)), id_(std::move(id)) {} template void ConnectionImplBoilerplate::init() { context_->deferToLoop( [impl{this->shared_from_this()}]() { impl->initFromLoop(); }); } template void ConnectionImplBoilerplate::initFromLoop() { if (context_->closed()) { // Set the error without calling setError because we do not want to invoke // the subclass's handleErrorImpl as it would find itself in a weird state // (since initFromLoop wouldn't have been called). error_ = TP_CREATE_ERROR(ConnectionClosedError); TP_VLOG(7) << "Connection " << id_ << " is closing (without initing)"; return; } initImplFromLoop(); } template void ConnectionImplBoilerplate::read(read_callback_fn fn) { context_->deferToLoop( [impl{this->shared_from_this()}, fn{std::move(fn)}]() mutable { impl->readFromLoop(std::move(fn)); }); } template void ConnectionImplBoilerplate::readFromLoop( read_callback_fn fn) { TP_DCHECK(context_->inLoop()); uint64_t sequenceNumber = nextBufferBeingRead_++; TP_VLOG(7) << "Connection " << id_ << " received a read request (#" << sequenceNumber << ")"; fn = [this, sequenceNumber, fn{std::move(fn)}]( const Error& error, const void* ptr, size_t length) { TP_VLOG(7) << "Connection " << id_ << " is calling a read callback (#" << sequenceNumber << ")"; fn(error, ptr, length); TP_VLOG(7) << "Connection " << id_ << " done calling a read callback (#" << sequenceNumber << ")"; }; if (error_) { fn(error_, nullptr, 0); return; } readImplFromLoop(std::move(fn)); } template void ConnectionImplBoilerplate::read( AbstractNopHolder& object, read_nop_callback_fn fn) { context_->deferToLoop( [impl{this->shared_from_this()}, &object, fn{std::move(fn)}]() mutable { impl->readFromLoop(object, std::move(fn)); }); } template void ConnectionImplBoilerplate::readFromLoop( 
AbstractNopHolder& object, read_nop_callback_fn fn) { TP_DCHECK(context_->inLoop()); uint64_t sequenceNumber = nextBufferBeingRead_++; TP_VLOG(7) << "Connection " << id_ << " received a nop object read request (#" << sequenceNumber << ")"; fn = [this, sequenceNumber, fn{std::move(fn)}](const Error& error) { TP_VLOG(7) << "Connection " << id_ << " is calling a nop object read callback (#" << sequenceNumber << ")"; fn(error); TP_VLOG(7) << "Connection " << id_ << " done calling a nop object read callback (#" << sequenceNumber << ")"; }; if (error_) { fn(error_); return; } readImplFromLoop(object, std::move(fn)); } template void ConnectionImplBoilerplate::readImplFromLoop( AbstractNopHolder& object, read_nop_callback_fn fn) { readImplFromLoop([&object, fn{std::move(fn)}]( const Error& error, const void* ptr, size_t len) { if (!error) { NopReader reader(reinterpret_cast(ptr), len); nop::Status status = object.read(reader); TP_THROW_ASSERT_IF(status.has_error()) << "Error reading nop object: " << status.GetErrorMessage(); } fn(error); }); } template void ConnectionImplBoilerplate::read( void* ptr, size_t length, read_callback_fn fn) { context_->deferToLoop([impl{this->shared_from_this()}, ptr, length, fn{std::move(fn)}]() mutable { impl->readFromLoop(ptr, length, std::move(fn)); }); } template void ConnectionImplBoilerplate::readFromLoop( void* ptr, size_t length, read_callback_fn fn) { TP_DCHECK(context_->inLoop()); uint64_t sequenceNumber = nextBufferBeingRead_++; TP_VLOG(7) << "Connection " << id_ << " received a read request (#" << sequenceNumber << ")"; fn = [this, sequenceNumber, fn{std::move(fn)}]( const Error& error, const void* ptr, size_t length) { TP_VLOG(7) << "Connection " << id_ << " is calling a read callback (#" << sequenceNumber << ")"; fn(error, ptr, length); TP_VLOG(7) << "Connection " << id_ << " done calling a read callback (#" << sequenceNumber << ")"; }; if (error_) { fn(error_, ptr, length); return; } readImplFromLoop(ptr, length, std::move(fn)); 
} template void ConnectionImplBoilerplate::write( const void* ptr, size_t length, write_callback_fn fn) { context_->deferToLoop([impl{this->shared_from_this()}, ptr, length, fn{std::move(fn)}]() mutable { impl->writeFromLoop(ptr, length, std::move(fn)); }); } template void ConnectionImplBoilerplate::writeFromLoop( const void* ptr, size_t length, write_callback_fn fn) { TP_DCHECK(context_->inLoop()); uint64_t sequenceNumber = nextBufferBeingWritten_++; TP_VLOG(7) << "Connection " << id_ << " received a write request (#" << sequenceNumber << ")"; fn = [this, sequenceNumber, fn{std::move(fn)}](const Error& error) { TP_VLOG(7) << "Connection " << id_ << " is calling a write callback (#" << sequenceNumber << ")"; fn(error); TP_VLOG(7) << "Connection " << id_ << " done calling a write callback (#" << sequenceNumber << ")"; }; if (error_) { fn(error_); return; } writeImplFromLoop(ptr, length, std::move(fn)); } template void ConnectionImplBoilerplate::write( const AbstractNopHolder& object, write_callback_fn fn) { context_->deferToLoop( [impl{this->shared_from_this()}, &object, fn{std::move(fn)}]() mutable { impl->writeFromLoop(object, std::move(fn)); }); } template void ConnectionImplBoilerplate::writeFromLoop( const AbstractNopHolder& object, write_callback_fn fn) { TP_DCHECK(context_->inLoop()); uint64_t sequenceNumber = nextBufferBeingWritten_++; TP_VLOG(7) << "Connection " << id_ << " received a nop object write request (#" << sequenceNumber << ")"; fn = [this, sequenceNumber, fn{std::move(fn)}](const Error& error) { TP_VLOG(7) << "Connection " << id_ << " is calling a nop object write callback (#" << sequenceNumber << ")"; fn(error); TP_VLOG(7) << "Connection " << id_ << " done calling a nop object write callback (#" << sequenceNumber << ")"; }; if (error_) { fn(error_); return; } writeImplFromLoop(object, std::move(fn)); } template void ConnectionImplBoilerplate::writeImplFromLoop( const AbstractNopHolder& object, write_callback_fn fn) { const size_t len = 
object.getSize(); // Using a shared_ptr instead of unique_ptr because if the lambda captures a // unique_ptr then it becomes non-copyable, which prevents it from being // converted to a function. In C++20 use std::make_shared(len). // // Note: this is a std::shared_ptr semantically. A shared_ptr // with array type is supported in C++17 and higher. // auto buf = std::shared_ptr( new uint8_t[len], std::default_delete()); auto ptr = buf.get(); NopWriter writer(ptr, len); nop::Status status = object.write(writer); TP_THROW_ASSERT_IF(status.has_error()) << "Error writing nop object: " << status.GetErrorMessage(); // Perform write and forward callback. writeImplFromLoop( ptr, len, [buf{std::move(buf)}, fn{std::move(fn)}](const Error& error) mutable { // The write has completed; destroy write buffer. buf.reset(); fn(error); }); } template void ConnectionImplBoilerplate::setId(std::string id) { context_->deferToLoop( [impl{this->shared_from_this()}, id{std::move(id)}]() mutable { impl->setIdFromLoop(std::move(id)); }); } template void ConnectionImplBoilerplate::setIdFromLoop( std::string id) { TP_DCHECK(context_->inLoop()); TP_VLOG(7) << "Connection " << id_ << " was renamed to " << id; id_ = std::move(id); } template void ConnectionImplBoilerplate::close() { context_->deferToLoop( [impl{this->shared_from_this()}]() { impl->closeFromLoop(); }); } template void ConnectionImplBoilerplate::closeFromLoop() { TP_DCHECK(context_->inLoop()); TP_VLOG(7) << "Connection " << id_ << " is closing"; setError(TP_CREATE_ERROR(ConnectionClosedError)); } template void ConnectionImplBoilerplate::setError(Error error) { // Don't overwrite an error that's already set. 
if (error_ || !error) { return; } error_ = std::move(error); handleError(); } template void ConnectionImplBoilerplate::handleError() { TP_DCHECK(context_->inLoop()); TP_VLOG(8) << "Connection " << id_ << " is handling error " << error_.what(); handleErrorImpl(); } } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/context.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include namespace tensorpipe { namespace transport { class Connection; class Listener; class Context { public: virtual std::shared_ptr connect(std::string addr) = 0; virtual std::shared_ptr listen(std::string addr) = 0; // Return whether the context is able to operate correctly. // // Some transport types may be unable to perform as intended under // some circumstances (e.g., specialized hardware unavailable, lack // of permissions). They can report it through this method in order // for the core context to avoid registering them in the first place. // virtual bool isViable() const = 0; // Return string to describe the domain for this context. // // Two processes with a context of the same type can connect to each // other if one side's domain descriptor is "accepted" by the other // one, using the canCommunicateWithRemote method below. That method // must be symmetric, and unless overridden defaults to string // comparison. // // For example, for a transport that leverages TCP/IP, this may be // as simple as the address family (assuming we can route between // any two processes). For a transport that leverages shared memory, // this descriptor must uniquely identify the machine, such that // only co-located processes generate the same domain descriptor. 
// virtual const std::string& domainDescriptor() const = 0; // Compare local and remote domain descriptor for compatibility. // // Determine whether a connection can be opened between this context // and a remote one that has the given domain descriptor. This // function needs to be symmetric: if we called this method on the // remote context with the local descriptor we should get the same // answer. Unless overridden it defaults to string comparison. // virtual bool canCommunicateWithRemote( const std::string& remoteDomainDescriptor) const { return domainDescriptor() == remoteDomainDescriptor; } // Tell the context what its identifier is. // // This is only supposed to be called from the high-level context or from // channel contexts. It will only used for logging and debugging purposes. virtual void setId(std::string id) = 0; virtual void close() = 0; virtual void join() = 0; virtual ~Context() = default; }; } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/context_boilerplate.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include namespace tensorpipe { namespace transport { template class ContextBoilerplate : public Context { public: template explicit ContextBoilerplate(Args&&... 
args); ContextBoilerplate(const ContextBoilerplate&) = delete; ContextBoilerplate(ContextBoilerplate&&) = delete; ContextBoilerplate& operator=(const ContextBoilerplate&) = delete; ContextBoilerplate& operator=(ContextBoilerplate&&) = delete; std::shared_ptr connect(std::string addr) override; std::shared_ptr listen(std::string addr) override; bool isViable() const override; const std::string& domainDescriptor() const override; void setId(std::string id) override; void close() override; void join() override; ~ContextBoilerplate() override; protected: // The implementation is managed by a shared_ptr because each child object // will also hold a shared_ptr to it (downcast as a shared_ptr to the private // interface). However, its lifetime is tied to the one of this public object, // since when the latter is destroyed the implementation is closed and joined. const std::shared_ptr impl_; }; template template ContextBoilerplate::ContextBoilerplate(Args&&... args) : impl_(TCtx::create(std::forward(args)...)) { static_assert( std::is_base_of, TCtx>::value, ""); if (unlikely(!impl_)) { return; } impl_->init(); } template std::shared_ptr ContextBoilerplate::connect( std::string addr) { if (unlikely(!impl_)) { return std::make_shared>(nullptr); } return impl_->connect(std::move(addr)); } template std::shared_ptr ContextBoilerplate::listen( std::string addr) { if (unlikely(!impl_)) { return std::make_shared>(nullptr); } return impl_->listen(std::move(addr)); } template bool ContextBoilerplate::isViable() const { return impl_ != nullptr; } template const std::string& ContextBoilerplate::domainDescriptor() const { if (unlikely(!impl_)) { // FIXME In C++-17 perhaps a global static inline variable would be better? 
static std::string empty = ""; return empty; } return impl_->domainDescriptor(); } template void ContextBoilerplate::setId(std::string id) { if (unlikely(!impl_)) { return; } impl_->setId(std::move(id)); } template void ContextBoilerplate::close() { if (unlikely(!impl_)) { return; } impl_->close(); } template void ContextBoilerplate::join() { if (unlikely(!impl_)) { return; } impl_->join(); } template ContextBoilerplate::~ContextBoilerplate() { join(); } } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/context_impl_boilerplate.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace transport { template class ContextImplBoilerplate : public virtual DeferredExecutor, public std::enable_shared_from_this { public: explicit ContextImplBoilerplate(std::string domainDescriptor); ContextImplBoilerplate(const ContextImplBoilerplate&) = delete; ContextImplBoilerplate(ContextImplBoilerplate&&) = delete; ContextImplBoilerplate& operator=(const ContextImplBoilerplate&) = delete; ContextImplBoilerplate& operator=(ContextImplBoilerplate&&) = delete; void init(); std::shared_ptr connect(std::string addr); std::shared_ptr listen(std::string addr); const std::string& domainDescriptor() const; // Enrolling dependent objects (listeners and connections) causes them to be // kept alive for as long as the context exists. These objects should enroll // themselves as soon as they're created (in their initImplFromLoop method) // and unenroll themselves after they've completed handling an error (either // right in the handleErrorImpl method or in a subsequent callback). 
The // context, on the other hand, should avoid terminating (i.e., complete // joining) until all objects have unenrolled themselves. void enroll(TList& listener); void enroll(TConn& connection); void unenroll(TList& listener); void unenroll(TConn& connection); // Return whether the context is in a closed state. To avoid race conditions, // this must be called from within the loop. bool closed(); void setId(std::string id); void close(); void join(); virtual ~ContextImplBoilerplate() = default; protected: virtual void initImplFromLoop() {} virtual void handleErrorImpl() = 0; virtual void joinImpl() = 0; void setError(Error error); Error error_{Error::kSuccess}; // An identifier for the context, composed of the identifier for the context, // combined with the transport's name. It will only be used for logging and // debugging purposes. std::string id_{"N/A"}; CallbackWrapper callbackWrapper_{*this, *this}; private: void initFromLoop(); void closeFromLoop(); void handleError(); std::atomic joined_{false}; const std::string domainDescriptor_; // Sequence numbers for the listeners and connections created by this context, // used to create their identifiers based off this context's identifier. They // will only be used for logging and debugging. std::atomic listenerCounter_{0}; std::atomic connectionCounter_{0}; // Store shared_ptrs to dependent objects that have enrolled themselves to // keep them alive. We use a map, indexed by raw pointers, rather than a set // of shared_ptrs so that we can erase objects without them having to create // a fresh shared_ptr just for that. std::unordered_map> listeners_; std::unordered_map> connections_; // For some odd reason it seems we need to use a qualified name here... 
template friend class tensorpipe::CallbackWrapper; }; template ContextImplBoilerplate::ContextImplBoilerplate( std::string domainDescriptor) : domainDescriptor_(std::move(domainDescriptor)) {} template void ContextImplBoilerplate::init() { deferToLoop([this]() { initFromLoop(); }); } template void ContextImplBoilerplate::initFromLoop() { TP_DCHECK(inLoop()); TP_DCHECK(!error_); initImplFromLoop(); } template std::shared_ptr ContextImplBoilerplate::connect( std::string addr) { std::string connectionId = id_ + ".c" + std::to_string(connectionCounter_++); TP_VLOG(7) << "Transport context " << id_ << " is opening connection " << connectionId << " to address " << addr; return std::make_shared>( typename ConnectionImplBoilerplate:: ConstructorToken(), this->shared_from_this(), std::move(connectionId), std::move(addr)); } template std::shared_ptr ContextImplBoilerplate::listen( std::string addr) { std::string listenerId = id_ + ".l" + std::to_string(listenerCounter_++); TP_VLOG(7) << "Transport context " << id_ << " is opening listener " << listenerId << " on address " << addr; return std::make_shared>( typename ListenerImplBoilerplate::ConstructorToken(), this->shared_from_this(), std::move(listenerId), std::move(addr)); } template const std::string& ContextImplBoilerplate:: domainDescriptor() const { return domainDescriptor_; } template void ContextImplBoilerplate::enroll(TList& listener) { TP_DCHECK(inLoop()); bool wasInserted; std::tie(std::ignore, wasInserted) = listeners_.emplace(&listener, listener.shared_from_this()); TP_DCHECK(wasInserted); } template void ContextImplBoilerplate::enroll(TConn& connection) { TP_DCHECK(inLoop()); bool wasInserted; std::tie(std::ignore, wasInserted) = connections_.emplace(&connection, connection.shared_from_this()); TP_DCHECK(wasInserted); } template void ContextImplBoilerplate::unenroll(TList& listener) { TP_DCHECK(inLoop()); auto numRemoved = listeners_.erase(&listener); TP_DCHECK_EQ(numRemoved, 1); } template void 
ContextImplBoilerplate::unenroll(TConn& connection) { TP_DCHECK(inLoop()); auto numRemoved = connections_.erase(&connection); TP_DCHECK_EQ(numRemoved, 1); } template bool ContextImplBoilerplate::closed() { TP_DCHECK(inLoop()); return error_; }; template void ContextImplBoilerplate::setId(std::string id) { TP_VLOG(7) << "Transport context " << id_ << " was renamed to " << id; id_ = std::move(id); } template void ContextImplBoilerplate::close() { deferToLoop([this]() { closeFromLoop(); }); } template void ContextImplBoilerplate::closeFromLoop() { TP_DCHECK(inLoop()); TP_VLOG(7) << "Transport context " << id_ << " is closing"; setError(TP_CREATE_ERROR(ContextClosedError)); TP_VLOG(7) << "Transport context " << id_ << " done closing"; } template void ContextImplBoilerplate::setError(Error error) { // Don't overwrite an error that's already set. if (error_ || !error) { return; } error_ = std::move(error); handleError(); } template void ContextImplBoilerplate::handleError() { TP_DCHECK(inLoop()); TP_VLOG(8) << "Transport context " << id_ << " is handling error " << error_.what(); // Make a copy as they could unenroll themselves inline. auto listenersCopy = listeners_; auto connectionsCopy = connections_; // We call closeFromLoop, rather than just close, because we need these // objects to transition _immediately_ to error, "atomically". If we just // deferred closing to later, this could come after some already-enqueued // operations that could try to access the context, which would be closed, // and this could fail. for (auto& iter : listenersCopy) { iter.second->closeFromLoop(); } for (auto& iter : connectionsCopy) { iter.second->closeFromLoop(); } handleErrorImpl(); } template void ContextImplBoilerplate::join() { close(); if (!joined_.exchange(true)) { TP_VLOG(7) << "Transport context " << id_ << " is joining"; // As closing is deferred to the loop, we must wait for closeImpl to be // actually called before we call joinImpl, to avoid race conditions. 
For // this, we defer another task to the loop, which we know will run after the // closing, and then we wait for that task to be run. std::promise hasClosed; deferToLoop([&]() { hasClosed.set_value(); }); hasClosed.get_future().wait(); joinImpl(); TP_VLOG(7) << "Transport context " << id_ << " done joining"; TP_DCHECK(listeners_.empty()); TP_DCHECK(connections_.empty()); } } } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/error.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include namespace tensorpipe { namespace transport { std::string ContextClosedError::what() const { return "context closed"; } std::string ListenerClosedError::what() const { return "listener closed"; } std::string ConnectionClosedError::what() const { return "connection closed"; } std::string ContextNotViableError::what() const { return "context not viable"; } } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/error.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include namespace tensorpipe { namespace transport { class ContextClosedError final : public BaseError { public: ContextClosedError() {} std::string what() const override; }; class ListenerClosedError final : public BaseError { public: ListenerClosedError() {} std::string what() const override; }; class ConnectionClosedError final : public BaseError { public: ConnectionClosedError() {} std::string what() const override; }; class ContextNotViableError final : public BaseError { public: ContextNotViableError() {} std::string what() const override; }; } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/ibv/connection_impl.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace transport { namespace ibv { namespace { // The data that each queue pair endpoint needs to send to the other endpoint in // order to set up the queue pair itself. This data is transferred over a TCP // connection. 
struct Exchange { IbvSetupInformation setupInfo; uint64_t memoryRegionPtr; uint32_t memoryRegionKey; }; } // namespace ConnectionImpl::ConnectionImpl( ConstructorToken token, std::shared_ptr context, std::string id, Socket socket) : ConnectionImplBoilerplate( token, std::move(context), std::move(id)), socket_(std::move(socket)) {} ConnectionImpl::ConnectionImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::string addr) : ConnectionImplBoilerplate( token, std::move(context), std::move(id)), sockaddr_(Sockaddr::createInetSockAddr(addr)) {} void ConnectionImpl::initImplFromLoop() { context_->enroll(*this); Error error; // The connection either got a socket or an address, but not both. TP_DCHECK(socket_.hasValue() ^ sockaddr_.has_value()); if (!socket_.hasValue()) { std::tie(error, socket_) = Socket::createForFamily(sockaddr_->addr()->sa_family); if (error) { setError(std::move(error)); return; } error = socket_.reuseAddr(true); if (error) { setError(std::move(error)); return; } error = socket_.connect(sockaddr_.value()); if (error) { setError(std::move(error)); return; } } // Ensure underlying control socket is non-blocking such that it // works well with event driven I/O. error = socket_.block(false); if (error) { setError(std::move(error)); return; } // Create ringbuffer for inbox. std::tie(error, inboxBuf_) = MmappedPtr::create( kBufferSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1); TP_THROW_ASSERT_IF(error) << "Couldn't allocate ringbuffer for connection inbox: " << error.what(); inboxRb_ = RingBuffer(&inboxHeader_, inboxBuf_.ptr()); inboxMr_ = createIbvMemoryRegion( context_->getReactor().getIbvLib(), context_->getReactor().getIbvPd(), inboxBuf_.ptr(), kBufferSize, IbvLib::ACCESS_LOCAL_WRITE | IbvLib::ACCESS_REMOTE_WRITE); // Create ringbuffer for outbox. 
std::tie(error, outboxBuf_) = MmappedPtr::create( kBufferSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1); TP_THROW_ASSERT_IF(error) << "Couldn't allocate ringbuffer for connection outbox: " << error.what(); outboxRb_ = RingBuffer(&outboxHeader_, outboxBuf_.ptr()); outboxMr_ = createIbvMemoryRegion( context_->getReactor().getIbvLib(), context_->getReactor().getIbvPd(), outboxBuf_.ptr(), kBufferSize, 0); // Create and init queue pair. { IbvLib::qp_init_attr initAttr; std::memset(&initAttr, 0, sizeof(initAttr)); initAttr.qp_type = IbvLib::QPT_RC; initAttr.send_cq = context_->getReactor().getIbvCq().get(); initAttr.recv_cq = context_->getReactor().getIbvCq().get(); initAttr.cap.max_send_wr = kSendQueueSize; initAttr.cap.max_send_sge = 1; initAttr.srq = context_->getReactor().getIbvSrq().get(); initAttr.sq_sig_all = 1; qp_ = createIbvQueuePair( context_->getReactor().getIbvLib(), context_->getReactor().getIbvPd(), initAttr); } transitionIbvQueuePairToInit( context_->getReactor().getIbvLib(), qp_, context_->getReactor().getIbvAddress()); // Register methods to be called when our peer writes to our inbox and reads // from our outbox. context_->getReactor().registerQp(qp_->qp_num, shared_from_this()); // We're sending address first, so wait for writability. state_ = SEND_ADDR; context_->registerDescriptor(socket_.fd(), EPOLLOUT, shared_from_this()); } void ConnectionImpl::readImplFromLoop(read_callback_fn fn) { readOperations_.emplace_back(std::move(fn)); // If the inbox already contains some data, we may be able to process this // operation right away. processReadOperationsFromLoop(); } void ConnectionImpl::readImplFromLoop( AbstractNopHolder& object, read_nop_callback_fn fn) { readOperations_.emplace_back( &object, [fn{std::move(fn)}]( const Error& error, const void* /* unused */, size_t /* unused */) { fn(error); }); // If the inbox already contains some data, we may be able to process this // operation right away. 
processReadOperationsFromLoop(); } void ConnectionImpl::readImplFromLoop( void* ptr, size_t length, read_callback_fn fn) { readOperations_.emplace_back(ptr, length, std::move(fn)); // If the inbox already contains some data, we may be able to process this // operation right away. processReadOperationsFromLoop(); } void ConnectionImpl::writeImplFromLoop( const void* ptr, size_t length, write_callback_fn fn) { writeOperations_.emplace_back(ptr, length, std::move(fn)); // If the outbox has some free space, we may be able to process this operation // right away. processWriteOperationsFromLoop(); } void ConnectionImpl::writeImplFromLoop( const AbstractNopHolder& object, write_callback_fn fn) { writeOperations_.emplace_back(&object, std::move(fn)); // If the outbox has some free space, we may be able to process this operation // right away. processWriteOperationsFromLoop(); } void ConnectionImpl::handleEventsFromLoop(int events) { TP_DCHECK(context_->inLoop()); TP_VLOG(9) << "Connection " << id_ << " is handling an event on its socket (" << EpollLoop::formatEpollEvents(events) << ")"; // Handle only one of the events in the mask. Events on the control // file descriptor are rare enough for the cost of having epoll call // into this function multiple times to not matter. The benefit is // that every handler can close and unregister the control file // descriptor from the event loop, without worrying about the next // handler trying to do so as well. // In some cases the socket could be in a state where it's both in an error // state and readable/writable. If we checked for EPOLLIN or EPOLLOUT first // and then returned after handling them, we would keep doing so forever and // never reach the error handling. So we should keep the error check first. 
if (events & EPOLLERR) { int error; socklen_t errorlen = sizeof(error); int rv = getsockopt( socket_.fd(), SOL_SOCKET, SO_ERROR, reinterpret_cast(&error), &errorlen); if (rv == -1) { setError(TP_CREATE_ERROR(SystemError, "getsockopt", rv)); } else { setError(TP_CREATE_ERROR(SystemError, "async error on socket", error)); } return; } if (events & EPOLLIN) { handleEventInFromLoop(); return; } if (events & EPOLLOUT) { handleEventOutFromLoop(); return; } // Check for hangup last, as there could be cases where we get EPOLLHUP but // there's still data to be read from the socket, so we want to deal with that // before dealing with the hangup. if (events & EPOLLHUP) { setError(TP_CREATE_ERROR(EOFError)); return; } } void ConnectionImpl::handleEventInFromLoop() { TP_DCHECK(context_->inLoop()); if (state_ == RECV_ADDR) { struct Exchange ex; auto err = socket_.read(&ex, sizeof(ex)); // Crossing our fingers that the exchange information is small enough that // it can be read in a single chunk. if (err != sizeof(ex)) { setError(TP_CREATE_ERROR(ShortReadError, sizeof(ex), err)); return; } transitionIbvQueuePairToReadyToReceive( context_->getReactor().getIbvLib(), qp_, context_->getReactor().getIbvAddress(), ex.setupInfo); transitionIbvQueuePairToReadyToSend( context_->getReactor().getIbvLib(), qp_); peerInboxKey_ = ex.memoryRegionKey; peerInboxPtr_ = ex.memoryRegionPtr; // The connection is usable now. state_ = ESTABLISHED; processWriteOperationsFromLoop(); // Trigger read operations in case a pair of local read() and remote // write() happened before connection is established. Otherwise read() // callback would lose if it's the only read() request. processReadOperationsFromLoop(); return; } if (state_ == ESTABLISHED) { // We don't expect to read anything on this socket once the // connection has been established. If we do, assume it's a // zero-byte read indicating EOF. 
setError(TP_CREATE_ERROR(EOFError)); return; } TP_THROW_ASSERT() << "EPOLLIN event not handled in state " << state_; } void ConnectionImpl::handleEventOutFromLoop() { TP_DCHECK(context_->inLoop()); if (state_ == SEND_ADDR) { Exchange ex; ex.setupInfo = makeIbvSetupInformation(context_->getReactor().getIbvAddress(), qp_); ex.memoryRegionPtr = reinterpret_cast(inboxBuf_.ptr()); ex.memoryRegionKey = inboxMr_->rkey; auto err = socket_.write(reinterpret_cast(&ex), sizeof(ex)); // Crossing our fingers that the exchange information is small enough that // it can be written in a single chunk. if (err != sizeof(ex)) { setError(TP_CREATE_ERROR(ShortWriteError, sizeof(ex), err)); return; } // Sent our address. Wait for address from peer. state_ = RECV_ADDR; context_->registerDescriptor(socket_.fd(), EPOLLIN, shared_from_this()); return; } TP_THROW_ASSERT() << "EPOLLOUT event not handled in state " << state_; } void ConnectionImpl::processReadOperationsFromLoop() { TP_DCHECK(context_->inLoop()); // Process all read read operations that we can immediately serve, only // when connection is established. 
if (state_ != ESTABLISHED) { return; } // Serve read operations InboxConsumer inboxConsumer(inboxRb_); while (!readOperations_.empty()) { RingbufferReadOperation& readOperation = readOperations_.front(); ssize_t len = readOperation.handleRead(inboxConsumer); if (len > 0) { Reactor::AckInfo info; info.length = len; TP_VLOG(9) << "Connection " << id_ << " is posting a send request (acknowledging " << info.length << " bytes) on QP " << qp_->qp_num; context_->getReactor().postAck(qp_, info); numAcksInFlight_++; } if (readOperation.completed()) { readOperations_.pop_front(); } else { break; } } } void ConnectionImpl::processWriteOperationsFromLoop() { TP_DCHECK(context_->inLoop()); if (state_ != ESTABLISHED) { return; } OutboxProducer outboxProducer(outboxRb_); while (!writeOperations_.empty()) { RingbufferWriteOperation& writeOperation = writeOperations_.front(); ssize_t len = writeOperation.handleWrite(outboxProducer); if (len > 0) { ssize_t ret; OutboxIbvWriter outboxConsumer(outboxRb_); ret = outboxConsumer.startTx(); TP_THROW_SYSTEM_IF(ret < 0, -ret); ssize_t numBuffers; std::array buffers; std::tie(numBuffers, buffers) = outboxConsumer.accessContiguousInTx(len); TP_THROW_SYSTEM_IF(numBuffers < 0, -numBuffers); for (int bufferIdx = 0; bufferIdx < numBuffers; bufferIdx++) { Reactor::WriteInfo info; info.addr = buffers[bufferIdx].ptr; info.length = buffers[bufferIdx].len; info.lkey = outboxMr_->lkey; uint64_t peerInboxOffset = peerInboxHead_ & (kBufferSize - 1); peerInboxHead_ += buffers[bufferIdx].len; info.remoteAddr = peerInboxPtr_ + peerInboxOffset; info.rkey = peerInboxKey_; TP_VLOG(9) << "Connection " << id_ << " is posting a RDMA write request (transmitting " << info.length << " bytes) on QP " << qp_->qp_num; context_->getReactor().postWrite(qp_, info); numWritesInFlight_++; } ret = outboxConsumer.commitTx(); TP_THROW_SYSTEM_IF(ret < 0, -ret); } if (writeOperation.completed()) { writeOperations_.pop_front(); } else { break; } } } void 
ConnectionImpl::onRemoteProducedData(uint32_t length) { TP_DCHECK(context_->inLoop()); TP_VLOG(9) << "Connection " << id_ << " was signalled that " << length << " bytes were written to its inbox on QP " << qp_->qp_num; ssize_t ret; InboxIbvRecver inboxProducer(inboxRb_); ret = inboxProducer.startTx(); TP_THROW_SYSTEM_IF(ret < 0, -ret); ret = inboxProducer.incMarkerInTx(length); TP_THROW_SYSTEM_IF(ret < 0, -ret); ret = inboxProducer.commitTx(); TP_THROW_SYSTEM_IF(ret < 0, -ret); processReadOperationsFromLoop(); } void ConnectionImpl::onRemoteConsumedData(uint32_t length) { TP_DCHECK(context_->inLoop()); TP_VLOG(9) << "Connection " << id_ << " was signalled that " << length << " bytes were read from its outbox on QP " << qp_->qp_num; ssize_t ret; OutboxIbvAcker outboxConsumer(outboxRb_); ret = outboxConsumer.startTx(); TP_THROW_SYSTEM_IF(ret < 0, -ret); ret = outboxConsumer.incMarkerInTx(length); TP_THROW_SYSTEM_IF(ret < 0, -ret); ret = outboxConsumer.commitTx(); TP_THROW_SYSTEM_IF(ret < 0, -ret); processWriteOperationsFromLoop(); } void ConnectionImpl::onWriteCompleted() { TP_DCHECK(context_->inLoop()); TP_VLOG(9) << "Connection " << id_ << " done posting a RDMA write request on QP " << qp_->qp_num; numWritesInFlight_--; tryCleanup(); } void ConnectionImpl::onAckCompleted() { TP_DCHECK(context_->inLoop()); TP_VLOG(9) << "Connection " << id_ << " done posting a send request on QP " << qp_->qp_num; numAcksInFlight_--; tryCleanup(); } void ConnectionImpl::onError(IbvLib::wc_status status, uint64_t wrId) { TP_DCHECK(context_->inLoop()); setError(TP_CREATE_ERROR( IbvError, context_->getReactor().getIbvLib().wc_status_str(status))); if (wrId == kWriteRequestId) { onWriteCompleted(); } else if (wrId == kAckRequestId) { onAckCompleted(); } } void ConnectionImpl::handleErrorImpl() { for (auto& readOperation : readOperations_) { readOperation.handleError(error_); } readOperations_.clear(); for (auto& writeOperation : writeOperations_) { writeOperation.handleError(error_); } 
writeOperations_.clear(); transitionIbvQueuePairToError(context_->getReactor().getIbvLib(), qp_); tryCleanup(); if (socket_.hasValue()) { if (state_ > INITIALIZING) { context_->unregisterDescriptor(socket_.fd()); } socket_.reset(); } context_->unenroll(*this); } void ConnectionImpl::tryCleanup() { TP_DCHECK(context_->inLoop()); // Setting the queue pair to an error state will cause all its work requests // (both those that had started being served, and those that hadn't; including // those from a shared receive queue) to be flushed. We need to wait for the // completion events of all those requests to be retrieved from the completion // queue before we can destroy the queue pair. We can do so by deferring the // destruction to the loop, since the reactor will only proceed to invoke // deferred functions once it doesn't have any completion events to handle. // However the RDMA writes and the sends may be queued up inside the reactor // and thus may not have even been scheduled yet, so we explicitly wait for // them to complete. if (error_) { if (numWritesInFlight_ == 0 && numAcksInFlight_ == 0) { TP_VLOG(8) << "Connection " << id_ << " is ready to clean up"; context_->deferToLoop([impl{shared_from_this()}]() { impl->cleanup(); }); } else { TP_VLOG(9) << "Connection " << id_ << " cannot proceed to cleanup because it has " << numWritesInFlight_ << " pending RDMA write requests and " << numAcksInFlight_ << " pending send requests on QP " << qp_->qp_num; } } } void ConnectionImpl::cleanup() { TP_DCHECK(context_->inLoop()); TP_VLOG(8) << "Connection " << id_ << " is cleaning up"; context_->getReactor().unregisterQp(qp_->qp_num); qp_.reset(); inboxMr_.reset(); inboxBuf_.reset(); outboxMr_.reset(); outboxBuf_.reset(); } } // namespace ibv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/ibv/connection_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. 
and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace transport { namespace ibv { class ContextImpl; class ListenerImpl; class ConnectionImpl final : public ConnectionImplBoilerplate< ContextImpl, ListenerImpl, ConnectionImpl>, public EpollLoop::EventHandler, public IbvEventHandler { constexpr static size_t kBufferSize = 2 * 1024 * 1024; constexpr static int kNumOutboxRingbufferRoles = 3; using OutboxIbvAcker = RingBufferRole; using OutboxIbvWriter = RingBufferRole; using OutboxProducer = RingBufferRole; constexpr static int kNumInboxRingbufferRoles = 2; using InboxConsumer = RingBufferRole; using InboxIbvRecver = RingBufferRole; enum State { INITIALIZING = 1, SEND_ADDR, RECV_ADDR, ESTABLISHED, }; public: // Create a connection that is already connected (e.g. from a listener). ConnectionImpl( ConstructorToken token, std::shared_ptr context, std::string id, Socket socket); // Create a connection that connects to the specified address. ConnectionImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::string addr); // Implementation of EventHandler. void handleEventsFromLoop(int events) override; // Implementation of IbvEventHandler. void onRemoteProducedData(uint32_t length) override; void onRemoteConsumedData(uint32_t length) override; void onWriteCompleted() override; void onAckCompleted() override; void onError(IbvLib::wc_status status, uint64_t wrId) override; protected: // Implement the entry points called by ConnectionImplBoilerplate. 
void initImplFromLoop() override; void readImplFromLoop(read_callback_fn fn) override; void readImplFromLoop(AbstractNopHolder& object, read_nop_callback_fn fn) override; void readImplFromLoop(void* ptr, size_t length, read_callback_fn fn) override; void writeImplFromLoop(const void* ptr, size_t length, write_callback_fn fn) override; void writeImplFromLoop(const AbstractNopHolder& object, write_callback_fn fn) override; void handleErrorImpl() override; private: // Handle events of type EPOLLIN on the UNIX domain socket. // // The only data that is expected on that socket is the address and other // setup information for the other side's queue pair and inbox. void handleEventInFromLoop(); // Handle events of type EPOLLOUT on the UNIX domain socket. // // Once the socket is writable we send the address and other setup information // for this side's queue pair and inbox. void handleEventOutFromLoop(); State state_{INITIALIZING}; Socket socket_; optional sockaddr_; IbvQueuePair qp_; // Inbox. // Initialize header during construction because it isn't assignable. RingBufferHeader inboxHeader_{kBufferSize}; // Use mmapped memory so it's page-aligned (and, one day, to use huge pages). MmappedPtr inboxBuf_; RingBuffer inboxRb_; IbvMemoryRegion inboxMr_; // Outbox. // Initialize header during construction because it isn't assignable. RingBufferHeader outboxHeader_{kBufferSize}; // Use mmapped memory so it's page-aligned (and, one day, to use huge pages). MmappedPtr outboxBuf_; RingBuffer outboxRb_; IbvMemoryRegion outboxMr_; // Peer inbox key, pointer and head. uint32_t peerInboxKey_{0}; uint64_t peerInboxPtr_{0}; uint64_t peerInboxHead_{0}; // The connection performs two types of send requests: writing to the remote // inbox, or acknowledging a write into its own inbox. These send operations // could be delayed and stalled by the reactor as only a limited number of // work requests can be outstanding at the same time globally. 
Thus we keep // count of how many we have pending to make sure they have all completed or // flushed when we close, and that none is stuck in the pipeline. uint32_t numWritesInFlight_{0}; uint32_t numAcksInFlight_{0}; // Pending read operations. std::deque readOperations_; // Pending write operations. std::deque writeOperations_; // Process pending read operations if in an operational state. // // This may be triggered by the other side of the connection (by pushing this // side's inbox token to the reactor) when it has written some new data to its // outbox (which is this side's inbox). It is also called by this connection // when it moves into an established state or when a new read operation is // queued, in case data was already available before this connection was ready // to consume it. void processReadOperationsFromLoop(); // Process pending write operations if in an operational state. // // This may be triggered by the other side of the connection (by pushing this // side's outbox token to the reactor) when it has read some data from its // inbox (which is this side's outbox). This is important when some of this // side's writes couldn't complete because the outbox was full, and thus they // needed to wait for some of its data to be read. This method is also called // by this connection when it moves into an established state, in case some // writes were queued before the connection was ready to process them, or when // a new write operation is queued. void processWriteOperationsFromLoop(); void tryCleanup(); void cleanup(); }; } // namespace ibv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/ibv/constants.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include namespace { // We should probably allow these to be user-configured. But, for now, we'll set // them to the lowest value they can have, the rationale being that this way // they will always be valid. constexpr uint8_t kPortNum = 1; constexpr uint8_t kGlobalIdentifierIndex = 0; // FIXME Instead of hardcoding the next three values, we could use // ibv_query_device to obtain max_cqe, max_qp_wr and max_srq_wr and deduce from // them the maximum allowed values for these parameters. // How many simultaneous receive requests to keep queued on the shared receive // queue. Incoming RDMA writes and sends will consume one such request. The // reactor loop will fill the SRQ back up to this value once some requests // complete. So this number should just be large enough to accommodate all the // requests that could finish between two reactor loop iterations. And, even if // this number ends up being too low, the excess incoming requests will just // retry, causing a performance penalty but not a failure. constexpr uint32_t kNumPendingRecvReqs = 1024; // How many RDMA write requests can be pending at the same time across all // connections. We need to put a limit on them because they all use the same // global completion queue which has a fixed capacity and if it overruns it will // enter an unrecoverable error state. This value is also set as the capacity of // the send queue of each queue pair. constexpr uint32_t kNumPendingWriteReqs = 1024; // How many send requests (used by the receiver to acknowledge the RDMA writes // from the sender) can be pending at the same time across all connections. constexpr uint32_t kNumPendingAckReqs = 1024; // How many elements the completion queue should be able to hold. These elements // will be either the completed receive requests of the SRQ, or the completed // send requests from a connection's queue pair. We can bound the former value // but not the latter, so we try to add some margin. 
constexpr int kCompletionQueueSize = kNumPendingRecvReqs + kNumPendingWriteReqs + kNumPendingAckReqs; // How many pending outgoing work requests each send queue should be able to // hold. The operations we post on a send queue are the RDMA_WRITEs to send // outgoing data and the SENDs to acknowledge incoming data, hence we size the // queue to the sum of the maximum amount of these two ops. constexpr int kSendQueueSize = kNumPendingWriteReqs + kNumPendingAckReqs; // How many work completions to poll from the completion queue at each reactor // iteration. constexpr int kNumPolledWorkCompletions = 32; // When the connection gets closed, to avoid leaks, it needs to "reclaim" all // the work requests that it had posted, by waiting for their completion. They // may however complete with error, which makes it harder to identify and // distinguish them from failing incoming requests because, in principle, we // cannot access the opcode field of a failed work completion. Therefore, we // assign a special ID to those types of requests, to match them later on. constexpr uint64_t kWriteRequestId = 1; constexpr uint64_t kAckRequestId = 2; } // namespace ================================================ FILE: tensorpipe/transport/ibv/context_impl.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include namespace tensorpipe { namespace transport { namespace ibv { namespace { // Prepend descriptor with transport name so it's easy to // disambiguate descriptors when debugging. const std::string kDomainDescriptorPrefix{"ibv:"}; std::string generateDomainDescriptor() { // It would be very cool if we could somehow obtain an "identifier" for the // InfiniBand subnet that our device belongs to, but nothing of that sort // seems to be available. 
So instead we say that if the user is trying to // connect two processes which both have access to an InfiniBand device then // they must know what they are doing and probably must have set up things // properly. return kDomainDescriptorPrefix + "*"; } } // namespace std::shared_ptr ContextImpl::create() { Error error; IbvLib ibvLib; std::tie(error, ibvLib) = IbvLib::create(); if (error) { TP_VLOG(7) << "IBV transport is not viable because libibverbs couldn't be loaded: " << error.what(); return nullptr; } IbvDeviceList deviceList; std::tie(error, deviceList) = IbvDeviceList::create(ibvLib); if (error && error.isOfType() && error.castToType()->errorCode() == ENOSYS) { TP_VLOG(7) << "IBV transport is not viable because it couldn't get list of " << "InfiniBand devices because the kernel module isn't loaded"; return nullptr; } TP_THROW_ASSERT_IF(error) << "Couldn't get list of InfiniBand devices: " << error.what(); if (deviceList.size() == 0) { TP_VLOG(7) << "IBV transport is not viable because it couldn't find any " << "InfiniBand NICs"; return nullptr; } return std::make_shared( std::move(ibvLib), std::move(deviceList)); } ContextImpl::ContextImpl(IbvLib ibvLib, IbvDeviceList deviceList) : ContextImplBoilerplate( generateDomainDescriptor()), reactor_(std::move(ibvLib), std::move(deviceList)) {} void ContextImpl::handleErrorImpl() { loop_.close(); reactor_.close(); } void ContextImpl::joinImpl() { loop_.join(); reactor_.join(); } bool ContextImpl::inLoop() const { return reactor_.inLoop(); }; void ContextImpl::deferToLoop(std::function fn) { reactor_.deferToLoop(std::move(fn)); }; void ContextImpl::registerDescriptor( int fd, int events, std::shared_ptr h) { loop_.registerDescriptor(fd, events, std::move(h)); } void ContextImpl::unregisterDescriptor(int fd) { loop_.unregisterDescriptor(fd); } Reactor& ContextImpl::getReactor() { return reactor_; } } // namespace ibv } // namespace transport } // namespace tensorpipe ================================================ 
FILE: tensorpipe/transport/ibv/context_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include namespace tensorpipe { namespace transport { namespace ibv { class ConnectionImpl; class ListenerImpl; class ContextImpl final : public ContextImplBoilerplate { public: static std::shared_ptr create(); ContextImpl(IbvLib ibvLib, IbvDeviceList deviceList); // Implement the DeferredExecutor interface. bool inLoop() const override; void deferToLoop(std::function fn) override; void registerDescriptor( int fd, int events, std::shared_ptr h); void unregisterDescriptor(int fd); Reactor& getReactor(); protected: // Implement the entry points called by ContextImplBoilerplate. void handleErrorImpl() override; void joinImpl() override; private: Reactor reactor_; EpollLoop loop_{this->reactor_}; }; } // namespace ibv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/ibv/error.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #include namespace tensorpipe { namespace transport { namespace ibv { std::string IbvError::what() const { return error_; } std::string GetaddrinfoError::what() const { std::ostringstream ss; ss << "getaddrinfo: " << gai_strerror(error_); return ss.str(); } std::string NoAddrFoundError::what() const { return "no address found"; } } // namespace ibv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/ibv/error.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include namespace tensorpipe { namespace transport { namespace ibv { class IbvError final : public BaseError { public: explicit IbvError(std::string error) : error_(error) {} std::string what() const override; private: std::string error_; }; class GetaddrinfoError final : public BaseError { public: explicit GetaddrinfoError(int error) : error_(error) {} std::string what() const override; private: int error_; }; class NoAddrFoundError final : public BaseError { public: NoAddrFoundError() {} std::string what() const override; }; } // namespace ibv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/ibv/factory.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #include #include namespace tensorpipe { namespace transport { namespace ibv { std::shared_ptr create() { return std::make_shared< ContextBoilerplate>(); } } // namespace ibv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/ibv/factory.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include namespace tensorpipe { namespace transport { namespace ibv { std::shared_ptr create(); } // namespace ibv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/ibv/listener_impl.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace transport { namespace ibv { ListenerImpl::ListenerImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::string addr) : ListenerImplBoilerplate( token, std::move(context), std::move(id)), sockaddr_(Sockaddr::createInetSockAddr(addr)) {} void ListenerImpl::initImplFromLoop() { context_->enroll(*this); Error error; TP_DCHECK(!socket_.hasValue()); std::tie(error, socket_) = Socket::createForFamily(sockaddr_.addr()->sa_family); if (error) { setError(std::move(error)); return; } error = socket_.reuseAddr(true); if (error) { setError(std::move(error)); return; } error = socket_.bind(sockaddr_); if (error) { setError(std::move(error)); return; } error = socket_.block(false); if (error) { setError(std::move(error)); return; } error = socket_.listen(128); if (error) { setError(std::move(error)); return; } struct sockaddr_storage addr; socklen_t addrlen; std::tie(error, addr, addrlen) = socket_.getSockName(); if (error) { setError(std::move(error)); return; } sockaddr_ = Sockaddr(reinterpret_cast(&addr), addrlen); } void ListenerImpl::handleErrorImpl() { if (!fns_.empty()) { context_->unregisterDescriptor(socket_.fd()); } socket_.reset(); for (auto& fn : fns_) { fn(error_, std::shared_ptr()); } fns_.clear(); context_->unenroll(*this); } void ListenerImpl::acceptImplFromLoop(accept_callback_fn fn) { fns_.push_back(std::move(fn)); // Only register if we go from 0 to 1 pending callbacks. In other cases we // already had a pending callback and thus we were already registered. if (fns_.size() == 1) { // Register with loop for readability events. 
context_->registerDescriptor(socket_.fd(), EPOLLIN, shared_from_this()); } } std::string ListenerImpl::addrImplFromLoop() const { return sockaddr_.str(); } void ListenerImpl::handleEventsFromLoop(int events) { TP_DCHECK(context_->inLoop()); TP_VLOG(9) << "Listener " << id_ << " is handling an event on its socket (" << EpollLoop::formatEpollEvents(events) << ")"; if (events & EPOLLERR) { int error; socklen_t errorlen = sizeof(error); int rv = getsockopt( socket_.fd(), SOL_SOCKET, SO_ERROR, reinterpret_cast(&error), &errorlen); if (rv == -1) { setError(TP_CREATE_ERROR(SystemError, "getsockopt", rv)); } else { setError(TP_CREATE_ERROR(SystemError, "async error on socket", error)); } return; } if (events & EPOLLHUP) { setError(TP_CREATE_ERROR(EOFError)); return; } TP_ARG_CHECK_EQ(events, EPOLLIN); Error error; Socket socket; std::tie(error, socket) = socket_.accept(); if (error) { setError(std::move(error)); return; } TP_DCHECK(!fns_.empty()) << "when the callback is disarmed the listener's descriptor is supposed " << "to be unregistered"; auto fn = std::move(fns_.front()); fns_.pop_front(); if (fns_.empty()) { context_->unregisterDescriptor(socket_.fd()); } fn(Error::kSuccess, createAndInitConnection(std::move(socket))); } } // namespace ibv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/ibv/listener_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace transport { namespace ibv { class ConnectionImpl; class ContextImpl; class ListenerImpl final : public ListenerImplBoilerplate, public EpollLoop::EventHandler { public: // Create a listener that listens on the specified address. ListenerImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::string addr); // Implementation of EventHandler. void handleEventsFromLoop(int events) override; protected: // Implement the entry points called by ListenerImplBoilerplate. void initImplFromLoop() override; void acceptImplFromLoop(accept_callback_fn fn) override; std::string addrImplFromLoop() const override; void handleErrorImpl() override; private: Socket socket_; Sockaddr sockaddr_; std::deque fns_; }; } // namespace ibv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/ibv/reactor.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include namespace tensorpipe { namespace transport { namespace ibv { Reactor::Reactor(IbvLib ibvLib, IbvDeviceList deviceList) : ibvLib_(std::move(ibvLib)) { TP_DCHECK_GE(deviceList.size(), 1); ctx_ = createIbvContext(getIbvLib(), deviceList[0]); pd_ = createIbvProtectionDomain(getIbvLib(), ctx_); cq_ = createIbvCompletionQueue( getIbvLib(), ctx_, kCompletionQueueSize, /*cq_context=*/nullptr, /*channel=*/nullptr, /*comp_vector=*/0); IbvLib::srq_init_attr srqInitAttr; std::memset(&srqInitAttr, 0, sizeof(srqInitAttr)); srqInitAttr.attr.max_wr = kNumPendingRecvReqs; srq_ = createIbvSharedReceiveQueue(getIbvLib(), pd_, srqInitAttr); addr_ = makeIbvAddress(getIbvLib(), ctx_, kPortNum, kGlobalIdentifierIndex); postRecvRequestsOnSRQ(kNumPendingRecvReqs); startThread("TP_IBV_reactor"); } void Reactor::postRecvRequestsOnSRQ(int num) { while (num > 0) { IbvLib::recv_wr* badRecvWr = nullptr; std::array wrs; std::memset(wrs.data(), 0, sizeof(wrs)); for (int i = 0; i < std::min(num, kNumPolledWorkCompletions) - 1; i++) { wrs[i].next = &wrs[i + 1]; } int rv = getIbvLib().post_srq_recv(srq_.get(), wrs.data(), &badRecvWr); TP_THROW_SYSTEM_IF(rv != 0, errno); TP_THROW_ASSERT_IF(badRecvWr != nullptr); num -= std::min(num, kNumPolledWorkCompletions); } } void Reactor::setId(std::string id) { id_ = std::move(id); } void Reactor::close() { if (!closed_.exchange(true)) { stopBusyPolling(); } } void Reactor::join() { close(); if (!joined_.exchange(true)) { joinThread(); } } Reactor::~Reactor() { join(); } bool Reactor::pollOnce() { std::array wcs; auto rv = getIbvLib().poll_cq(cq_.get(), wcs.size(), wcs.data()); if (rv == 0) { return false; } TP_THROW_SYSTEM_IF(rv < 0, errno); int numRecvs = 0; int numWrites = 0; int numAcks = 0; for (int wcIdx = 0; wcIdx < rv; wcIdx++) { IbvLib::wc& wc = wcs[wcIdx]; TP_VLOG(9) << "Transport context " << id_ << " got work completion for request " << wc.wr_id << " for QP " << wc.qp_num << " with status " << 
getIbvLib().wc_status_str(wc.status) << " and opcode " << ibvWorkCompletionOpcodeToStr(wc.opcode) << " (byte length: " << wc.byte_len << ", immediate data: " << wc.imm_data << ")"; auto iter = queuePairEventHandler_.find(wc.qp_num); TP_THROW_ASSERT_IF(iter == queuePairEventHandler_.end()) << "Got work completion for unknown queue pair " << wc.qp_num; if (wc.status != IbvLib::WC_SUCCESS) { iter->second->onError(wc.status, wc.wr_id); continue; } switch (wc.opcode) { case IbvLib::WC_RECV_RDMA_WITH_IMM: TP_THROW_ASSERT_IF(!(wc.wc_flags & IbvLib::WC_WITH_IMM)); iter->second->onRemoteProducedData(wc.imm_data); numRecvs++; break; case IbvLib::WC_RECV: TP_THROW_ASSERT_IF(!(wc.wc_flags & IbvLib::WC_WITH_IMM)); iter->second->onRemoteConsumedData(wc.imm_data); numRecvs++; break; case IbvLib::WC_RDMA_WRITE: iter->second->onWriteCompleted(); numWrites++; break; case IbvLib::WC_SEND: iter->second->onAckCompleted(); numAcks++; break; default: TP_THROW_ASSERT() << "Unknown opcode: " << wc.opcode; } } postRecvRequestsOnSRQ(numRecvs); numAvailableWrites_ += numWrites; while (!pendingQpWrites_.empty() && numAvailableWrites_ > 0) { postWrite( std::get<0>(pendingQpWrites_.front()), std::get<1>(pendingQpWrites_.front())); pendingQpWrites_.pop_front(); } numAvailableAcks_ += numAcks; while (!pendingQpAcks_.empty() && numAvailableAcks_ > 0) { postAck( std::get<0>(pendingQpAcks_.front()), std::get<1>(pendingQpAcks_.front())); pendingQpAcks_.pop_front(); } return true; } bool Reactor::readyToClose() { return queuePairEventHandler_.size() == 0; } void Reactor::registerQp( uint32_t qpn, std::shared_ptr eventHandler) { queuePairEventHandler_.emplace(qpn, std::move(eventHandler)); } void Reactor::unregisterQp(uint32_t qpn) { queuePairEventHandler_.erase(qpn); } void Reactor::postWrite(IbvQueuePair& qp, WriteInfo info) { if (numAvailableWrites_ > 0) { IbvLib::sge list; list.addr = reinterpret_cast(info.addr); list.length = info.length; list.lkey = info.lkey; IbvLib::send_wr wr; std::memset(&wr, 
0, sizeof(wr)); wr.wr_id = kWriteRequestId; wr.sg_list = &list; wr.num_sge = 1; wr.opcode = IbvLib::WR_RDMA_WRITE_WITH_IMM; wr.imm_data = info.length; wr.wr.rdma.remote_addr = info.remoteAddr; wr.wr.rdma.rkey = info.rkey; IbvLib::send_wr* badWr = nullptr; TP_VLOG(9) << "Transport context " << id_ << " posting RDMA write for QP " << qp->qp_num; TP_CHECK_IBV_INT(getIbvLib().post_send(qp.get(), &wr, &badWr)); TP_THROW_ASSERT_IF(badWr != nullptr); numAvailableWrites_--; } else { TP_VLOG(9) << "Transport context " << id_ << " queueing up RDMA write for QP " << qp->qp_num; pendingQpWrites_.emplace_back(qp, info); } } void Reactor::postAck(IbvQueuePair& qp, AckInfo info) { if (numAvailableAcks_ > 0) { IbvLib::send_wr wr; std::memset(&wr, 0, sizeof(wr)); wr.wr_id = kAckRequestId; wr.opcode = IbvLib::WR_SEND_WITH_IMM; wr.imm_data = info.length; IbvLib::send_wr* badWr = nullptr; TP_VLOG(9) << "Transport context " << id_ << " posting send for QP " << qp->qp_num; TP_CHECK_IBV_INT(getIbvLib().post_send(qp.get(), &wr, &badWr)); TP_THROW_ASSERT_IF(badWr != nullptr); numAvailableAcks_--; } else { TP_VLOG(9) << "Transport context " << id_ << " queueing send for QP " << qp->qp_num; pendingQpAcks_.emplace_back(qp, info); } } } // namespace ibv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/ibv/reactor.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace transport { namespace ibv { class IbvEventHandler { public: virtual void onRemoteProducedData(uint32_t length) = 0; virtual void onRemoteConsumedData(uint32_t length) = 0; virtual void onWriteCompleted() = 0; virtual void onAckCompleted() = 0; virtual void onError(IbvLib::wc_status status, uint64_t wrId) = 0; virtual ~IbvEventHandler() = default; }; // Reactor loop. // // Companion class to the event loop in `loop.h` that executes // functions on triggers. The triggers are posted to a shared memory // ring buffer, so this can be done by other processes on the same // machine. It uses extra data in the ring buffer header to store a // mutex and condition variable to avoid a busy loop. // class Reactor final : public BusyPollingLoop { public: Reactor(IbvLib ibvLib, IbvDeviceList deviceList); const IbvLib& getIbvLib() { return ibvLib_; } IbvProtectionDomain& getIbvPd() { return pd_; } IbvCompletionQueue& getIbvCq() { return cq_; } IbvSharedReceiveQueue& getIbvSrq() { return srq_; } const IbvAddress& getIbvAddress() { return addr_; } void registerQp(uint32_t qpn, std::shared_ptr eventHandler); void unregisterQp(uint32_t qpn); struct WriteInfo { void* addr; size_t length; uint32_t lkey; uint64_t remoteAddr; uint32_t rkey; }; void postWrite(IbvQueuePair& qp, WriteInfo info); struct AckInfo { size_t length; }; void postAck(IbvQueuePair& qp, AckInfo info); void setId(std::string id); void close(); void join(); ~Reactor(); protected: bool pollOnce() override; bool readyToClose() override; private: // InfiniBand stuff const IbvLib ibvLib_; IbvContext ctx_; IbvProtectionDomain pd_; IbvCompletionQueue cq_; IbvSharedReceiveQueue srq_; IbvAddress addr_; void postRecvRequestsOnSRQ(int num); std::atomic closed_{false}; std::atomic joined_{false}; // An identifier for the context, composed of the identifier 
for the context, // combined with the transport's name. It will only be used for logging and // debugging purposes. std::string id_{"N/A"}; // The registered event handlers for each queue pair. std::unordered_map> queuePairEventHandler_; uint32_t numAvailableWrites_{kNumPendingWriteReqs}; uint32_t numAvailableAcks_{kNumPendingAckReqs}; std::deque> pendingQpWrites_; std::deque> pendingQpAcks_; }; } // namespace ibv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/ibv/sockaddr.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include #include #include #include namespace tensorpipe { namespace transport { namespace ibv { Sockaddr Sockaddr::createInetSockAddr(const std::string& str) { int port = 0; std::string addrStr; std::string portStr; // If the input string is an IPv6 address with port, the address // itself must be wrapped with brackets. if (addrStr.empty()) { auto start = str.find("["); auto stop = str.find("]"); if (start < stop && start != std::string::npos && stop != std::string::npos) { addrStr = str.substr(start + 1, stop - (start + 1)); if (stop + 1 < str.size() && str[stop + 1] == ':') { portStr = str.substr(stop + 2); } } } // If the input string is an IPv4 address with port, we expect // at least a single period and a single colon in the string. if (addrStr.empty()) { auto period = str.find("."); auto colon = str.find(":"); if (period != std::string::npos && colon != std::string::npos) { addrStr = str.substr(0, colon); portStr = str.substr(colon + 1); } } // Fallback to using entire input string as address without port. if (addrStr.empty()) { addrStr = str; } // Parse port number if specified. 
if (!portStr.empty()) { port = std::stoi(portStr); if (port < 0 || port > std::numeric_limits::max()) { TP_THROW_EINVAL() << str; } } // Try to convert an IPv4 address. { struct sockaddr_in addr; std::memset(&addr, 0, sizeof(addr)); auto rv = inet_pton(AF_INET, addrStr.c_str(), &addr.sin_addr); TP_THROW_SYSTEM_IF(rv < 0, errno); if (rv == 1) { addr.sin_family = AF_INET; addr.sin_port = ntohs(port); return Sockaddr(reinterpret_cast(&addr), sizeof(addr)); } } // Try to convert an IPv6 address. { struct sockaddr_in6 addr; std::memset(&addr, 0, sizeof(addr)); auto interfacePos = addrStr.find('%'); if (interfacePos != std::string::npos) { addr.sin6_scope_id = if_nametoindex(addrStr.substr(interfacePos + 1).c_str()); addrStr = addrStr.substr(0, interfacePos); } auto rv = inet_pton(AF_INET6, addrStr.c_str(), &addr.sin6_addr); TP_THROW_SYSTEM_IF(rv < 0, errno); if (rv == 1) { addr.sin6_family = AF_INET6; addr.sin6_port = ntohs(port); return Sockaddr(reinterpret_cast(&addr), sizeof(addr)); } } // Invalid address. TP_THROW_EINVAL() << str; // Return bogus to silence "return from non-void function" warning. // Note: we don't reach this point per the throw above. 
return Sockaddr(nullptr, 0); } std::string Sockaddr::str() const { std::ostringstream oss; if (addr_.ss_family == AF_INET) { std::array buf; auto in = reinterpret_cast(&addr_); auto rv = inet_ntop(AF_INET, &in->sin_addr, buf.data(), buf.size()); TP_THROW_SYSTEM_IF(rv == nullptr, errno); oss << buf.data() << ":" << htons(in->sin_port); } else if (addr_.ss_family == AF_INET6) { std::array buf; auto in6 = reinterpret_cast(&addr_); auto rv = inet_ntop(AF_INET6, &in6->sin6_addr, buf.data(), buf.size()); TP_THROW_SYSTEM_IF(rv == nullptr, errno); oss << "[" << buf.data(); if (in6->sin6_scope_id > 0) { std::array scopeBuf; rv = if_indextoname(in6->sin6_scope_id, scopeBuf.data()); TP_THROW_SYSTEM_IF(rv == nullptr, errno); oss << "%" << scopeBuf.data(); } oss << "]:" << htons(in6->sin6_port); } else { TP_THROW_EINVAL() << "invalid address family: " << addr_.ss_family; } return oss.str(); } } // namespace ibv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/ibv/sockaddr.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include namespace tensorpipe { namespace transport { namespace ibv { class Sockaddr final : public tensorpipe::Sockaddr { public: static Sockaddr createInetSockAddr(const std::string& str); Sockaddr(const struct sockaddr* addr, socklen_t addrlen) { TP_ARG_CHECK(addr != nullptr); TP_ARG_CHECK_LE(addrlen, sizeof(addr_)); // Ensure the sockaddr_storage is zeroed, because we don't always // write to all fields in the `sockaddr_[in|in6]` structures. 
std::memset(&addr_, 0, sizeof(addr_)); std::memcpy(&addr_, addr, addrlen); addrlen_ = addrlen; } inline const struct sockaddr* addr() const override { return reinterpret_cast(&addr_); } inline struct sockaddr* addr() { return reinterpret_cast(&addr_); } inline socklen_t addrlen() const override { return addrlen_; } std::string str() const; private: struct sockaddr_storage addr_; socklen_t addrlen_; }; } // namespace ibv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/ibv/utility.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace transport { namespace ibv { namespace { struct InterfaceAddressesDeleter { void operator()(struct ifaddrs* ptr) { ::freeifaddrs(ptr); } }; using InterfaceAddresses = std::unique_ptr; std::tuple createInterfaceAddresses() { struct ifaddrs* ifaddrs; auto rv = ::getifaddrs(&ifaddrs); if (rv < 0) { return std::make_tuple( TP_CREATE_ERROR(SystemError, "getifaddrs", errno), InterfaceAddresses()); } return std::make_tuple(Error::kSuccess, InterfaceAddresses(ifaddrs)); } std::tuple getHostname() { std::array hostname; auto rv = ::gethostname(hostname.data(), hostname.size()); if (rv < 0) { return std::make_tuple( TP_CREATE_ERROR(SystemError, "gethostname", errno), std::string()); } return std::make_tuple(Error::kSuccess, std::string(hostname.data())); } struct AddressInfoDeleter { void operator()(struct addrinfo* ptr) { ::freeaddrinfo(ptr); } }; using AddressInfo = std::unique_ptr; std::tuple createAddressInfo(std::string host) { struct addrinfo hints; 
std::memset(&hints, 0, sizeof(hints)); hints.ai_family = AF_UNSPEC; hints.ai_socktype = SOCK_STREAM; hints.ai_protocol = IPPROTO_TCP; struct addrinfo* result; auto rv = ::getaddrinfo(host.c_str(), nullptr, &hints, &result); if (rv != 0) { return std::make_tuple( TP_CREATE_ERROR(GetaddrinfoError, rv), AddressInfo()); } return std::make_tuple(Error::kSuccess, AddressInfo(result)); } } // namespace std::tuple lookupAddrForIface(std::string iface) { Error error; InterfaceAddresses addresses; std::tie(error, addresses) = createInterfaceAddresses(); if (error) { return std::make_tuple(std::move(error), std::string()); } struct ifaddrs* ifa; for (ifa = addresses.get(); ifa != nullptr; ifa = ifa->ifa_next) { // Skip entry if ifa_addr is NULL (see getifaddrs(3)) if (ifa->ifa_addr == nullptr) { continue; } if (iface != ifa->ifa_name) { continue; } switch (ifa->ifa_addr->sa_family) { case AF_INET: return std::make_tuple( Error::kSuccess, Sockaddr(ifa->ifa_addr, sizeof(struct sockaddr_in)).str()); case AF_INET6: return std::make_tuple( Error::kSuccess, Sockaddr(ifa->ifa_addr, sizeof(struct sockaddr_in6)).str()); } } return std::make_tuple(TP_CREATE_ERROR(NoAddrFoundError), std::string()); } std::tuple lookupAddrForHostname() { Error error; std::string hostname; std::tie(error, hostname) = getHostname(); if (error) { return std::make_tuple(std::move(error), std::string()); } AddressInfo info; std::tie(error, info) = createAddressInfo(std::move(hostname)); if (error) { return std::make_tuple(std::move(error), std::string()); } Error firstError; for (struct addrinfo* rp = info.get(); rp != nullptr; rp = rp->ai_next) { TP_DCHECK(rp->ai_family == AF_INET || rp->ai_family == AF_INET6); TP_DCHECK_EQ(rp->ai_socktype, SOCK_STREAM); TP_DCHECK_EQ(rp->ai_protocol, IPPROTO_TCP); Sockaddr addr = Sockaddr(rp->ai_addr, rp->ai_addrlen); Socket socket; std::tie(error, socket) = Socket::createForFamily(rp->ai_family); if (!error) { error = socket.bind(addr); } if (error) { // Record the first 
binding error we encounter and return that in the end // if no working address is found, in order to help with debugging. if (!firstError) { firstError = error; } continue; } return std::make_tuple(Error::kSuccess, addr.str()); } if (firstError) { return std::make_tuple(std::move(firstError), std::string()); } else { return std::make_tuple(TP_CREATE_ERROR(NoAddrFoundError), std::string()); } } } // namespace ibv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/ibv/utility.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include namespace tensorpipe { namespace transport { namespace ibv { std::tuple lookupAddrForIface(std::string iface); std::tuple lookupAddrForHostname(); } // namespace ibv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/listener.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include namespace tensorpipe { namespace transport { class Listener { public: using accept_callback_fn = std::function< void(const Error& error, std::shared_ptr connection)>; virtual void accept(accept_callback_fn fn) = 0; // Return address that this listener is listening on. // This may be required if the listening address is not known up // front, or dynamically populated by the operating system (e.g. by // letting the operating system pick a TCP port to listen on). 
virtual std::string addr() const = 0; // Tell the listener what its identifier is. // // This is only supposed to be called from the high-level listener or from // channel contexts. It will only used for logging and debugging purposes. virtual void setId(std::string id) = 0; virtual void close() = 0; virtual ~Listener() = default; }; } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/listener_boilerplate.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include namespace tensorpipe { namespace transport { template class ListenerBoilerplate : public Listener { public: template ListenerBoilerplate( typename ListenerImplBoilerplate::ConstructorToken token, std::shared_ptr context, std::string id, Args... args); explicit ListenerBoilerplate(std::shared_ptr listener); ListenerBoilerplate(const ListenerBoilerplate&) = delete; ListenerBoilerplate(ListenerBoilerplate&&) = delete; ListenerBoilerplate& operator=(const ListenerBoilerplate&) = delete; ListenerBoilerplate& operator=(ListenerBoilerplate&&) = delete; // Queue a callback to be called when a connection comes in. void accept(accept_callback_fn fn) override; // Obtain the listener's address. std::string addr() const override; // Tell the listener what its identifier is. void setId(std::string id) override; // Shut down the connection and its resources. void close() override; ~ListenerBoilerplate() override; protected: // Using a shared_ptr allows us to detach the lifetime of the implementation // from the public object's one and perform the destruction asynchronously. 
const std::shared_ptr impl_; }; template template ListenerBoilerplate::ListenerBoilerplate( typename ListenerImplBoilerplate::ConstructorToken token, std::shared_ptr context, std::string id, Args... args) : impl_(std::make_shared( token, std::move(context), std::move(id), std::forward(args)...)) { static_assert( std::is_base_of, TList>:: value, ""); impl_->init(); } template ListenerBoilerplate::ListenerBoilerplate( std::shared_ptr listener) : impl_(std::move(listener)) { static_assert( std::is_base_of, TList>:: value, ""); } template void ListenerBoilerplate::accept(accept_callback_fn fn) { if (unlikely(!impl_)) { // FIXME In C++-17 perhaps a global static inline variable would be better? static Error error = TP_CREATE_ERROR(ContextNotViableError); fn(error, nullptr); return; } impl_->accept(std::move(fn)); } template std::string ListenerBoilerplate::addr() const { if (unlikely(!impl_)) { return ""; } return impl_->addr(); } template void ListenerBoilerplate::setId(std::string id) { if (unlikely(!impl_)) { return; } impl_->setId(std::move(id)); } template void ListenerBoilerplate::close() { if (unlikely(!impl_)) { return; } impl_->close(); } template ListenerBoilerplate::~ListenerBoilerplate() { close(); } } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/listener_impl_boilerplate.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace transport { template class ListenerImplBoilerplate : public std::enable_shared_from_this { public: class ConstructorToken { public: ConstructorToken(const ConstructorToken&) = default; private: explicit ConstructorToken() {} friend ContextImplBoilerplate; friend ListenerImplBoilerplate; }; ListenerImplBoilerplate( ConstructorToken token, std::shared_ptr context, std::string id); ListenerImplBoilerplate(const ListenerImplBoilerplate&) = delete; ListenerImplBoilerplate(ListenerImplBoilerplate&&) = delete; ListenerImplBoilerplate& operator=(const ListenerImplBoilerplate&) = delete; ListenerImplBoilerplate& operator=(ListenerImplBoilerplate&&) = delete; // Initialize member fields that need `shared_from_this`. void init(); // Queue a callback to be called when a connection comes in. using accept_callback_fn = Listener::accept_callback_fn; void accept(accept_callback_fn fn); // Obtain the listener's address. std::string addr() const; // Tell the listener what its identifier is. void setId(std::string id); // Shut down the listener and its resources. void close(); virtual ~ListenerImplBoilerplate() = default; protected: virtual void initImplFromLoop() = 0; virtual void acceptImplFromLoop(accept_callback_fn fn) = 0; virtual std::string addrImplFromLoop() const = 0; virtual void handleErrorImpl() = 0; void setError(Error error); const std::shared_ptr context_; Error error_{Error::kSuccess}; template std::shared_ptr createAndInitConnection(Args&&... args); // An identifier for the listener, composed of the identifier for the context, // combined with an increasing sequence number. It will be used as a prefix // for the identifiers of connections. All of them will only be used for // logging and debugging purposes. std::string id_; private: // Initialize member fields that need `shared_from_this`. 
void initFromLoop(); // Queue a callback to be called when a connection comes in. void acceptFromLoop(accept_callback_fn fn); // Obtain the listener's address. std::string addrFromLoop() const; void setIdFromLoop(std::string id); // Shut down the connection and its resources. void closeFromLoop(); // Deal with an error. void handleError(); // A sequence number for the calls to accept. uint64_t nextConnectionBeingAccepted_{0}; // Sequence numbers for the connections created by this listener, used to // create their identifiers based off this listener's identifier. They will // only be used for logging and debugging. std::atomic connectionCounter_{0}; // Contexts do sometimes need to call directly into closeFromLoop, in order to // make sure that some of their operations can happen "atomically" on the // connection, without possibly other operations occurring in between (e.g., // an error). friend ContextImplBoilerplate; }; template ListenerImplBoilerplate::ListenerImplBoilerplate( ConstructorToken /* unused */, std::shared_ptr context, std::string id) : context_(std::move(context)), id_(std::move(id)) {} template void ListenerImplBoilerplate::init() { context_->deferToLoop( [impl{this->shared_from_this()}]() { impl->initFromLoop(); }); } template void ListenerImplBoilerplate::initFromLoop() { if (context_->closed()) { // Set the error without calling setError because we do not want to invoke // the subclass's handleErrorImpl as it would find itself in a weird state // (since initFromLoop wouldn't have been called). 
error_ = TP_CREATE_ERROR(ListenerClosedError); TP_VLOG(7) << "Listener " << id_ << " is closing (without initing)"; return; } initImplFromLoop(); } template void ListenerImplBoilerplate::accept( accept_callback_fn fn) { context_->deferToLoop( [impl{this->shared_from_this()}, fn{std::move(fn)}]() mutable { impl->acceptFromLoop(std::move(fn)); }); } template void ListenerImplBoilerplate::acceptFromLoop( accept_callback_fn fn) { TP_DCHECK(context_->inLoop()); uint64_t sequenceNumber = nextConnectionBeingAccepted_++; TP_VLOG(7) << "Listener " << id_ << " received an accept request (#" << sequenceNumber << ")"; fn = [this, sequenceNumber, fn{std::move(fn)}]( const Error& error, std::shared_ptr connection) { TP_VLOG(7) << "Listener " << id_ << " is calling an accept callback (#" << sequenceNumber << ")"; fn(error, std::move(connection)); TP_VLOG(7) << "Listener " << id_ << " done calling an accept callback (#" << sequenceNumber << ")"; }; if (error_) { fn(error_, std::shared_ptr()); return; } acceptImplFromLoop(std::move(fn)); } template std::string ListenerImplBoilerplate::addr() const { std::string addr; context_->runInLoop([this, &addr]() { addr = addrFromLoop(); }); return addr; } template std::string ListenerImplBoilerplate::addrFromLoop() const { TP_DCHECK(context_->inLoop()); return addrImplFromLoop(); } template template std::shared_ptr ListenerImplBoilerplate:: createAndInitConnection(Args&&... 
args) { TP_DCHECK(context_->inLoop()); std::string connectionId = id_ + ".c" + std::to_string(connectionCounter_++); TP_VLOG(7) << "Listener " << id_ << " is opening connection " << connectionId; auto connection = std::make_shared( typename ConnectionImplBoilerplate:: ConstructorToken(), context_, std::move(connectionId), std::forward(args)...); // We initialize the connection from the loop immediately, inline, because the // initialization of a connection accepted by a listener typically happens // partly in the listener (e.g., opening and accepting the socket) and partly // in the connection's initFromLoop, and we need these two steps to happen // "atomicically" to make it impossible for an error to occur in between. connection->initFromLoop(); return std::make_shared>( std::move(connection)); } template void ListenerImplBoilerplate::setId(std::string id) { context_->deferToLoop( [impl{this->shared_from_this()}, id{std::move(id)}]() mutable { impl->setIdFromLoop(std::move(id)); }); } template void ListenerImplBoilerplate::setIdFromLoop( std::string id) { TP_DCHECK(context_->inLoop()); TP_VLOG(7) << "Listener " << id_ << " was renamed to " << id; id_ = std::move(id); } template void ListenerImplBoilerplate::close() { context_->deferToLoop( [impl{this->shared_from_this()}]() { impl->closeFromLoop(); }); } template void ListenerImplBoilerplate::closeFromLoop() { TP_DCHECK(context_->inLoop()); TP_VLOG(7) << "Listener " << id_ << " is closing"; setError(TP_CREATE_ERROR(ListenerClosedError)); } template void ListenerImplBoilerplate::setError(Error error) { // Don't overwrite an error that's already set. 
if (error_ || !error) { return; } error_ = std::move(error); handleError(); } template void ListenerImplBoilerplate::handleError() { TP_DCHECK(context_->inLoop()); TP_VLOG(8) << "Listener " << id_ << " is handling error " << error_.what(); handleErrorImpl(); } } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/shm/connection_impl.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace transport { namespace shm { ConnectionImpl::ConnectionImpl( ConstructorToken token, std::shared_ptr context, std::string id, Socket socket) : ConnectionImplBoilerplate( token, std::move(context), std::move(id)), socket_(std::move(socket)) {} ConnectionImpl::ConnectionImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::string addr) : ConnectionImplBoilerplate( token, std::move(context), std::move(id)), sockaddr_(Sockaddr::createAbstractUnixAddr(addr)) {} void ConnectionImpl::initImplFromLoop() { context_->enroll(*this); Error error; // The connection either got a socket or an address, but not both. TP_DCHECK(socket_.hasValue() ^ sockaddr_.has_value()); if (!socket_.hasValue()) { std::tie(error, socket_) = Socket::createForFamily(AF_UNIX); if (error) { setError(std::move(error)); return; } error = socket_.connect(sockaddr_.value()); if (error) { setError(std::move(error)); return; } } // Ensure underlying control socket is non-blocking such that it // works well with event driven I/O. error = socket_.block(false); if (error) { setError(std::move(error)); return; } // Create ringbuffer for inbox. 
std::tie(error, inboxHeaderSegment_, inboxDataSegment_, inboxRb_) = createShmRingBuffer(kBufferSize); TP_THROW_ASSERT_IF(error) << "Couldn't allocate ringbuffer for connection inbox: " << error.what(); // Register method to be called when our peer writes to our inbox. inboxReactorToken_ = context_->addReaction([this]() { TP_VLOG(9) << "Connection " << id_ << " is reacting to the peer writing to the inbox"; processReadOperationsFromLoop(); }); // Register method to be called when our peer reads from our outbox. outboxReactorToken_ = context_->addReaction([this]() { TP_VLOG(9) << "Connection " << id_ << " is reacting to the peer reading from the outbox"; processWriteOperationsFromLoop(); }); // We're sending file descriptors first, so wait for writability. state_ = SEND_FDS; context_->registerDescriptor(socket_.fd(), EPOLLOUT, shared_from_this()); } void ConnectionImpl::readImplFromLoop(read_callback_fn fn) { readOperations_.emplace_back(std::move(fn)); // If the inbox already contains some data, we may be able to process this // operation right away. processReadOperationsFromLoop(); } void ConnectionImpl::readImplFromLoop( AbstractNopHolder& object, read_nop_callback_fn fn) { readOperations_.emplace_back( &object, [fn{std::move(fn)}]( const Error& error, const void* /* unused */, size_t /* unused */) { fn(error); }); // If the inbox already contains some data, we may be able to process this // operation right away. processReadOperationsFromLoop(); } void ConnectionImpl::readImplFromLoop( void* ptr, size_t length, read_callback_fn fn) { readOperations_.emplace_back(ptr, length, std::move(fn)); // If the inbox already contains some data, we may be able to process this // operation right away. 
processReadOperationsFromLoop(); } void ConnectionImpl::writeImplFromLoop( const void* ptr, size_t length, write_callback_fn fn) { writeOperations_.emplace_back(ptr, length, std::move(fn)); // If the outbox has some free space, we may be able to process this operation // right away. processWriteOperationsFromLoop(); } void ConnectionImpl::writeImplFromLoop( const AbstractNopHolder& object, write_callback_fn fn) { writeOperations_.emplace_back(&object, std::move(fn)); // If the outbox has some free space, we may be able to process this operation // right away. processWriteOperationsFromLoop(); } void ConnectionImpl::handleEventsFromLoop(int events) { TP_DCHECK(context_->inLoop()); TP_VLOG(9) << "Connection " << id_ << " is handling an event on its socket (" << EpollLoop::formatEpollEvents(events) << ")"; // Handle only one of the events in the mask. Events on the control // file descriptor are rare enough for the cost of having epoll call // into this function multiple times to not matter. The benefit is // that every handler can close and unregister the control file // descriptor from the event loop, without worrying about the next // handler trying to do so as well. // In some cases the socket could be in a state where it's both in an error // state and readable/writable. If we checked for EPOLLIN or EPOLLOUT first // and then returned after handling them, we would keep doing so forever and // never reach the error handling. So we should keep the error check first. 
if (events & EPOLLERR) { int error; socklen_t errorlen = sizeof(error); int rv = getsockopt( socket_.fd(), SOL_SOCKET, SO_ERROR, reinterpret_cast(&error), &errorlen); if (rv == -1) { setError(TP_CREATE_ERROR(SystemError, "getsockopt", rv)); } else { setError(TP_CREATE_ERROR(SystemError, "async error on socket", error)); } return; } if (events & EPOLLIN) { handleEventInFromLoop(); return; } if (events & EPOLLOUT) { handleEventOutFromLoop(); return; } // Check for hangup last, as there could be cases where we get EPOLLHUP but // there's still data to be read from the socket, so we want to deal with that // before dealing with the hangup. if (events & EPOLLHUP) { setError(TP_CREATE_ERROR(EOFError)); return; } } void ConnectionImpl::handleEventInFromLoop() { TP_DCHECK(context_->inLoop()); if (state_ == RECV_FDS) { Fd reactorHeaderFd; Fd reactorDataFd; Fd outboxHeaderFd; Fd outboxDataFd; Reactor::TToken peerInboxReactorToken; Reactor::TToken peerOutboxReactorToken; // Receive the reactor token, reactor fds, and inbox fds. auto err = socket_.recvPayloadAndFds( peerInboxReactorToken, peerOutboxReactorToken, reactorHeaderFd, reactorDataFd, outboxHeaderFd, outboxDataFd); if (err) { setError(std::move(err)); return; } // Load ringbuffer for outbox. std::tie(err, outboxHeaderSegment_, outboxDataSegment_, outboxRb_) = loadShmRingBuffer( std::move(outboxHeaderFd), std::move(outboxDataFd)); TP_THROW_ASSERT_IF(err) << "Couldn't access ringbuffer of connection outbox: " << err.what(); // Initialize remote reactor trigger. peerReactorTrigger_.emplace( std::move(reactorHeaderFd), std::move(reactorDataFd)); peerInboxReactorToken_ = peerInboxReactorToken; peerOutboxReactorToken_ = peerOutboxReactorToken; // The connection is usable now. state_ = ESTABLISHED; processWriteOperationsFromLoop(); // Trigger read operations in case a pair of local read() and remote // write() happened before connection is established. Otherwise read() // callback would lose if it's the only read() request. 
processReadOperationsFromLoop(); return; } if (state_ == ESTABLISHED) { // We don't expect to read anything on this socket once the // connection has been established. If we do, assume it's a // zero-byte read indicating EOF. setError(TP_CREATE_ERROR(EOFError)); return; } TP_THROW_ASSERT() << "EPOLLIN event not handled in state " << state_; } void ConnectionImpl::handleEventOutFromLoop() { TP_DCHECK(context_->inLoop()); if (state_ == SEND_FDS) { int reactorHeaderFd; int reactorDataFd; std::tie(reactorHeaderFd, reactorDataFd) = context_->reactorFds(); // Send our reactor token, reactor fds, and inbox fds. auto err = socket_.sendPayloadAndFds( inboxReactorToken_.value(), outboxReactorToken_.value(), reactorHeaderFd, reactorDataFd, inboxHeaderSegment_.getFd(), inboxDataSegment_.getFd()); if (err) { setError(std::move(err)); return; } // Sent our fds. Wait for fds from peer. state_ = RECV_FDS; context_->registerDescriptor(socket_.fd(), EPOLLIN, shared_from_this()); return; } TP_THROW_ASSERT() << "EPOLLOUT event not handled in state " << state_; } void ConnectionImpl::processReadOperationsFromLoop() { TP_DCHECK(context_->inLoop()); // Process all read read operations that we can immediately serve, only // when connection is established. 
if (state_ != ESTABLISHED) { return; } // Serve read operations Consumer inboxConsumer(inboxRb_); while (!readOperations_.empty()) { RingbufferReadOperation& readOperation = readOperations_.front(); if (readOperation.handleRead(inboxConsumer) > 0) { peerReactorTrigger_->run(peerOutboxReactorToken_.value()); } if (readOperation.completed()) { readOperations_.pop_front(); } else { break; } } } void ConnectionImpl::processWriteOperationsFromLoop() { TP_DCHECK(context_->inLoop()); if (state_ != ESTABLISHED) { return; } Producer outboxProducer(outboxRb_); while (!writeOperations_.empty()) { RingbufferWriteOperation& writeOperation = writeOperations_.front(); if (writeOperation.handleWrite(outboxProducer) > 0) { peerReactorTrigger_->run(peerInboxReactorToken_.value()); } if (writeOperation.completed()) { writeOperations_.pop_front(); } else { break; } } } void ConnectionImpl::handleErrorImpl() { for (auto& readOperation : readOperations_) { readOperation.handleError(error_); } readOperations_.clear(); for (auto& writeOperation : writeOperations_) { writeOperation.handleError(error_); } writeOperations_.clear(); if (inboxReactorToken_.has_value()) { context_->removeReaction(inboxReactorToken_.value()); inboxReactorToken_.reset(); } if (outboxReactorToken_.has_value()) { context_->removeReaction(outboxReactorToken_.value()); outboxReactorToken_.reset(); } if (socket_.hasValue()) { if (state_ > INITIALIZING) { context_->unregisterDescriptor(socket_.fd()); } socket_.reset(); } context_->unenroll(*this); } } // namespace shm } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/shm/connection_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace transport { namespace shm { class ContextImpl; class ListenerImpl; class ConnectionImpl final : public ConnectionImplBoilerplate< ContextImpl, ListenerImpl, ConnectionImpl>, public EpollLoop::EventHandler { constexpr static size_t kBufferSize = 2 * 1024 * 1024; constexpr static int kNumRingbufferRoles = 2; using Consumer = RingBufferRole; using Producer = RingBufferRole; enum State { INITIALIZING = 1, SEND_FDS, RECV_FDS, ESTABLISHED, }; public: // Create a connection that is already connected (e.g. from a listener). ConnectionImpl( ConstructorToken token, std::shared_ptr context, std::string id, Socket socket); // Create a connection that connects to the specified address. ConnectionImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::string addr); // Implementation of EventHandler. void handleEventsFromLoop(int events) override; protected: // Implement the entry points called by ConnectionImplBoilerplate. void initImplFromLoop() override; void readImplFromLoop(read_callback_fn fn) override; void readImplFromLoop(AbstractNopHolder& object, read_nop_callback_fn fn) override; void readImplFromLoop(void* ptr, size_t length, read_callback_fn fn) override; void writeImplFromLoop(const void* ptr, size_t length, write_callback_fn fn) override; void writeImplFromLoop(const AbstractNopHolder& object, write_callback_fn fn) override; void handleErrorImpl() override; private: // Handle events of type EPOLLIN on the UNIX domain socket. // // The only data that is expected on that socket is the file descriptors for // the other side's inbox (which is this side's outbox) and its reactor, plus // the reactor tokens to trigger the other side to read or write. void handleEventInFromLoop(); // Handle events of type EPOLLOUT on the UNIX domain socket. 
// // Once the socket is writable we send the file descriptors for this side's // inbox (which the other side's outbox) and our reactor, plus the reactor // tokens to trigger this connection to read or write. void handleEventOutFromLoop(); State state_{INITIALIZING}; Socket socket_; optional sockaddr_; // Inbox. ShmSegment inboxHeaderSegment_; ShmSegment inboxDataSegment_; RingBuffer inboxRb_; optional inboxReactorToken_; // Outbox. ShmSegment outboxHeaderSegment_; ShmSegment outboxDataSegment_; RingBuffer outboxRb_; optional outboxReactorToken_; // Peer trigger/tokens. optional peerReactorTrigger_; optional peerInboxReactorToken_; optional peerOutboxReactorToken_; // Pending read operations. std::deque readOperations_; // Pending write operations. std::deque writeOperations_; // Process pending read operations if in an operational state. // // This may be triggered by the other side of the connection (by pushing this // side's inbox token to the reactor) when it has written some new data to its // outbox (which is this side's inbox). It is also called by this connection // when it moves into an established state or when a new read operation is // queued, in case data was already available before this connection was ready // to consume it. void processReadOperationsFromLoop(); // Process pending write operations if in an operational state. // // This may be triggered by the other side of the connection (by pushing this // side's outbox token to the reactor) when it has read some data from its // inbox (which is this side's outbox). This is important when some of this // side's writes couldn't complete because the outbox was full, and thus they // needed to wait for some of its data to be read. This method is also called // by this connection when it moves into an established state, in case some // writes were queued before the connection was ready to process them, or when // a new write operation is queued. 
void processWriteOperationsFromLoop(); }; } // namespace shm } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/shm/context_impl.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include #include namespace tensorpipe { namespace transport { namespace shm { namespace { // Prepend descriptor with transport name so it's easy to // disambiguate descriptors when debugging. const std::string kDomainDescriptorPrefix{"shm:"}; } // namespace std::shared_ptr ContextImpl::create() { std::ostringstream oss; oss << kDomainDescriptorPrefix; // This transport only works across processes on the same machine, and we // detect that by computing the boot ID. auto bootID = getBootID(); TP_THROW_ASSERT_IF(!bootID.has_value()) << "Unable to read boot_id"; oss << bootID.value(); // This transport bootstraps a connection by opening a UNIX domain socket, for // which it uses an "abstract" address (i.e., just an identifier, which is not // materialized to a filesystem path). In order for the two endpoints to // access each other's address they must be in the same Linux kernel network // namespace (see network_namespaces(7)). auto nsID = getLinuxNamespaceId(LinuxNamespace::kNet); if (!nsID.has_value()) { TP_VLOG(8) << "Unable to read net namespace ID"; return nullptr; } oss << '_' << nsID.value(); // Over that UNIX domain socket, the two endpoints exchange file descriptors // to regions of shared memory. Some restrictions may be in place that prevent // allocating such regions, hence let's allocate one here to see if it works. 
Error error; ShmSegment segment; std::tie(error, segment) = ShmSegment::alloc(1024 * 1024); if (error) { TP_VLOG(8) << "Couldn't allocate shared memory segment: " << error.what(); return nullptr; } // A separate problem is that /dev/shm may be sized too small for all the // memory we need to allocate. However, our memory usage is unbounded, as it // grows as we open more connections, hence we cannot check it in advance. std::string domainDescriptor = oss.str(); TP_VLOG(8) << "The domain descriptor for SHM is " << domainDescriptor; return std::make_shared(std::move(domainDescriptor)); } ContextImpl::ContextImpl(std::string domainDescriptor) : ContextImplBoilerplate( std::move(domainDescriptor)) {} void ContextImpl::handleErrorImpl() { loop_.close(); reactor_.close(); } void ContextImpl::joinImpl() { loop_.join(); reactor_.join(); } bool ContextImpl::inLoop() const { return reactor_.inLoop(); }; void ContextImpl::deferToLoop(std::function fn) { reactor_.deferToLoop(std::move(fn)); }; void ContextImpl::registerDescriptor( int fd, int events, std::shared_ptr h) { loop_.registerDescriptor(fd, events, std::move(h)); } void ContextImpl::unregisterDescriptor(int fd) { loop_.unregisterDescriptor(fd); } ContextImpl::TToken ContextImpl::addReaction(TFunction fn) { return reactor_.add(std::move(fn)); } void ContextImpl::removeReaction(TToken token) { reactor_.remove(token); } std::tuple ContextImpl::reactorFds() { return reactor_.fds(); } } // namespace shm } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/shm/context_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include #include #include #include #include namespace tensorpipe { namespace transport { namespace shm { class ConnectionImpl; class ListenerImpl; class ContextImpl final : public ContextImplBoilerplate { public: static std::shared_ptr create(); explicit ContextImpl(std::string domainDescriptor); // Implement the DeferredExecutor interface. bool inLoop() const override; void deferToLoop(std::function fn) override; void registerDescriptor( int fd, int events, std::shared_ptr h); void unregisterDescriptor(int fd); using TToken = uint32_t; using TFunction = std::function; TToken addReaction(TFunction fn); void removeReaction(TToken token); std::tuple reactorFds(); protected: // Implement the entry points called by ContextImplBoilerplate. void handleErrorImpl() override; void joinImpl() override; private: Reactor reactor_; EpollLoop loop_{this->reactor_}; }; } // namespace shm } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/shm/factory.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include namespace tensorpipe { namespace transport { namespace shm { std::shared_ptr create() { return std::make_shared< ContextBoilerplate>(); } } // namespace shm } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/shm/factory.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include namespace tensorpipe { namespace transport { namespace shm { std::shared_ptr create(); } // namespace shm } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/shm/listener_impl.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace transport { namespace shm { ListenerImpl::ListenerImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::string addr) : ListenerImplBoilerplate( token, std::move(context), std::move(id)), sockaddr_(Sockaddr::createAbstractUnixAddr(addr)) {} void ListenerImpl::initImplFromLoop() { context_->enroll(*this); Error error; TP_DCHECK(!socket_.hasValue()); std::tie(error, socket_) = Socket::createForFamily(AF_UNIX); if (error) { setError(std::move(error)); return; } error = socket_.bind(sockaddr_); if (error) { setError(std::move(error)); return; } error = socket_.block(false); if (error) { setError(std::move(error)); return; } error = socket_.listen(128); if (error) { setError(std::move(error)); return; } struct sockaddr_storage addr; socklen_t addrlen; std::tie(error, addr, addrlen) = socket_.getSockName(); if (error) { setError(std::move(error)); return; } sockaddr_ = Sockaddr(reinterpret_cast(&addr), addrlen); } void ListenerImpl::handleErrorImpl() { if (!fns_.empty()) { context_->unregisterDescriptor(socket_.fd()); } socket_.reset(); for (auto& fn : fns_) { fn(error_, std::shared_ptr()); } fns_.clear(); context_->unenroll(*this); } void ListenerImpl::acceptImplFromLoop(accept_callback_fn fn) { fns_.push_back(std::move(fn)); // Only register if we go from 0 to 
1 pending callbacks. In other cases we // already had a pending callback and thus we were already registered. if (fns_.size() == 1) { // Register with loop for readability events. context_->registerDescriptor(socket_.fd(), EPOLLIN, shared_from_this()); } } std::string ListenerImpl::addrImplFromLoop() const { return sockaddr_.str(); } void ListenerImpl::handleEventsFromLoop(int events) { TP_DCHECK(context_->inLoop()); TP_VLOG(9) << "Listener " << id_ << " is handling an event on its socket (" << EpollLoop::formatEpollEvents(events) << ")"; if (events & EPOLLERR) { int error; socklen_t errorlen = sizeof(error); int rv = getsockopt( socket_.fd(), SOL_SOCKET, SO_ERROR, reinterpret_cast(&error), &errorlen); if (rv == -1) { setError(TP_CREATE_ERROR(SystemError, "getsockopt", rv)); } else { setError(TP_CREATE_ERROR(SystemError, "async error on socket", error)); } return; } if (events & EPOLLHUP) { setError(TP_CREATE_ERROR(EOFError)); return; } TP_ARG_CHECK_EQ(events, EPOLLIN); Error error; Socket socket; std::tie(error, socket) = socket_.accept(); if (error) { setError(std::move(error)); return; } TP_DCHECK(!fns_.empty()) << "when the callback is disarmed the listener's descriptor is supposed " << "to be unregistered"; auto fn = std::move(fns_.front()); fns_.pop_front(); if (fns_.empty()) { context_->unregisterDescriptor(socket_.fd()); } fn(Error::kSuccess, createAndInitConnection(std::move(socket))); } } // namespace shm } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/shm/listener_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include #include #include #include #include #include namespace tensorpipe { namespace transport { namespace shm { class ConnectionImpl; class ContextImpl; class ListenerImpl final : public ListenerImplBoilerplate, public EpollLoop::EventHandler { public: // Create a listener that listens on the specified address. ListenerImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::string addr); // Implementation of EventHandler. void handleEventsFromLoop(int events) override; protected: // Implement the entry points called by ListenerImplBoilerplate. void initImplFromLoop() override; void acceptImplFromLoop(accept_callback_fn fn) override; std::string addrImplFromLoop() const override; void handleErrorImpl() override; private: Socket socket_; Sockaddr sockaddr_; std::deque fns_; }; } // namespace shm } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/shm/reactor.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include namespace tensorpipe { namespace transport { namespace shm { namespace { void writeToken(Reactor::Producer& producer, Reactor::TToken token) { for (;;) { auto rv = producer.write(&token, sizeof(token)); if (rv == -EAGAIN) { // There's contention on the spin-lock, wait for it by retrying. std::this_thread::yield(); continue; } if (rv == -ENODATA) { // The ringbuffer is full. Retrying should typically work, but might lead // to a deadlock if, for example, a reactor thread is trying to write a // token to its own ringbuffer, as then it would be stuck here and never // proceed to consume data from the ringbuffer. This could also happen // across multiple processes. 
This case seems remote enough, and a proper // solution rather complicated, that we're going to take that risk... std::this_thread::yield(); continue; } TP_DCHECK_EQ(rv, sizeof(token)); break; } } } // namespace Reactor::Reactor() { Error error; std::tie(error, headerSegment_, dataSegment_, rb_) = createShmRingBuffer(kSize); TP_THROW_ASSERT_IF(error) << "Couldn't allocate ringbuffer for reactor: " << error.what(); startThread("TP_SHM_reactor"); } void Reactor::close() { if (!closed_.exchange(true)) { stopBusyPolling(); } } void Reactor::join() { close(); if (!joined_.exchange(true)) { joinThread(); } } Reactor::~Reactor() { join(); } Reactor::TToken Reactor::add(TFunction fn) { std::unique_lock lock(mutex_); TToken token; // Either reuse a token or generate a new one. auto it = reusableTokens_.begin(); if (it != reusableTokens_.end()) { token = *it; reusableTokens_.erase(it); } else { // If there are no reusable tokens, the next token is always equal // to the number of tokens in use + 1. token = functions_.size(); } // Ensure there is enough space in the functions vector. if (functions_.size() <= token) { functions_.resize(token + 1); } functions_[token] = std::move(fn); functionCount_++; return token; } void Reactor::remove(TToken token) { std::unique_lock lock(mutex_); functions_[token] = nullptr; reusableTokens_.insert(token); functionCount_--; } std::tuple Reactor::fds() const { return std::make_tuple(headerSegment_.getFd(), dataSegment_.getFd()); } bool Reactor::pollOnce() { Consumer reactorConsumer(rb_); uint32_t token; auto ret = reactorConsumer.read(&token, sizeof(token)); if (ret == -ENODATA) { return false; } TP_THROW_SYSTEM_IF(ret < 0, -ret); TFunction fn; // Make copy of std::function so we don't need // to hold the lock while executing it. 
{ std::unique_lock lock(mutex_); TP_DCHECK_LT(token, functions_.size()); fn = functions_[token]; } if (fn) { fn(); } return true; } bool Reactor::readyToClose() { return functionCount_ == 0; } Reactor::Trigger::Trigger(Fd headerFd, Fd dataFd) { // The header and data segment objects take over ownership // of file descriptors. Release them to avoid double close. Error error; std::tie(error, headerSegment_, dataSegment_, rb_) = loadShmRingBuffer( std::move(headerFd), std::move(dataFd)); TP_THROW_ASSERT_IF(error) << "Couldn't access ringbuffer of remote reactor: " << error.what(); } void Reactor::Trigger::run(TToken token) { Producer producer(rb_); writeToken(producer, token); } } // namespace shm } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/shm/reactor.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace transport { namespace shm { // Reactor loop. // // Companion class to the event loop in `loop.h` that executes // functions on triggers. The triggers are posted to a shared memory // ring buffer, so this can be done by other processes on the same // machine. It uses extra data in the ring buffer header to store a // mutex and condition variable to avoid a busy loop. // class Reactor final : public BusyPollingLoop { // This allows for buffering 1M triggers (at 4 bytes a piece). 
static constexpr auto kSize = 4 * 1024 * 1024; static constexpr int kNumRingbufferRoles = 2; public: using TFunction = std::function; using TToken = uint32_t; using Consumer = RingBufferRole; using Producer = RingBufferRole; Reactor(); // Add function to the reactor. // Returns token that can be used to trigger it. TToken add(TFunction fn); // Removes function associated with token from reactor. void remove(TToken token); // Returns the file descriptors for the underlying ring buffer. std::tuple fds() const; void close(); void join(); ~Reactor(); protected: bool pollOnce() override; bool readyToClose() override; private: ShmSegment headerSegment_; ShmSegment dataSegment_; RingBuffer rb_; std::mutex mutex_; std::atomic closed_{false}; std::atomic joined_{false}; // Tokens are placed in this set if they can be reused. std::set reusableTokens_; // Map reactor tokens to functions. // // The tokens are reused so we don't worry about unbounded growth // and comfortably use a std::vector here. // std::vector functions_; // Count how many functions are registered. std::atomic functionCount_{0}; public: class Trigger { public: Trigger(Fd header, Fd data); void run(TToken token); private: ShmSegment headerSegment_; ShmSegment dataSegment_; RingBuffer rb_; }; }; } // namespace shm } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/shm/sockaddr.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/

#include
#include
#include
#include
#include
#include

namespace tensorpipe {
namespace transport {
namespace shm {

// Build an abstract-namespace UNIX domain socket address from `name`.
// An empty name yields an address suitable for kernel autobind.
Sockaddr Sockaddr::createAbstractUnixAddr(const std::string& name) {
  struct sockaddr_un sun;
  sun.sun_family = AF_UNIX;
  std::memset(&sun.sun_path, 0, sizeof(sun.sun_path));

  // There are three "modes" for binding UNIX domain sockets:
  // - if len(path) == 0: it autobinds to an abstract address
  // - if len(path) > 0 and path[0] == 0: it uses an explicit abstract address
  // - if len(path) > 0 and path[0] != 0: it uses a concrete filesystem path
  if (name == "") {
    // Pass only the family so the kernel autobinds (len(path) == 0).
    return Sockaddr(
        reinterpret_cast(&sun), sizeof(sun.sun_family));
  } else {
    // The leading NUL byte (left in place by the memset above) marks the
    // address as abstract; the name is truncated to fit sun_path if needed.
    constexpr size_t offset = 1;
    const size_t len = std::min(sizeof(sun.sun_path) - offset, name.size());
    std::strncpy(&sun.sun_path[offset], name.data(), len);

    // Note: instead of using sizeof(sun) we compute the addrlen from
    // the string length of the abstract socket name. If we use
    // sizeof(sun), lsof shows all the trailing NUL characters.
    return Sockaddr(
        reinterpret_cast(&sun), sizeof(sun.sun_family) + offset + len);
  }
};

// Copy a generic sockaddr into internal storage, zero-padding the rest so
// comparisons and later reads of untouched fields are well-defined.
Sockaddr::Sockaddr(const struct sockaddr* addr, socklen_t addrlen) {
  TP_ARG_CHECK(addr != nullptr);
  TP_ARG_CHECK_LE(addrlen, sizeof(addr_));
  std::memset(&addr_, 0, sizeof(addr_));
  std::memcpy(&addr_, addr, addrlen);
  addrlen_ = addrlen;
}

// Render the abstract socket name (without the leading NUL byte).
// An autobind-only address (family only, no path) renders as "".
std::string Sockaddr::str() const {
  TP_DCHECK_GE(addrlen_, sizeof(sockaddr_un::sun_family));
  if (addrlen_ == sizeof(sockaddr_un::sun_family)) {
    return "";
  } else {
    const struct sockaddr_un* sun{
        reinterpret_cast(&addr_)};
    TP_DCHECK_EQ(sun->sun_path[0], '\0');
    constexpr size_t offset = 1;
    // The name length is whatever addrlen_ covers beyond family + NUL marker;
    // this mirrors the addrlen computed in createAbstractUnixAddr.
    const size_t len = addrlen_ - sizeof(sun->sun_family) - offset;
    return std::string(&sun->sun_path[offset], len);
  }
}

} // namespace shm
} // namespace transport
} // namespace tensorpipe


================================================
FILE: tensorpipe/transport/shm/sockaddr.h
================================================
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace transport { namespace shm { class Sockaddr final : public tensorpipe::Sockaddr { public: static Sockaddr createAbstractUnixAddr(const std::string& name); inline const struct sockaddr* addr() const override { return reinterpret_cast(&addr_); } inline socklen_t addrlen() const override { return addrlen_; } std::string str() const; explicit Sockaddr(const struct sockaddr* addr, socklen_t addrlen); private: struct sockaddr_storage addr_; socklen_t addrlen_; }; } // namespace shm } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/uv/connection_impl.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace transport { namespace uv { ConnectionImpl::ConnectionImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::unique_ptr handle) : ConnectionImplBoilerplate( token, std::move(context), std::move(id)), handle_(std::move(handle)) {} ConnectionImpl::ConnectionImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::string addr) : ConnectionImplBoilerplate( token, std::move(context), std::move(id)), handle_(context_->createHandle()), sockaddr_(Sockaddr::createInetSockAddr(addr)) {} void ConnectionImpl::initImplFromLoop() { context_->enroll(*this); TP_VLOG(9) << "Connection " << id_ << " is initializing in loop"; if (sockaddr_.has_value()) { TP_THROW_ASSERT_IF(context_->closed()); handle_->initFromLoop(); handle_->connectFromLoop(sockaddr_.value(), [this](int status) { if (status < 0) { setError(TP_CREATE_ERROR(UVError, status)); } }); } handle_->armCloseCallbackFromLoop( [this]() { this->closeCallbackFromLoop(); }); handle_->armAllocCallbackFromLoop( [this](uv_buf_t* buf) { this->allocCallbackFromLoop(buf); }); handle_->armReadCallbackFromLoop([this](ssize_t nread, const uv_buf_t* buf) { this->readCallbackFromLoop(nread, buf); }); } void ConnectionImpl::readImplFromLoop(read_callback_fn fn) { readOperations_.emplace_back(std::move(fn)); // Start reading if this is the first read operation. if (readOperations_.size() == 1) { handle_->readStartFromLoop(); } } void ConnectionImpl::readImplFromLoop( void* ptr, size_t length, read_callback_fn fn) { readOperations_.emplace_back(ptr, length, std::move(fn)); // Start reading if this is the first read operation. 
if (readOperations_.size() == 1) { handle_->readStartFromLoop(); } } void ConnectionImpl::writeImplFromLoop( const void* ptr, size_t length, write_callback_fn fn) { writeOperations_.emplace_back(ptr, length, std::move(fn)); auto& writeOperation = writeOperations_.back(); StreamWriteOperation::Buf* bufsPtr; unsigned int bufsLen; std::tie(bufsPtr, bufsLen) = writeOperation.getBufs(); const std::array uvBufs = { uv_buf_t{bufsPtr[0].base, bufsPtr[0].len}, uv_buf_t{bufsPtr[1].base, bufsPtr[1].len}}; handle_->writeFromLoop(uvBufs.data(), bufsLen, [this](int status) { this->writeCallbackFromLoop(status); }); } void ConnectionImpl::allocCallbackFromLoop(uv_buf_t* buf) { TP_DCHECK(context_->inLoop()); TP_THROW_ASSERT_IF(readOperations_.empty()); TP_VLOG(9) << "Connection " << id_ << " has incoming data for which it needs to provide a buffer"; readOperations_.front().allocFromLoop(&buf->base, &buf->len); } void ConnectionImpl::readCallbackFromLoop( ssize_t nread, const uv_buf_t* /* unused */) { TP_DCHECK(context_->inLoop()); TP_VLOG(9) << "Connection " << id_ << " has completed reading some data (" << (nread >= 0 ? std::to_string(nread) + " bytes" : formatUvError(nread)) << ")"; if (nread < 0) { setError(TP_CREATE_ERROR(UVError, nread)); return; } TP_THROW_ASSERT_IF(readOperations_.empty()); auto& readOperation = readOperations_.front(); readOperation.readFromLoop(nread); if (readOperation.completeFromLoop()) { readOperation.callbackFromLoop(Error::kSuccess); // Remove the completed operation. // If this was the final pending operation, this instance should // no longer receive allocation and read callbacks. 
readOperations_.pop_front(); if (readOperations_.empty()) { handle_->readStopFromLoop(); } } } void ConnectionImpl::writeCallbackFromLoop(int status) { TP_DCHECK(context_->inLoop()); TP_VLOG(9) << "Connection " << id_ << " has completed a write request (" << formatUvError(status) << ")"; if (status < 0) { setError(TP_CREATE_ERROR(UVError, status)); // Do NOT return, because the error handler method will only fire the // callbacks of the read operations, because we can only fire the callbacks // of the write operations after their corresponding UV requests complete // (or else the user may deallocate the buffers while the loop is still // processing them), therefore we must fire the write operation callbacks in // this method, both in case of success and of error. } TP_THROW_ASSERT_IF(writeOperations_.empty()); auto& writeOperation = writeOperations_.front(); writeOperation.callbackFromLoop(error_); writeOperations_.pop_front(); } void ConnectionImpl::closeCallbackFromLoop() { TP_DCHECK(context_->inLoop()); TP_VLOG(9) << "Connection " << id_ << " has finished closing its handle"; TP_DCHECK(writeOperations_.empty()); context_->unenroll(*this); } void ConnectionImpl::handleErrorImpl() { for (auto& readOperation : readOperations_) { readOperation.callbackFromLoop(error_); } readOperations_.clear(); // Do NOT fire the callbacks of the write operations, because we must wait for // their corresponding UV write requests to complete (or else the user may // deallocate the buffers while the loop is still processing them). handle_->closeFromLoop(); // Do NOT unenroll here, as we must keep the UV handle alive until the close // callback fires. } } // namespace uv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/uv/connection_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace transport { namespace uv { class ContextImpl; class ListenerImpl; class ConnectionImpl final : public ConnectionImplBoilerplate< ContextImpl, ListenerImpl, ConnectionImpl> { public: // Create a connection that is already connected (e.g. from a listener). ConnectionImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::unique_ptr handle); // Create a connection that connects to the specified address. ConnectionImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::string addr); protected: // Implement the entry points called by ConnectionImplBoilerplate. void initImplFromLoop() override; void readImplFromLoop(read_callback_fn fn) override; void readImplFromLoop(void* ptr, size_t length, read_callback_fn fn) override; void writeImplFromLoop(const void* ptr, size_t length, write_callback_fn fn) override; void handleErrorImpl() override; private: // Called when libuv is about to read data from connection. void allocCallbackFromLoop(uv_buf_t* buf); // Called when libuv has read data from connection. void readCallbackFromLoop(ssize_t nread, const uv_buf_t* buf); // Called when libuv has written data to connection. void writeCallbackFromLoop(int status); // Called when libuv has closed the handle. void closeCallbackFromLoop(); const std::unique_ptr handle_; optional sockaddr_; std::deque readOperations_; std::deque writeOperations_; }; } // namespace uv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/uv/context_impl.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. 
* * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include namespace tensorpipe { namespace transport { namespace uv { namespace { // Prepend descriptor with transport name so it's easy to // disambiguate descriptors when debugging. const std::string kDomainDescriptorPrefix{"uv:"}; std::string generateDomainDescriptor() { return kDomainDescriptorPrefix + "*"; } } // namespace std::shared_ptr ContextImpl::create() { return std::make_shared(); } ContextImpl::ContextImpl() : ContextImplBoilerplate( generateDomainDescriptor()) {} void ContextImpl::handleErrorImpl() { loop_.close(); } void ContextImpl::joinImpl() { loop_.join(); } bool ContextImpl::inLoop() const { return loop_.inLoop(); }; void ContextImpl::deferToLoop(std::function fn) { loop_.deferToLoop(std::move(fn)); }; std::unique_ptr ContextImpl::createHandle() { return std::make_unique(loop_.ptr(), loop_); }; } // namespace uv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/uv/context_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include #include namespace tensorpipe { namespace transport { namespace uv { class ConnectionImpl; class ListenerImpl; class ContextImpl final : public ContextImplBoilerplate { public: static std::shared_ptr create(); ContextImpl(); // Implement the DeferredExecutor interface. bool inLoop() const override; void deferToLoop(std::function fn) override; std::unique_ptr createHandle(); protected: // Implement the entry points called by ContextImplBoilerplate. 
void handleErrorImpl() override; void joinImpl() override; private: Loop loop_; }; } // namespace uv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/uv/error.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include namespace tensorpipe { namespace transport { namespace uv { std::string UVError::what() const { return formatUvError(error_); } std::string NoAddrFoundError::what() const { return "no address found"; } } // namespace uv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/uv/error.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include namespace tensorpipe { namespace transport { namespace uv { class UVError final : public BaseError { public: explicit UVError(int error) : error_(error) {} std::string what() const override; private: int error_; }; class NoAddrFoundError final : public BaseError { public: NoAddrFoundError() {} std::string what() const override; }; } // namespace uv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/uv/factory.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/

#include
#include
#include
#include
#include

namespace tensorpipe {
namespace transport {
namespace uv {

// Public entry point of the UV transport: wraps ContextImpl in the shared
// ContextBoilerplate, which layers the close/join lifecycle logic on top.
std::shared_ptr create() {
  return std::make_shared<
      ContextBoilerplate>();
}

} // namespace uv
} // namespace transport
} // namespace tensorpipe


================================================
FILE: tensorpipe/transport/uv/factory.h
================================================
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#include
#include

namespace tensorpipe {
namespace transport {
namespace uv {

// Create a transport context backed by libuv TCP sockets.
std::shared_ptr create();

} // namespace uv
} // namespace transport
} // namespace tensorpipe


================================================
FILE: tensorpipe/transport/uv/listener_impl.cc
================================================
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
*/ #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace transport { namespace uv { ListenerImpl::ListenerImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::string addr) : ListenerImplBoilerplate( token, std::move(context), std::move(id)), handle_(context_->createHandle()), sockaddr_(Sockaddr::createInetSockAddr(addr)) {} void ListenerImpl::initImplFromLoop() { context_->enroll(*this); TP_VLOG(9) << "Listener " << id_ << " is initializing in loop"; TP_THROW_ASSERT_IF(context_->closed()); handle_->initFromLoop(); auto rv = handle_->bindFromLoop(sockaddr_); TP_THROW_UV_IF(rv < 0, rv); handle_->armCloseCallbackFromLoop( [this]() { this->closeCallbackFromLoop(); }); handle_->listenFromLoop( [this](int status) { this->connectionCallbackFromLoop(status); }); sockaddr_ = handle_->sockNameFromLoop(); } void ListenerImpl::acceptImplFromLoop(accept_callback_fn fn) { callback_.arm(std::move(fn)); } std::string ListenerImpl::addrImplFromLoop() const { return sockaddr_.str(); } void ListenerImpl::connectionCallbackFromLoop(int status) { TP_DCHECK(context_->inLoop()); TP_VLOG(9) << "Listener " << id_ << " has an incoming connection ready to be accepted (" << formatUvError(status) << ")"; if (status != 0) { setError(TP_CREATE_ERROR(UVError, status)); return; } auto connection = context_->createHandle(); TP_THROW_ASSERT_IF(context_->closed()); connection->initFromLoop(); handle_->acceptFromLoop(*connection); callback_.trigger( Error::kSuccess, createAndInitConnection(std::move(connection))); } void ListenerImpl::closeCallbackFromLoop() { TP_VLOG(9) << "Listener " << id_ << " has finished closing its handle"; context_->unenroll(*this); } void ListenerImpl::handleErrorImpl() { callback_.triggerAll([&]() { return std::make_tuple(std::cref(error_), std::shared_ptr()); }); handle_->closeFromLoop(); // Do NOT unenroll here, as we must keep the UV handle alive until the close // callback fires. 
} } // namespace uv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/uv/listener_impl.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include namespace tensorpipe { namespace transport { namespace uv { class ConnectionImpl; class ContextImpl; class ListenerImpl final : public ListenerImplBoilerplate< ContextImpl, ListenerImpl, ConnectionImpl> { public: // Create a listener that listens on the specified address. ListenerImpl( ConstructorToken token, std::shared_ptr context, std::string id, std::string addr); protected: // Implement the entry points called by ListenerImplBoilerplate. void initImplFromLoop() override; void acceptImplFromLoop(accept_callback_fn fn) override; std::string addrImplFromLoop() const override; void handleErrorImpl() override; private: // Called by libuv if the listening socket can accept a new connection. Status // is 0 in case of success, < 0 otherwise. See `uv_connection_cb` for more // information. void connectionCallbackFromLoop(int status); // Called when libuv has closed the handle. void closeCallbackFromLoop(); const std::unique_ptr handle_; Sockaddr sockaddr_; // Once an accept callback fires, it becomes disarmed and must be rearmed. // Any firings that occur while the callback is disarmed are stashed and // triggered as soon as it's rearmed. With libuv we don't have the ability // to disable the lower-level callback when the user callback is disarmed. // So we'll keep getting notified of new connections even if we don't know // what to do with them and don't want them. Thus we must store them // somewhere. This is what RearmableCallback is for. 
RearmableCallback> callback_; }; } // namespace uv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/uv/loop.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include namespace tensorpipe { namespace transport { namespace uv { Loop::Loop() { int rv; rv = uv_loop_init(&loop_); TP_THROW_UV_IF(rv < 0, rv); rv = uv_async_init(&loop_, &async_, uvAsyncCb); TP_THROW_UV_IF(rv < 0, rv); async_.data = this; startThread("TP_UV_loop"); } void Loop::close() { if (!closed_.exchange(true)) { // It's fine to capture this because the loop won't be destroyed until join // has completed, and join won't complete until this operation is performed. deferToLoop( [this]() { uv_unref(reinterpret_cast(&async_)); }); } } void Loop::join() { close(); if (!joined_.exchange(true)) { joinThread(); } } Loop::~Loop() noexcept { join(); } void Loop::wakeupEventLoopToDeferFunction() { auto rv = uv_async_send(&async_); TP_THROW_UV_IF(rv < 0, rv); } void Loop::eventLoop() { int rv; rv = uv_run(&loop_, UV_RUN_DEFAULT); TP_THROW_ASSERT_IF(rv > 0) << ": uv_run returned with active handles or requests"; } void Loop::cleanUpLoop() { int rv; uv_ref(reinterpret_cast(&async_)); uv_close(reinterpret_cast(&async_), nullptr); rv = uv_run(&loop_, UV_RUN_NOWAIT); TP_THROW_ASSERT_IF(rv > 0) << ": uv_run returned with active handles or requests"; // Release resources associated with loop. 
rv = uv_loop_close(&loop_); TP_THROW_UV_IF(rv < 0, rv); } void Loop::uvAsyncCb(uv_async_t* handle) { auto& loop = *reinterpret_cast(handle->data); loop.runDeferredFunctionsFromEventLoop(); } } // namespace uv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/uv/loop.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #include #include #include namespace tensorpipe { namespace transport { namespace uv { class Loop final : public EventLoopDeferredExecutor { public: Loop(); uv_loop_t* ptr() { return &loop_; } bool closed() { return closed_; } void close(); void join(); ~Loop() noexcept; protected: // Event loop thread entry function. void eventLoop() override; // Clean up after event loop transitioned to on-demand. void cleanUpLoop() override; // Wake up the event loop. void wakeupEventLoopToDeferFunction() override; private: uv_loop_t loop_; uv_async_t async_; std::atomic closed_{false}; std::atomic joined_{false}; // This function is called by the event loop thread whenever // we have to run a number of deferred functions. static void uvAsyncCb(uv_async_t* handle); }; } // namespace uv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/uv/sockaddr.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #include #include #include #include #include #include #include #include namespace tensorpipe { namespace transport { namespace uv { Sockaddr Sockaddr::createInetSockAddr(const std::string& str) { int port = 0; std::string addrStr; std::string portStr; // If the input string is an IPv6 address with port, the address // itself must be wrapped with brackets. if (addrStr.empty()) { auto start = str.find("["); auto stop = str.find("]"); if (start < stop && start != std::string::npos && stop != std::string::npos) { addrStr = str.substr(start + 1, stop - (start + 1)); if (stop + 1 < str.size() && str[stop + 1] == ':') { portStr = str.substr(stop + 2); } } } // If the input string is an IPv4 address with port, we expect // at least a single period and a single colon in the string. if (addrStr.empty()) { auto period = str.find("."); auto colon = str.find(":"); if (period != std::string::npos && colon != std::string::npos) { addrStr = str.substr(0, colon); portStr = str.substr(colon + 1); } } // Fallback to using entire input string as address without port. if (addrStr.empty()) { addrStr = str; } // Parse port number if specified. if (!portStr.empty()) { port = std::stoi(portStr); if (port < 0 || port > std::numeric_limits::max()) { TP_THROW_EINVAL() << str; } } // Try to convert an IPv4 address. { struct sockaddr_in addr; auto rv = uv_ip4_addr(addrStr.c_str(), port, &addr); if (rv == 0) { return Sockaddr(reinterpret_cast(&addr), sizeof(addr)); } } // Try to convert an IPv6 address. { struct sockaddr_in6 addr; auto rv = uv_ip6_addr(addrStr.c_str(), port, &addr); if (rv == 0) { return Sockaddr(reinterpret_cast(&addr), sizeof(addr)); } } // Invalid address. TP_THROW_EINVAL() << str; // Return bogus to silence "return from non-void function" warning. // Note: we don't reach this point per the throw above. 
return Sockaddr(nullptr, 0); } std::string Sockaddr::str() const { std::ostringstream oss; if (addr_.ss_family == AF_INET) { std::array buf; auto in = reinterpret_cast(&addr_); auto rv = uv_ip4_name(in, buf.data(), buf.size()); TP_THROW_UV_IF(rv < 0, rv); oss << buf.data() << ":" << htons(in->sin_port); } else if (addr_.ss_family == AF_INET6) { std::array buf; auto in6 = reinterpret_cast(&addr_); auto rv = uv_ip6_name(in6, buf.data(), buf.size()); TP_THROW_UV_IF(rv < 0, rv); oss << "[" << buf.data(); if (in6->sin6_scope_id > 0) { std::array scopeBuf; size_t size = sizeof(scopeBuf); rv = uv_if_indextoiid(in6->sin6_scope_id, scopeBuf.data(), &size); TP_THROW_UV_IF(rv < 0, rv); oss << "%" << scopeBuf.data(); } oss << "]:" << htons(in6->sin6_port); } else { TP_THROW_EINVAL() << "invalid address family: " << addr_.ss_family; } return oss.str(); } } // namespace uv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/uv/sockaddr.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include namespace tensorpipe { namespace transport { namespace uv { class Sockaddr final : public tensorpipe::Sockaddr { public: static Sockaddr createInetSockAddr(const std::string& str); Sockaddr(const struct sockaddr* addr, socklen_t addrlen) { TP_ARG_CHECK(addr != nullptr); TP_ARG_CHECK_LE(addrlen, sizeof(addr_)); // Ensure the sockaddr_storage is zeroed, because we don't always // write to all fields in the `sockaddr_[in|in6]` structures. 
std::memset(&addr_, 0, sizeof(addr_)); std::memcpy(&addr_, addr, addrlen); addrlen_ = addrlen; } inline const struct sockaddr* addr() const override { return reinterpret_cast(&addr_); } inline struct sockaddr* addr() { return reinterpret_cast(&addr_); } inline socklen_t addrlen() const override { return addrlen_; } std::string str() const; private: struct sockaddr_storage addr_; socklen_t addrlen_; }; } // namespace uv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/uv/utility.cc ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include namespace tensorpipe { namespace transport { namespace uv { std::tuple lookupAddrForIface(std::string iface) { int rv; InterfaceAddresses addresses; int count; std::tie(rv, addresses, count) = getInterfaceAddresses(); if (rv < 0) { return std::make_tuple(TP_CREATE_ERROR(UVError, rv), std::string()); } for (auto i = 0; i < count; i++) { const uv_interface_address_t& interface = addresses[i]; if (iface != interface.name) { continue; } const auto& address = interface.address; const struct sockaddr* sockaddr = reinterpret_cast(&address); switch (sockaddr->sa_family) { case AF_INET: return std::make_tuple( Error::kSuccess, Sockaddr(sockaddr, sizeof(address.address4)).str()); case AF_INET6: return std::make_tuple( Error::kSuccess, Sockaddr(sockaddr, sizeof(address.address6)).str()); } } return std::make_tuple(TP_CREATE_ERROR(NoAddrFoundError), std::string()); } std::tuple lookupAddrForHostname() { // For some operations we need a libuv event loop. We create a fresh one, just // for this purpose, which we'll drive inline from this thread. This way we // avoid misusing the main event loop in the context impl. 
struct InlineLoop { uv_loop_t loop; InlineLoop() { auto rv = uv_loop_init(&loop); TP_THROW_UV_IF(rv < 0, rv); } ~InlineLoop() { auto rv = uv_loop_close(&loop); TP_THROW_UV_IF(rv < 0, rv); } }; InlineLoop loop; struct InlineDeferredExecutor : public DeferredExecutor { std::thread::id threadId = std::this_thread::get_id(); void deferToLoop(TTask fn) override { TP_THROW_ASSERT() << "How could this be called?! This class is supposed to be " << "instantiated as const, and this method isn't const-qualified"; } bool inLoop() const override { return std::this_thread::get_id() == threadId; } }; const InlineDeferredExecutor executor; int rv; std::string hostname; std::tie(rv, hostname) = getHostname(); if (rv < 0) { return std::make_tuple(TP_CREATE_ERROR(UVError, rv), std::string()); } Addrinfo info; std::tie(rv, info) = getAddrinfoFromLoop(&loop.loop, std::move(hostname)); if (rv < 0) { return std::make_tuple(TP_CREATE_ERROR(UVError, rv), std::string()); } Error error; for (struct addrinfo* rp = info.get(); rp != nullptr; rp = rp->ai_next) { TP_DCHECK(rp->ai_family == AF_INET || rp->ai_family == AF_INET6); TP_DCHECK_EQ(rp->ai_socktype, SOCK_STREAM); TP_DCHECK_EQ(rp->ai_protocol, IPPROTO_TCP); Sockaddr addr = Sockaddr(rp->ai_addr, rp->ai_addrlen); TCPHandle handle(&loop.loop, executor); handle.initFromLoop(); rv = handle.bindFromLoop(addr); handle.closeFromLoop(); // The handle will only be closed at the next loop iteration, so run it. { auto rv = uv_run(&loop.loop, UV_RUN_DEFAULT); TP_THROW_ASSERT_IF(rv > 0); } if (rv < 0) { // Record the first binding error we encounter and return that in the end // if no working address is found, in order to help with debugging. 
if (!error) { error = TP_CREATE_ERROR(UVError, rv); } continue; } return std::make_tuple(Error::kSuccess, addr.str()); } if (error) { return std::make_tuple(std::move(error), std::string()); } else { return std::make_tuple(TP_CREATE_ERROR(NoAddrFoundError), std::string()); } } std::tuple lookupAddrLikeNccl( optional familyFilter) { int rv; InterfaceAddresses addresses; int count; std::tie(rv, addresses, count) = getInterfaceAddresses(); if (rv < 0) { return std::make_tuple(TP_CREATE_ERROR(UVError, rv), std::string()); } // Libuv already only returns the interfaces that are up and running, whose // address is not null, and whose family is IPv4 or IPv6. // NCCL prioritizes the interfaces whose name starts with "ib" (for IP over // InfiniBand?), and deprioritizes those that start with "docker" or "lo". optional withIbPrefix; optional withoutPrefix; optional withDockerPrefix; optional withLoPrefix; for (auto i = 0; i < count; i++) { const uv_interface_address_t& interface = addresses[i]; const struct sockaddr* sockaddr = reinterpret_cast(&interface.address); // NCCL also seems to ignore any IPv6 loopback address. 
if (sockaddr->sa_family == AF_INET6 && interface.is_internal) { continue; } if (familyFilter.has_value() && sockaddr->sa_family != familyFilter.value()) { continue; } std::string addr; switch (sockaddr->sa_family) { case AF_INET: addr = Sockaddr(sockaddr, sizeof(struct sockaddr_in)).str(); break; case AF_INET6: addr = Sockaddr(sockaddr, sizeof(struct sockaddr_in6)).str(); break; } std::string name = interface.name; if (name.find("ib") == 0) { if (!withIbPrefix.has_value()) { withIbPrefix = std::move(addr); } } else if (name.find("docker") == 0) { if (!withDockerPrefix.has_value()) { withDockerPrefix = std::move(addr); } } else if (name.find("lo") == 0) { if (!withLoPrefix.has_value()) { withLoPrefix = std::move(addr); } } else { if (!withoutPrefix.has_value()) { withoutPrefix = std::move(addr); } } } if (withIbPrefix.has_value()) { return std::make_tuple(Error::kSuccess, std::move(withIbPrefix).value()); } else if (withoutPrefix.has_value()) { return std::make_tuple(Error::kSuccess, std::move(withoutPrefix).value()); } else if (withDockerPrefix.has_value()) { return std::make_tuple( Error::kSuccess, std::move(withDockerPrefix).value()); } else if (withLoPrefix.has_value()) { return std::make_tuple(Error::kSuccess, std::move(withLoPrefix).value()); } return std::make_tuple(TP_CREATE_ERROR(NoAddrFoundError), std::string()); } } // namespace uv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/uv/utility.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
*/ #pragma once #include #include #include #include #include namespace tensorpipe { namespace transport { namespace uv { std::tuple lookupAddrForIface(std::string iface); std::tuple lookupAddrForHostname(); // Try to replicate the same logic used by NCCL to find a node's own address. // Roughly, it returns the "first" usable address it can find, and prioritizes // the interfaces with an `ib` prefix and de-prioritizes those with a `docker` // or `lo` prefix. It can optionally only return only IPv4 or IPv4 addresses. std::tuple lookupAddrLikeNccl( optional familyFilter = nullopt); } // namespace uv } // namespace transport } // namespace tensorpipe ================================================ FILE: tensorpipe/transport/uv/uv.h ================================================ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #pragma once #include #include #include #include #include #include #define TP_THROW_UV(err) TP_THROW(std::runtime_error) #define TP_THROW_UV_IF(cond, err) \ if (unlikely(cond)) \ TP_THROW_UV(err) << TP_STRINGIFY(cond) << ": " << uv_strerror(err) namespace tensorpipe { namespace transport { namespace uv { template class BaseHandle { static void uvCloseCb(uv_handle_t* handle) { T& ref = *reinterpret_cast(handle->data); if (ref.closeCallback_ != nullptr) { ref.closeCallback_(); } } public: using TCloseCallback = std::function; explicit BaseHandle(uv_loop_t* loop, const DeferredExecutor& executor) : loop_(loop), executor_(executor) { handle_.data = this; } // Libuv's handles cannot be copied or moved. 
BaseHandle(const BaseHandle&) = delete; BaseHandle(BaseHandle&&) = delete; BaseHandle& operator=(const BaseHandle&) = delete; BaseHandle& operator=(BaseHandle&&) = delete; virtual ~BaseHandle() = default; U* ptr() { return &handle_; } void armCloseCallbackFromLoop(TCloseCallback fn) { TP_DCHECK(this->executor_.inLoop()); TP_THROW_ASSERT_IF(closeCallback_ != nullptr); closeCallback_ = std::move(fn); } void closeFromLoop() { TP_DCHECK(!uv_is_closing(reinterpret_cast(ptr()))); uv_close(reinterpret_cast(ptr()), uvCloseCb); } protected: // Underlying libuv handle. U handle_; // Underlying libuv event loop. uv_loop_t* const loop_; // This DeferredExecutor is only used to check that all calls are performed // from the right thread. const DeferredExecutor& executor_; TCloseCallback closeCallback_; }; template class BaseRequest { public: BaseRequest() { request_.data = this; } // Libuv's requests cannot be copied or moved. BaseRequest(const BaseRequest&) = delete; BaseRequest(BaseRequest&&) = delete; BaseRequest& operator=(const BaseRequest&) = delete; BaseRequest& operator=(BaseRequest&&) = delete; U* ptr() { return &request_; } private: // Underlying libuv request. 
U request_; }; class WriteRequest final : public BaseRequest { static void uvWriteCb(uv_write_t* req, int status) { std::unique_ptr request( reinterpret_cast(req->data)); request->writeCallback_(status); } public: using TWriteCallback = std::function; explicit WriteRequest(TWriteCallback fn) : writeCallback_(std::move(fn)) {} static int perform( uv_stream_t* handle, const uv_buf_t bufs[], unsigned int nbufs, TWriteCallback fn) { auto request = std::make_unique(std::move(fn)); auto rv = uv_write(request->ptr(), handle, bufs, nbufs, uvWriteCb); request.release(); return rv; } private: TWriteCallback writeCallback_; }; template class StreamHandle : public BaseHandle { static void uvConnectionCb(uv_stream_t* server, int status) { T& ref = *reinterpret_cast(server->data); TP_DCHECK(ref.connectionCallback_ != nullptr); ref.connectionCallback_(status); } static void uvAllocCb( uv_handle_t* handle, size_t /* unused */, uv_buf_t* buf) { T& ref = *reinterpret_cast(handle->data); TP_DCHECK(ref.allocCallback_ != nullptr); ref.allocCallback_(buf); } static void uvReadCb( uv_stream_t* server, ssize_t nread, const uv_buf_t* buf) { T& ref = *reinterpret_cast(server->data); TP_DCHECK(ref.readCallback_ != nullptr); ref.readCallback_(nread, buf); } static constexpr int kBacklog = 128; public: using TConnectionCallback = std::function; using TAcceptCallback = std::function; using TAllocCallback = std::function; using TReadCallback = std::function; using BaseHandle::BaseHandle; // TODO Split this into a armConnectionCallback, a listenStart and a // listenStop method, to propagate the backpressure to the clients. 
void listenFromLoop(TConnectionCallback connectionCallback) { TP_DCHECK(this->executor_.inLoop()); TP_THROW_ASSERT_IF(connectionCallback_ != nullptr); connectionCallback_ = std::move(connectionCallback); auto rv = uv_listen( reinterpret_cast(this->ptr()), kBacklog, uvConnectionCb); TP_THROW_UV_IF(rv < 0, rv); } template void acceptFromLoop(V& other) { TP_DCHECK(this->executor_.inLoop()); auto rv = uv_accept( reinterpret_cast(this->ptr()), reinterpret_cast(other.ptr())); TP_THROW_UV_IF(rv < 0, rv); } void armAllocCallbackFromLoop(TAllocCallback fn) { TP_DCHECK(this->executor_.inLoop()); TP_THROW_ASSERT_IF(allocCallback_ != nullptr); allocCallback_ = std::move(fn); } void armReadCallbackFromLoop(TReadCallback fn) { TP_DCHECK(this->executor_.inLoop()); TP_THROW_ASSERT_IF(readCallback_ != nullptr); readCallback_ = std::move(fn); } void readStartFromLoop() { TP_DCHECK(this->executor_.inLoop()); TP_THROW_ASSERT_IF(allocCallback_ == nullptr); TP_THROW_ASSERT_IF(readCallback_ == nullptr); auto rv = uv_read_start( reinterpret_cast(this->ptr()), uvAllocCb, uvReadCb); TP_THROW_UV_IF(rv < 0, rv); } void readStopFromLoop() { TP_DCHECK(this->executor_.inLoop()); auto rv = uv_read_stop(reinterpret_cast(this->ptr())); TP_THROW_UV_IF(rv < 0, rv); } void writeFromLoop( const uv_buf_t bufs[], unsigned int nbufs, WriteRequest::TWriteCallback fn) { TP_DCHECK(this->executor_.inLoop()); auto rv = WriteRequest::perform( reinterpret_cast(this->ptr()), bufs, nbufs, std::move(fn)); TP_THROW_UV_IF(rv < 0, rv); } protected: TConnectionCallback connectionCallback_; TAllocCallback allocCallback_; TReadCallback readCallback_; }; class ConnectRequest final : public BaseRequest { static void uvConnectCb(uv_connect_t* req, int status) { std::unique_ptr request( reinterpret_cast(req->data)); request->connectCallback_(status); } public: using TConnectCallback = std::function; explicit ConnectRequest(TConnectCallback fn) : connectCallback_(std::move(fn)) {} static int perform( uv_tcp_t* handle, const 
struct sockaddr* addr, TConnectCallback fn) { auto request = std::make_unique(std::move(fn)); auto rv = uv_tcp_connect(request->ptr(), handle, addr, uvConnectCb); request.release(); return rv; } private: TConnectCallback connectCallback_; }; class TCPHandle : public StreamHandle { public: using StreamHandle::StreamHandle; void initFromLoop() { TP_DCHECK(this->executor_.inLoop()); int rv; rv = uv_tcp_init(loop_, this->ptr()); TP_THROW_UV_IF(rv < 0, rv); rv = uv_tcp_nodelay(this->ptr(), 1); TP_THROW_UV_IF(rv < 0, rv); } [[nodiscard]] int bindFromLoop(const Sockaddr& addr) { TP_DCHECK(this->executor_.inLoop()); auto rv = uv_tcp_bind(ptr(), addr.addr(), 0); // We don't throw in case of errors here because sometimes we bind in order // to try if an address works and want to handle errors gracefully. return rv; } Sockaddr sockNameFromLoop() { TP_DCHECK(this->executor_.inLoop()); struct sockaddr_storage ss; struct sockaddr* addr = reinterpret_cast(&ss); int addrlen = sizeof(ss); auto rv = uv_tcp_getsockname(ptr(), addr, &addrlen); TP_THROW_UV_IF(rv < 0, rv); return Sockaddr(addr, addrlen); } void connectFromLoop( const Sockaddr& addr, ConnectRequest::TConnectCallback fn) { TP_DCHECK(this->executor_.inLoop()); auto rv = ConnectRequest::perform(ptr(), addr.addr(), std::move(fn)); TP_THROW_UV_IF(rv < 0, rv); } }; struct AddrinfoDeleter { void operator()(struct addrinfo* ptr) const { uv_freeaddrinfo(ptr); } }; using Addrinfo = std::unique_ptr; inline std::tuple getAddrinfoFromLoop( uv_loop_t* loop, std::string hostname) { struct addrinfo hints; std::memset(&hints, 0, sizeof(hints)); hints.ai_family = AF_UNSPEC; hints.ai_socktype = SOCK_STREAM; hints.ai_protocol = IPPROTO_TCP; uv_getaddrinfo_t request; // Don't use a callback, and thus perform the call synchronously, because the // asynchronous version uses a thread pool, and it's not worth spawning new // threads for a functionality which is used so sparingly. 
auto rv = uv_getaddrinfo( loop, &request, /*getaddrinfo_cb=*/nullptr, hostname.c_str(), /*service=*/nullptr, &hints); if (rv != 0) { return std::make_tuple(rv, Addrinfo()); } return std::make_tuple(0, Addrinfo(request.addrinfo, AddrinfoDeleter())); } struct InterfaceAddressesDeleter { explicit InterfaceAddressesDeleter(int count) : count_(count) {} InterfaceAddressesDeleter() = default; void operator()(uv_interface_address_t* ptr) const { uv_free_interface_addresses(ptr, count_); } private: int count_{-1}; }; using InterfaceAddresses = std::unique_ptr; inline std::tuple getInterfaceAddresses() { uv_interface_address_t* info; int count; auto rv = uv_interface_addresses(&info, &count); if (rv != 0) { return std::make_tuple(rv, InterfaceAddresses(), 0); } return std::make_tuple( 0, InterfaceAddresses(info, InterfaceAddressesDeleter(count)), count); } inline std::tuple getHostname() { std::array hostname; size_t size = hostname.size(); auto rv = uv_os_gethostname(hostname.data(), &size); if (rv != 0) { return std::make_tuple(rv, std::string()); } return std::make_tuple( 0, std::string(hostname.data(), hostname.data() + size)); } inline std::string formatUvError(int status) { if (status == 0) { return "success"; } else { std::ostringstream ss; ss << uv_err_name(status) << ": " << uv_strerror(status); return ss.str(); } } } // namespace uv } // namespace transport } // namespace tensorpipe ================================================ FILE: third_party/README.md ================================================ # third_party This directory includes dependencies as [submodules][submodules]. [submodules]: https://git-scm.com/book/en/v2/Git-Tools-Submodules ## Build dependencies * **libuv** is a multi-platform support library with a focus on asynchronous I/O. ## Test dependencies * **backward-cpp** is a beautiful stack trace pretty printer for C++. * **googletest** is a C++ test framework.