Copy disabled (too large)
Download .txt
Showing preview only (20,383K chars total). Download the full file to get everything.
Repository: ROCmSoftwarePlatform/rccl
Branch: develop_deprecated
Commit: 57e58688f44c
Files: 772
Total size: 19.3 MB
Directory structure:
gitextract_sughb21j/
├── .azuredevops/
│ ├── multinode-ci-nightly.yml
│ ├── multinode-ci-pr.yml
│ ├── multinode-ci-slurm-nightly.yml
│ ├── multinode-ci-slurm-pr.yml
│ ├── rocm-ci.yml
│ ├── slurm/
│ │ ├── build.sh
│ │ ├── test_rccl-UnitTests.sh
│ │ └── test_rccl-tests.sh
│ ├── templates/
│ │ ├── build.yml
│ │ ├── test_rccl-UnitTests.yml
│ │ └── test_rccl-tests.yml
│ └── tests/
│ └── pytest/
│ └── HelloWorld.py
├── .clang-format
├── .github/
│ ├── CODEOWNERS
│ ├── PULL_REQUEST_TEMPLATE.md
│ ├── dependabot.yml
│ ├── scripts/
│ │ └── therock_configure_ci.py
│ └── workflows/
│ ├── therock-ci-linux.yml
│ ├── therock-ci.yml
│ ├── therock-test-packages-multi-node.yml
│ └── therock-test-packages-single-node.yml
├── .gitignore
├── .gitmodules
├── .readthedocs.yaml
├── CHANGELOG.md
├── CMakeLists.txt
├── CppCheckSuppressions.txt
├── LICENSE.txt
├── Makefile
├── NOTICES.txt
├── README.md
├── cmake/
│ ├── CheckSymbolExistsNoWarn.cmake
│ ├── Dependencies.cmake
│ ├── DownloadProject.CMakeLists.cmake.in
│ ├── DownloadProject.cmake
│ ├── FindIBVerbs.cmake
│ ├── Findmscclpp_nccl.cmake
│ ├── Findrocshmem_static.cmake
│ ├── MSCCLPP.cmake
│ ├── ROCSHMEM.cmake
│ ├── rcclRAS.cmake
│ ├── rocmIb.cmake
│ └── scripts/
│ ├── add_faults.sh
│ ├── add_unroll.sh
│ ├── extract_metadata.cmake
│ └── git_version.cmake
├── docker/
│ ├── Dockerfile.ubuntu
│ └── README.md
├── docs/
│ ├── .gitignore
│ ├── api-reference/
│ │ ├── api-library.rst
│ │ ├── env-variables.rst
│ │ └── library-specification.rst
│ ├── attributions.rst
│ ├── conf.py
│ ├── doxygen/
│ │ └── Doxyfile
│ ├── how-to/
│ │ ├── rccl-usage-tips.rst
│ │ ├── troubleshooting-rccl.rst
│ │ ├── using-nccl.rst
│ │ └── using-rccl-tuner-plugin-api.rst
│ ├── index.rst
│ ├── install/
│ │ ├── building-installing.rst
│ │ ├── docker-install.rst
│ │ └── installation.rst
│ ├── license.rst
│ ├── sphinx/
│ │ ├── _toc.yml.in
│ │ ├── requirements.in
│ │ └── requirements.txt
│ └── what-is-rccl.rst
├── ext-net/
│ ├── README.md
│ ├── example/
│ │ ├── CMakeLists.txt
│ │ ├── Makefile
│ │ ├── nccl/
│ │ │ ├── common.h
│ │ │ ├── err.h
│ │ │ ├── net.h
│ │ │ ├── net_device.h
│ │ │ ├── net_v10.h
│ │ │ ├── net_v11.h
│ │ │ ├── net_v2.h
│ │ │ ├── net_v3.h
│ │ │ ├── net_v4.h
│ │ │ ├── net_v5.h
│ │ │ ├── net_v6.h
│ │ │ ├── net_v7.h
│ │ │ ├── net_v8.h
│ │ │ ├── net_v9.h
│ │ │ └── types.h
│ │ └── plugin.c
│ └── google-fastsocket/
│ └── Makefile
├── ext-profiler/
│ ├── README.md
│ ├── example/
│ │ ├── CMakeLists.txt
│ │ ├── Makefile
│ │ ├── README.md
│ │ ├── event.h
│ │ ├── nccl/
│ │ │ ├── common.h
│ │ │ ├── err.h
│ │ │ ├── net_ib_v1.h
│ │ │ ├── net_socket_v1.h
│ │ │ ├── profiler.h
│ │ │ ├── profiler_net.h
│ │ │ ├── profiler_v1.h
│ │ │ ├── profiler_v2.h
│ │ │ ├── profiler_v3.h
│ │ │ ├── profiler_v4.h
│ │ │ ├── profiler_v5.h
│ │ │ └── types.h
│ │ ├── plugin.cc
│ │ ├── plugin.h
│ │ ├── print_event.cc
│ │ ├── print_event.h
│ │ └── queue.h
│ ├── google-CoMMA/
│ │ └── Makefile
│ └── inspector/
│ ├── Makefile
│ ├── README.md
│ ├── exporter/
│ │ └── example/
│ │ ├── README.md
│ │ ├── perf_summary_exporter.py
│ │ └── requirements.txt
│ ├── inspector.cc
│ ├── inspector.h
│ ├── inspector_plugin.cc
│ ├── json.cc
│ ├── json.h
│ ├── nccl/
│ │ ├── common.h
│ │ ├── profiler.h
│ │ ├── profiler_net.h
│ │ ├── profiler_v1.h
│ │ ├── profiler_v2.h
│ │ ├── profiler_v3.h
│ │ ├── profiler_v4.h
│ │ ├── profiler_v5.h
│ │ └── types.h
│ └── version.h
├── ext-src/
│ ├── bf16-tuning.patch
│ ├── check_ibv_access_relaxed_ordering.cc
│ ├── cpx.patch
│ ├── device-flag.patch
│ ├── disable-executor.patch
│ ├── disable-format-checks.patch
│ ├── mem-reg.patch
│ ├── mscclpp_ibv_access_relaxed_ordering.patch
│ ├── no-cache.patch
│ ├── non-multiple-128-fix.patch
│ ├── read-allred.patch
│ ├── reg-fix.patch
│ ├── remove-clip.patch
│ └── rocm_netib.patch
├── ext-tuner/
│ ├── README.md
│ ├── basic/
│ │ ├── Makefile
│ │ ├── README.md
│ │ ├── nccl/
│ │ │ ├── common.h
│ │ │ ├── err.h
│ │ │ └── tuner.h
│ │ └── plugin.c
│ ├── example/
│ │ ├── .gitignore
│ │ ├── CMakeLists.txt
│ │ ├── Makefile
│ │ ├── README.md
│ │ ├── nccl/
│ │ │ ├── common.h
│ │ │ ├── err.h
│ │ │ └── tuner.h
│ │ ├── nccl_tuner.conf
│ │ ├── plugin.c
│ │ ├── scripts/
│ │ │ ├── README.md
│ │ │ └── optimize_config.py
│ │ └── test/
│ │ ├── Makefile
│ │ ├── README.md
│ │ └── test_plugin.c
│ └── model_demo/
│ ├── Makefile
│ ├── README.md
│ ├── nccl/
│ │ ├── common.h
│ │ ├── err.h
│ │ └── tuner.h
│ └── plugin.c
├── install.sh
├── makefiles/
│ ├── common.mk
│ ├── formatting.mk
│ └── version.mk
├── pkg/
│ ├── Makefile
│ ├── debian/
│ │ ├── .gitignore
│ │ ├── Makefile
│ │ ├── changelog.in
│ │ ├── compat
│ │ ├── control.in
│ │ ├── gbp.conf
│ │ ├── libnccl-dev.install.in
│ │ ├── libnccl2.install.in
│ │ ├── rules
│ │ └── source/
│ │ └── format
│ ├── redhat/
│ │ ├── Makefile
│ │ └── nccl.spec.in
│ ├── srctxz/
│ │ ├── Makefile
│ │ └── create_srctxz.sh.in
│ └── txz/
│ ├── Makefile
│ └── create_txz.sh.in
├── rtest.xml
├── src/
│ ├── CMakeLists.txt
│ ├── Makefile
│ ├── allocator.cc
│ ├── bootstrap.cc
│ ├── ce_coll.cc
│ ├── channel.cc
│ ├── collectives.cc
│ ├── commDump.cc
│ ├── debug.cc
│ ├── dev_runtime.cc
│ ├── device/
│ │ ├── CMakeLists.txt
│ │ ├── Makefile
│ │ ├── all_gather.h
│ │ ├── all_reduce.h
│ │ ├── alltoall_gda.h
│ │ ├── alltoall_pivot.h
│ │ ├── broadcast.h
│ │ ├── common.cu
│ │ ├── common.h
│ │ ├── common_kernel.h
│ │ ├── generate.py
│ │ ├── msccl_kernel_impl.h
│ │ ├── network/
│ │ │ └── unpack/
│ │ │ ├── unpack.h
│ │ │ └── unpack_defs.h
│ │ ├── onerank.cu
│ │ ├── op128.h
│ │ ├── primitives.h
│ │ ├── prims_ll.h
│ │ ├── prims_ll128.h
│ │ ├── prims_simple.h
│ │ ├── rccl_metadata.h
│ │ ├── rccl_ptr.h
│ │ ├── reduce.h
│ │ ├── reduce_kernel.h
│ │ ├── reduce_scatter.h
│ │ ├── sendrecv.h
│ │ └── symmetric/
│ │ ├── all_gather.cuh
│ │ ├── all_reduce.cuh
│ │ ├── generate.py
│ │ ├── kernel.cuh
│ │ ├── primitives.cuh
│ │ └── reduce_scatter.cuh
│ ├── enhcompat.cc
│ ├── enqueue.cc
│ ├── graph/
│ │ ├── CMakeLists.txt
│ │ ├── connect.cc
│ │ ├── paths.cc
│ │ ├── rings.cc
│ │ ├── rings.h
│ │ ├── rome_models.cc
│ │ ├── rome_models.h
│ │ ├── search.cc
│ │ ├── topo.cc
│ │ ├── topo.h
│ │ ├── trees.cc
│ │ ├── tuning.cc
│ │ ├── xml.cc
│ │ └── xml.h
│ ├── group.cc
│ ├── include/
│ │ ├── BfdBacktrace.hpp
│ │ ├── alloc.h
│ │ ├── allocator.h
│ │ ├── alt_rsmi.h
│ │ ├── amdsmi_wrap.h
│ │ ├── api_trace.h
│ │ ├── archinfo.h
│ │ ├── argcheck.h
│ │ ├── bitops.h
│ │ ├── bootstrap.h
│ │ ├── ce_coll.h
│ │ ├── channel.h
│ │ ├── checks.h
│ │ ├── coll_net.h
│ │ ├── collectives.h
│ │ ├── comm.h
│ │ ├── core.h
│ │ ├── cpuset.h
│ │ ├── cudawrap.h
│ │ ├── debug.h
│ │ ├── dev_runtime.h
│ │ ├── device.h
│ │ ├── enqueue.h
│ │ ├── gdrwrap.h
│ │ ├── git_version.h
│ │ ├── graph.h
│ │ ├── group.h
│ │ ├── hip_rocm_version_info.h
│ │ ├── ibvcore.h
│ │ ├── ibvsymbols.h
│ │ ├── ibvwrap.h
│ │ ├── info.h
│ │ ├── ionic/
│ │ │ ├── ionicdvcore.h
│ │ │ ├── ionicdvsymbols.h
│ │ │ └── ionicdvwrap.h
│ │ ├── ipcsocket.h
│ │ ├── latency_profiler/
│ │ │ ├── CollTrace.h
│ │ │ ├── CollTraceEvent.h
│ │ │ ├── CollTraceFunc.h
│ │ │ ├── CollTraceUtils.h
│ │ │ ├── EventQueue.h
│ │ │ └── MIT-LICENSE.txt
│ │ ├── mlx5/
│ │ │ ├── mlx5dvcore.h
│ │ │ ├── mlx5dvsymbols.h
│ │ │ └── mlx5dvwrap.h
│ │ ├── mnnvl.h
│ │ ├── msccl/
│ │ │ ├── msccl_kernel.h
│ │ │ ├── msccl_lifecycle.h
│ │ │ ├── msccl_parser.h
│ │ │ ├── msccl_scheduler.h
│ │ │ ├── msccl_setup.h
│ │ │ ├── msccl_status.h
│ │ │ └── msccl_struct.h
│ │ ├── mscclpp/
│ │ │ └── mscclpp_nccl.h
│ │ ├── nccl_common.h
│ │ ├── nccl_device/
│ │ │ ├── README.md
│ │ │ ├── comm.h
│ │ │ ├── coop.h
│ │ │ ├── core.h
│ │ │ ├── impl/
│ │ │ │ ├── comm__funcs.h
│ │ │ │ ├── comm__types.h
│ │ │ │ ├── core__funcs.h
│ │ │ │ ├── core__types.h
│ │ │ │ ├── ll_a2a__funcs.h
│ │ │ │ ├── ll_a2a__types.h
│ │ │ │ ├── mem_barrier__funcs.h
│ │ │ │ ├── mem_barrier__types.h
│ │ │ │ ├── ptr__funcs.h
│ │ │ │ └── ptr__types.h
│ │ │ ├── ll_a2a.h
│ │ │ ├── mem_barrier.h
│ │ │ ├── ptr.h
│ │ │ └── utility.h
│ │ ├── nccl_device.h
│ │ ├── net.h
│ │ ├── net_device.h
│ │ ├── npkit/
│ │ │ ├── npkit.h
│ │ │ ├── npkit_event.h
│ │ │ └── npkit_struct.h
│ │ ├── nvmlwrap.h
│ │ ├── nvtx.h
│ │ ├── nvtx3/
│ │ │ ├── nvToolsExt.h
│ │ │ ├── nvToolsExtCounters.h
│ │ │ ├── nvToolsExtCuda.h
│ │ │ ├── nvToolsExtCudaRt.h
│ │ │ ├── nvToolsExtMem.h
│ │ │ ├── nvToolsExtMemCudaRt.h
│ │ │ ├── nvToolsExtOpenCL.h
│ │ │ ├── nvToolsExtPayload.h
│ │ │ ├── nvToolsExtPayloadHelper.h
│ │ │ ├── nvToolsExtSemanticsCounters.h
│ │ │ ├── nvToolsExtSemanticsScope.h
│ │ │ ├── nvToolsExtSync.h
│ │ │ ├── nvtx3.hpp
│ │ │ └── nvtxDetail/
│ │ │ ├── nvtxExtHelperMacros.h
│ │ │ ├── nvtxExtImpl.h
│ │ │ ├── nvtxExtImplCounters_v1.h
│ │ │ ├── nvtxExtImplMemCudaRt_v1.h
│ │ │ ├── nvtxExtImplMem_v1.h
│ │ │ ├── nvtxExtImplPayload_v1.h
│ │ │ ├── nvtxExtInit.h
│ │ │ ├── nvtxExtPayloadHelperInternal.h
│ │ │ ├── nvtxExtPayloadTypeInfo.h
│ │ │ ├── nvtxExtTypes.h
│ │ │ ├── nvtxImpl.h
│ │ │ ├── nvtxImplCore.h
│ │ │ ├── nvtxImplCudaRt_v3.h
│ │ │ ├── nvtxImplCuda_v3.h
│ │ │ ├── nvtxImplOpenCL_v3.h
│ │ │ ├── nvtxImplSync_v3.h
│ │ │ ├── nvtxInit.h
│ │ │ ├── nvtxInitDecls.h
│ │ │ ├── nvtxInitDefs.h
│ │ │ ├── nvtxLinkOnce.h
│ │ │ └── nvtxTypes.h
│ │ ├── nvtx_payload_schemas.h
│ │ ├── nvtx_stub.h
│ │ ├── p2p.h
│ │ ├── param.h
│ │ ├── plugin/
│ │ │ ├── nccl_net.h
│ │ │ ├── nccl_profiler.h
│ │ │ ├── nccl_tuner.h
│ │ │ ├── net/
│ │ │ │ ├── net_v10.h
│ │ │ │ ├── net_v11.h
│ │ │ │ ├── net_v6.h
│ │ │ │ ├── net_v7.h
│ │ │ │ ├── net_v8.h
│ │ │ │ └── net_v9.h
│ │ │ ├── plugin.h
│ │ │ ├── profiler/
│ │ │ │ ├── net_ib.h
│ │ │ │ ├── net_ib_v1.h
│ │ │ │ ├── net_socket.h
│ │ │ │ ├── net_socket_v1.h
│ │ │ │ ├── profiler_v1.h
│ │ │ │ ├── profiler_v2.h
│ │ │ │ ├── profiler_v3.h
│ │ │ │ ├── profiler_v4.h
│ │ │ │ └── profiler_v5.h
│ │ │ └── tuner/
│ │ │ ├── tuner_v2.h
│ │ │ ├── tuner_v3.h
│ │ │ ├── tuner_v4.h
│ │ │ └── tuner_v5.h
│ │ ├── profiler.h
│ │ ├── proxy.h
│ │ ├── proxy_trace/
│ │ │ └── proxy_trace.h
│ │ ├── ras.h
│ │ ├── rccl_common.h
│ │ ├── rccl_float8.h
│ │ ├── rccl_vars.h
│ │ ├── recorder.h
│ │ ├── register.h
│ │ ├── register_inline.h
│ │ ├── rocm_smi_wrap.h
│ │ ├── rocmwrap.h
│ │ ├── roctx.h
│ │ ├── scheduler.h
│ │ ├── shm.h
│ │ ├── shmutils.h
│ │ ├── signals.h
│ │ ├── socket.h
│ │ ├── strongstream.h
│ │ ├── sym_kernels.h
│ │ ├── timer.h
│ │ ├── transport.h
│ │ ├── trees.h
│ │ ├── tuner.h
│ │ └── utils.h
│ ├── init.cc
│ ├── init_nvtx.cc
│ ├── misc/
│ │ ├── CMakeLists.txt
│ │ ├── alt_rsmi.cc
│ │ ├── amdsmi_wrap.cc
│ │ ├── api_trace.c
│ │ ├── api_trace.cc
│ │ ├── archinfo.cc
│ │ ├── argcheck.cc
│ │ ├── cudawrap.cc
│ │ ├── gdrwrap.cc
│ │ ├── ibvsymbols.cc
│ │ ├── ibvwrap.cc
│ │ ├── ionicdvsymbols.cc
│ │ ├── ionicdvwrap.cc
│ │ ├── ipcsocket.cc
│ │ ├── latency_profiler/
│ │ │ ├── CollTrace.cc
│ │ │ ├── CollTraceEvent.cc
│ │ │ ├── CollTraceFunc.cc
│ │ │ ├── CollTraceUtils.cc
│ │ │ └── MIT-LICENSE.txt
│ │ ├── mlx5dvsymbols.cc
│ │ ├── mlx5dvwrap.cc
│ │ ├── msccl/
│ │ │ ├── msccl_lifecycle.cc
│ │ │ ├── msccl_parser.cc
│ │ │ ├── msccl_setup.cc
│ │ │ └── msccl_status.cc
│ │ ├── mscclpp/
│ │ │ ├── mscclpp_nccl.cc
│ │ │ └── mscclpp_nccl_syms.txt
│ │ ├── npkit.cc
│ │ ├── nvmlwrap.cc
│ │ ├── nvmlwrap_stub.cc
│ │ ├── param.cc
│ │ ├── proxy_trace/
│ │ │ └── proxy_trace.cc
│ │ ├── recorder.cc
│ │ ├── rocm_smi_wrap.cc
│ │ ├── rocmwrap.cc
│ │ ├── roctx.cc
│ │ ├── shmutils.cc
│ │ ├── signals.cc
│ │ ├── socket.cc
│ │ ├── strongstream.cc
│ │ └── utils.cc
│ ├── mnnvl.cc
│ ├── msccl.cc
│ ├── nccl.h.in
│ ├── nccl.pc.in
│ ├── nccl_device/
│ │ ├── CMakeLists.txt
│ │ ├── core.cc
│ │ ├── ll_a2a.cc
│ │ └── mem_barrier.cc
│ ├── plugin/
│ │ ├── CMakeLists.txt
│ │ ├── net/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── net_v10.cc
│ │ │ ├── net_v11.cc
│ │ │ ├── net_v6.cc
│ │ │ ├── net_v7.cc
│ │ │ ├── net_v8.cc
│ │ │ └── net_v9.cc
│ │ ├── net.cc
│ │ ├── plugin_open.cc
│ │ ├── profiler/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── profiler_v1.cc
│ │ │ ├── profiler_v2.cc
│ │ │ ├── profiler_v3.cc
│ │ │ ├── profiler_v4.cc
│ │ │ └── profiler_v5.cc
│ │ ├── profiler.cc
│ │ ├── tuner/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── tuner_v2.cc
│ │ │ ├── tuner_v3.cc
│ │ │ ├── tuner_v4.cc
│ │ │ └── tuner_v5.cc
│ │ └── tuner.cc
│ ├── proxy.cc
│ ├── ras/
│ │ ├── CMakeLists.txt
│ │ ├── client.cc
│ │ ├── client_support.cc
│ │ ├── collectives.cc
│ │ ├── peers.cc
│ │ ├── ras.cc
│ │ ├── ras_internal.h
│ │ └── rasnet.cc
│ ├── rccl_wrap.cc
│ ├── register/
│ │ ├── CMakeLists.txt
│ │ ├── coll_reg.cc
│ │ ├── register.cc
│ │ └── sendrecv_reg.cc
│ ├── scheduler/
│ │ ├── CMakeLists.txt
│ │ └── symmetric_sched.cc
│ ├── sym_kernels.cc
│ ├── transport/
│ │ ├── CMakeLists.txt
│ │ ├── coll_net.cc
│ │ ├── generic.cc
│ │ ├── net.cc
│ │ ├── net_ib.cc
│ │ ├── net_socket.cc
│ │ ├── nvls.cc
│ │ ├── p2p.cc
│ │ ├── profiler.cc
│ │ └── shm.cc
│ └── transport.cc
├── test/
│ ├── AllGatherTests.cpp
│ ├── AllReduceTests.cpp
│ ├── AllToAllTests.cpp
│ ├── AllToAllVTests.cpp
│ ├── AllocTests.cpp
│ ├── AltRsmiTests.cpp
│ ├── ArgCheckTests.cpp
│ ├── BitOpsTests.cpp
│ ├── BroadcastTests.cpp
│ ├── CMakeLists.txt
│ ├── CommTests.cpp
│ ├── EnqueueTests.cpp
│ ├── GatherTests.cpp
│ ├── GroupCallTests.cpp
│ ├── IpcsocketTests.cpp
│ ├── NetSocketTests.cpp
│ ├── NonBlockingTests.cpp
│ ├── ParamTests.cpp
│ ├── ParamTestsConfFile.txt
│ ├── ProxyTests.cpp
│ ├── README.md
│ ├── RcclWrapTests.cpp
│ ├── ReduceScatterTests.cpp
│ ├── ReduceTests.cpp
│ ├── RegisterTests.cpp
│ ├── ScatterTests.cpp
│ ├── SendRecvTests.cpp
│ ├── StandaloneTests.cpp
│ ├── TransportTests.cpp
│ ├── _RecorderTests.cpp
│ ├── common/
│ │ ├── CallCollectiveForked.cpp
│ │ ├── CallCollectiveForked.hpp
│ │ ├── CollectiveArgs.cpp
│ │ ├── CollectiveArgs.hpp
│ │ ├── DeviceBufferHelpers.hpp
│ │ ├── EnvVars.cpp
│ │ ├── EnvVars.hpp
│ │ ├── ErrCode.hpp
│ │ ├── MPIEnvironment.cpp
│ │ ├── MPIEnvironment.hpp
│ │ ├── MPIHelpers.cpp
│ │ ├── MPIHelpers.hpp
│ │ ├── MPIStandaloneTest.hpp
│ │ ├── MPITestBase.hpp
│ │ ├── MPITestCore.cpp
│ │ ├── MPITestCore.hpp
│ │ ├── MPITestRunner.md
│ │ ├── PrepDataFuncs.cpp
│ │ ├── PrepDataFuncs.hpp
│ │ ├── ProcessIsolatedTestRunner.cpp
│ │ ├── ProcessIsolatedTestRunner.hpp
│ │ ├── ProcessIsolatedTestRunner.md
│ │ ├── PtrUnion.cpp
│ │ ├── PtrUnion.hpp
│ │ ├── RcclMockFuncs.hpp
│ │ ├── ResourceGuards.hpp
│ │ ├── StandaloneUtils.cpp
│ │ ├── StandaloneUtils.hpp
│ │ ├── TestBed.cpp
│ │ ├── TestBed.hpp
│ │ ├── TestBedChild.cpp
│ │ ├── TestBedChild.hpp
│ │ ├── TestChecks.cpp
│ │ ├── TestChecks.hpp
│ │ ├── TransportUtils.hpp
│ │ ├── main.cpp
│ │ ├── main_fixtures.cpp
│ │ └── main_mpi.cpp
│ ├── ext-plugins/
│ │ ├── .gitignore
│ │ ├── README.md
│ │ ├── assets/
│ │ │ └── csv_confs/
│ │ │ ├── incorrect_values_config.conf
│ │ │ ├── multinode_config.conf
│ │ │ ├── no_matching_config.conf
│ │ │ ├── singlenode_config.conf
│ │ │ ├── unsupported_algo_proto_config.conf
│ │ │ ├── valid_config_with_wildcards.conf
│ │ │ └── valid_config_without_wildcards.conf
│ │ ├── pytest.ini
│ │ ├── requirements.txt
│ │ └── tests/
│ │ ├── conftest.py
│ │ ├── ext-profiler/
│ │ │ ├── test_allgather.py
│ │ │ ├── test_allreduce.py
│ │ │ ├── test_alltoall.py
│ │ │ ├── test_broadcast.py
│ │ │ ├── test_reduce.py
│ │ │ ├── test_reducescatter.py
│ │ │ └── test_sendrecv.py
│ │ └── ext-tuner/
│ │ ├── test_allgather.py
│ │ ├── test_allreduce.py
│ │ ├── test_broadcast.py
│ │ ├── test_reduce.py
│ │ └── test_reducescatter.py
│ ├── graph/
│ │ └── XmlTests.cpp
│ ├── latency_profiler/
│ │ └── LatencyProfilerUnitTest.cpp
│ ├── proxy_trace/
│ │ └── ProxyTraceUnitTests.cpp
│ └── transport/
│ ├── NetIbMPITests.cpp
│ ├── NetMPITests.cpp
│ ├── P2pMPITests.cpp
│ ├── ShmMPITests.cpp
│ ├── TransportMPIBase.cpp
│ └── TransportMPIBase.hpp
├── toolchain-linux.cmake
└── tools/
├── EmptyKernelTest/
│ ├── EmptyKernelTest.cpp
│ ├── Makefile
│ └── run.sh
├── GraphBench/
│ ├── GraphBench.cpp
│ └── Makefile
├── HelloRccl/
│ ├── HelloRccl.cpp
│ ├── HelloRccl.hpp
│ ├── Makefile
│ └── runTest.sh
├── JitterBench/
│ ├── Common.hpp
│ ├── Compatibility.hpp
│ ├── GetClosestNumaNode.hpp
│ ├── JitterBench.cpp
│ ├── Makefile
│ ├── Timeline.hpp
│ └── runSweep.sh
├── RcclReplayer/
│ ├── Makefile
│ ├── README.md
│ ├── rcclReplayer.cpp
│ ├── rcclReplayer.hpp
│ └── replay_log_converter.py
├── TopoVisual/
│ ├── README.md
│ ├── extract_topo.awk
│ └── topo_visual.sh
├── TransferBench/
│ └── README.md
├── ib-test/
│ ├── Makefile
│ ├── ib_test.cpp
│ ├── include/
│ │ └── nccl.h
│ └── utils.cpp
├── msccl-algorithms/
│ ├── allgather_16n_direct_0_3m_ll128.xml
│ ├── allgather_16n_direct_0_3m_ll128_op.xml
│ ├── allgather_32n_direct_0_6m_ll128.xml
│ ├── allgather_32n_direct_0_6m_ll128_op.xml
│ ├── allreduce-allpairs-8n-ll-32tb-op.xml
│ ├── allreduce-allpairs-8n-ll-32tb.xml
│ ├── allreduce-allpairs-8n-ll-64tb-op.xml
│ ├── allreduce-allpairs-8n-ll-64tb.xml
│ ├── allreduce-allpairs-8n-simple-op.xml
│ ├── allreduce-allpairs-8n-simple.xml
│ ├── alltoall-8n-0-9kb.xml
│ ├── alltoall-8n-190kb-512kb.xml
│ ├── alltoall-8n-512kb-7mb.xml
│ ├── alltoall-8n-7mb-43mb.xml
│ └── alltoall-8n-9kb-190kb.xml
├── msccl-unit-test-algorithms/
│ ├── all-reduce-ring-ll.xml
│ ├── all-reduce-ring-ll128.xml
│ └── all-reduce-ring-simple.xml
├── p2p-latency-test/
│ ├── Makefile
│ ├── README.md
│ ├── build_and_run.sh
│ ├── ll_latency_test.cpp
│ ├── ll_latency_test.cu
│ └── p2p_latency_test.cpp
├── rccl-prim-test/
│ ├── Makefile
│ ├── copy_kernel.h
│ └── rccl_prim_test.cpp
├── scripts/
│ ├── exclude_static_list.txt
│ ├── npkit_trace_analysis.py
│ ├── npkit_trace_generator.py
│ ├── pytorch-all-reduce/
│ │ ├── README.md
│ │ ├── all_reduce.py
│ │ └── trace_runs.sh
│ ├── pytorch-log-parser.py
│ ├── rcclDiagnostics.py
│ ├── rccl_bw_test.py
│ ├── rocprof-log-parser.py
│ ├── test_runner/
│ │ ├── README.md
│ │ ├── configs/
│ │ │ ├── mi300x_mellanox_ib.json
│ │ │ ├── rccl_perf_tests.json
│ │ │ └── test_config_sample.json
│ │ ├── lib/
│ │ │ ├── __init__.py
│ │ │ ├── test_config.py
│ │ │ ├── test_executor.py
│ │ │ └── test_parser.py
│ │ └── test_runner.py
│ ├── topo_val.sh
│ └── ucx_ompi_rccl_rccltests_TB_script.sh
├── time-trace/
│ ├── rccl-TimeTrace.sh
│ └── time_trace_generator.py
└── topo_expl/
├── Makefile
├── README.md
├── include/
│ ├── device_table.h
│ ├── model.h
│ ├── nccl.h
│ └── utils.h
├── model.cpp
├── models/
│ ├── topo_16p1h.xml
│ ├── topo_16p1h_vm.xml
│ ├── topo_16p_gio-1s-1rp-cascade.xml
│ ├── topo_16p_gio-3s-1rp-split-flat.xml
│ ├── topo_3p_pcie.xml
│ ├── topo_3p_pcie_1.xml
│ ├── topo_4p1h.xml
│ ├── topo_4p1h_1.xml
│ ├── topo_4p2h.xml
│ ├── topo_4p2h_1.xml
│ ├── topo_4p2h_2nic.xml
│ ├── topo_4p3l.xml
│ ├── topo_4p3l_2h.xml
│ ├── topo_4p3l_ia.xml
│ ├── topo_4p3l_n2.xml
│ ├── topo_4p3l_n2_1.xml
│ ├── topo_4p3l_n4.xml
│ ├── topo_4p4h.xml
│ ├── topo_4p_942.xml
│ ├── topo_8p1h.xml
│ ├── topo_8p1h_1.xml
│ ├── topo_8p1h_2.xml
│ ├── topo_8p1h_3.xml
│ ├── topo_8p1h_4.xml
│ ├── topo_8p1h_5.xml
│ ├── topo_8p1h_n1.xml
│ ├── topo_8p6l.xml
│ ├── topo_8p6l_1nic.xml
│ ├── topo_8p6l_2nic.xml
│ ├── topo_8p6l_3nic.xml
│ ├── topo_8p6l_4nic.xml
│ ├── topo_8p6l_5nic.xml
│ ├── topo_8p6l_6nic.xml
│ ├── topo_8p_4nics.xml
│ ├── topo_8p_90a.xml
│ ├── topo_8p_90a_1.xml
│ ├── topo_8p_942.xml
│ ├── topo_8p_942vm.xml
│ ├── topo_8p_950.xml
│ ├── topo_8p_pcie.xml
│ ├── topo_8p_pcie_1.xml
│ ├── topo_8p_pcie_2nic.xml
│ ├── topo_8p_rome.xml
│ ├── topo_8p_rome_4n_1.xml
│ ├── topo_8p_rome_4n_2.xml
│ ├── topo_8p_rome_4nics.xml
│ ├── topo_8p_rome_n2.xml
│ ├── topo_8p_rome_n2_1.xml
│ ├── topo_8p_rome_n2_2.xml
│ ├── topo_8p_rome_n4.xml
│ ├── topo_8p_rome_n4_1.xml
│ ├── topo_8p_rome_pcie.xml
│ ├── topo_8p_rome_vm1.xml
│ ├── topo_8p_ts1.xml
│ ├── topo_8p_ts1_1.xml
│ ├── topo_8p_ts1_n4.xml
│ ├── topo_8p_ts1_n4_1.xml
│ ├── topo_8p_ts1_n4_2.xml
│ ├── topo_collnet_n1.xml
│ └── topo_collnet_n4.xml
├── topo_expl.cpp
└── utils.cpp
================================================
FILE CONTENTS
================================================
================================================
FILE: .azuredevops/multinode-ci-nightly.yml
================================================
# Nightly pipeline for RCCL: builds with tests enabled, runs the rccl-UnitTests
# binary, then runs every pytest script named in the pytestList parameter.
# Shared step templates and global variables come from the ROCm/ROCm repository,
# referenced here under the alias "pipelines_repo".
resources:
repositories:
- repository: pipelines_repo
type: github
endpoint: ROCm
name: ROCm/ROCm
# pytestFolder is the repository-relative directory the pytest steps run in.
variables:
- group: common
- template: /.azuredevops/variables-global.yml@pipelines_repo
- name: pytestFolder
value: '.azuredevops/tests/pytest'
# pytestList holds script base names (without ".py"); one test step is generated per entry.
parameters:
- name: pytestList
type: object
default:
- HelloWorld
# Push and pull-request triggers are disabled; only the schedules below start this pipeline.
trigger: none
pr: none
# Two schedules keep the run at 11 PM US Central time across the CST/CDT change
# (see the inline comments on each cron entry). Both watch the develop branch.
# "always: false" leaves the run conditional on source changes since the last
# successful scheduled run (Azure Pipelines schedule semantics).
# NOTE(review): "11-3" is a wrap-around month range - confirm the scheduler
# interprets it as November through March rather than rejecting or inverting it.
schedules:
- cron: "0 5 * 11-3 *" # 11 PM CST (November - March)
displayName: "Nightly Build (CST)"
branches:
include:
- develop
always: false
- cron: "0 4 * 4-10 *" # 11 PM CDT (April - October)
displayName: "Nightly Build (CDT)"
branches:
include:
- develop
always: false
# Single job on the rocm-ci_rccl_pool agent pool, limited to 180 minutes, with a
# fully cleaned workspace.
jobs:
- job: rccl
timeoutInMinutes: 180
pool: rocm-ci_rccl_pool
workspace:
clean: all
steps:
# Delete any files matching '**/*' before checking out (no SourceFolder is given,
# so the task's default folder applies).
- task: DeleteFiles@1
inputs:
Contents: '**/*'
# Check out the sources, including submodules recursively.
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml@pipelines_repo
parameters:
submoduleBehaviour: recursive
# CMake build via the shared template: Release configuration, unit tests enabled,
# Ninja generator; the install step is disabled.
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml@pipelines_repo
parameters:
installEnabled: false
printDiskSpace: false
extraBuildFlags: >-
-DCMAKE_BUILD_TYPE=Release
-DBUILD_TESTS=ON
-GNinja
# Run rccl-UnitTests from build/test with the freshly built library directory
# prepended to LD_LIBRARY_PATH; results are written as gtest XML to test_output.xml.
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml@pipelines_repo
parameters:
componentName: rccl
testDir: $(Build.SourcesDirectory)/build/test
testExecutable: 'LD_LIBRARY_PATH=$(Build.SourcesDirectory)/build:${LD_LIBRARY_PATH} NCCL_DEBUG=INFO RCCL_ENABLE_SIGNALHANDLER=1 ./rccl-UnitTests'
testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
# Template-time loop: emits one inline Bash step per pytestList entry that runs
# "pytest <entry>.py" inside pytestFolder. continueOnError lets the job proceed
# when a script fails.
- ${{ each pytestScript in parameters.pytestList }}:
- task: Bash@3
displayName: Test ${{ pytestScript }}
continueOnError: true
inputs:
targetType: inline
workingDirectory: $(Build.SourcesDirectory)/$(pytestFolder)
script: pytest ${{ pytestScript }}.py
================================================
FILE: .azuredevops/multinode-ci-pr.yml
================================================
# Pull-request pipeline for RCCL: builds for gfx942 with tests enabled, runs the
# rccl-UnitTests binary, then runs every pytest script named in pytestList.
# Shared step templates and global variables come from the ROCm/ROCm repository,
# referenced here under the alias "pipelines_repo".
resources:
repositories:
- repository: pipelines_repo
type: github
endpoint: ROCm
name: ROCm/ROCm
# pytestFolder is the repository-relative directory the pytest steps run in.
variables:
- group: common
- template: /.azuredevops/variables-global.yml@pipelines_repo
- name: pytestFolder
value: '.azuredevops/tests/pytest'
# pytestList holds script base names (without ".py"); one test step is generated per entry.
parameters:
- name: pytestList
type: object
default:
- HelloWorld
# Push triggers are disabled; the pipeline runs for pull requests targeting develop.
trigger: none
# autoCancel cancels an in-progress run when the PR receives new commits.
# Changes confined to the excluded paths (CI metadata, docs, markdown, license
# files) do not start a run, and draft PRs are ignored (drafts: false).
pr:
autoCancel: true
branches:
include:
- develop
paths:
exclude:
- .github
- .jenkins
- docs
- '*.md'
- LICENSE.txt
- NOTICES.txt
drafts: false
stages:
- stage: rcclStage
displayName: 'RCCL develop PR'
jobs:
# Deployment job bound to the "rccl" environment; per its display name it serves
# as the manual-approval gate for the run.
# NOTE(review): the "rccl" job below declares no dependsOn on this job - confirm
# the environment check gates the build as intended.
- deployment: rccl_pr_approval
displayName: "CI Run Requires Approval"
environment: rccl
# Build-and-test job on the rocm-ci_rccl_pool agent pool, limited to 180 minutes,
# with a fully cleaned workspace.
- job: rccl
timeoutInMinutes: 180
pool: rocm-ci_rccl_pool
workspace:
clean: all
steps:
# Delete any files matching '**/*' before checking out (no SourceFolder is given,
# so the task's default folder applies).
- task: DeleteFiles@1
inputs:
Contents: '**/*'
# Check out the sources, including submodules recursively.
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml@pipelines_repo
parameters:
submoduleBehaviour: recursive
# CMake build via the shared template: Release configuration, unit tests enabled,
# GPU target restricted to gfx942, Ninja generator; the install step is disabled.
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml@pipelines_repo
parameters:
installEnabled: false
printDiskSpace: false
extraBuildFlags: >-
-DCMAKE_BUILD_TYPE=Release
-DBUILD_TESTS=ON
-DGPU_TARGETS=gfx942
-GNinja
# Run rccl-UnitTests from build/test with the freshly built library directory
# prepended to LD_LIBRARY_PATH; results are written as gtest XML to test_output.xml.
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml@pipelines_repo
parameters:
componentName: rccl
testDir: $(Build.SourcesDirectory)/build/test
testExecutable: 'LD_LIBRARY_PATH=$(Build.SourcesDirectory)/build:${LD_LIBRARY_PATH} NCCL_DEBUG=INFO RCCL_ENABLE_SIGNALHANDLER=1 ./rccl-UnitTests'
testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes'
# Template-time loop: emits one inline Bash step per pytestList entry that runs
# "pytest <entry>.py" inside pytestFolder. continueOnError lets the job proceed
# when a script fails.
- ${{ each pytestScript in parameters.pytestList }}:
- task: Bash@3
displayName: Test ${{ pytestScript }}
continueOnError: true
inputs:
targetType: inline
workingDirectory: $(Build.SourcesDirectory)/$(pytestFolder)
script: pytest ${{ pytestScript }}.py
================================================
FILE: .azuredevops/multinode-ci-slurm-nightly.yml
================================================
# Nightly Slurm-based pipeline for RCCL: checks out the sources, then delegates
# build, unit-test and rccl-tests execution to the local step templates under
# .azuredevops/templates/.
# The checkout template and global variables come from the ROCm/ROCm repository,
# referenced here under the alias "pipelines_repo".
resources:
repositories:
- repository: pipelines_repo
type: github
endpoint: ROCm
name: ROCm/ROCm
variables:
- group: common
- template: /.azuredevops/variables-global.yml@pipelines_repo
# Push and pull-request triggers are disabled; only the schedules below start this pipeline.
trigger: none
pr: none
# Two schedules keep the run at 11 PM US Central time across the CST/CDT change
# (see the inline comments on each cron entry). Both watch the develop branch.
# "always: false" leaves the run conditional on source changes since the last
# successful scheduled run (Azure Pipelines schedule semantics).
# NOTE(review): "11-3" is a wrap-around month range - confirm the scheduler
# interprets it as November through March rather than rejecting or inverting it.
schedules:
- cron: "0 5 * 11-3 *" # 11 PM CST (November - March)
displayName: "Nightly Build (CST)"
branches:
include:
- develop
always: false
- cron: "0 4 * 4-10 *" # 11 PM CDT (April - October)
displayName: "Nightly Build (CDT)"
branches:
include:
- develop
always: false
# Single job on the rocm-ci_rccl_slurm_pool agent pool, limited to 240 minutes,
# with a fully cleaned workspace.
jobs:
- job: rccl
timeoutInMinutes: 240
pool: rocm-ci_rccl_slurm_pool
workspace:
clean: all
steps:
# Delete any files matching '**/*' before checking out (no SourceFolder is given,
# so the task's default folder applies).
- task: DeleteFiles@1
inputs:
Contents: '**/*'
# Check out the sources, including submodules recursively.
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml@pipelines_repo
parameters:
submoduleBehaviour: recursive
# Repository-local step templates, run in order: build, unit tests, rccl-tests.
- template: templates/build.yml
- template: templates/test_rccl-UnitTests.yml
- template: templates/test_rccl-tests.yml
================================================
FILE: .azuredevops/multinode-ci-slurm-pr.yml
================================================
# Pull-request Slurm-based pipeline for RCCL: checks out the sources, then
# delegates build, unit-test and rccl-tests execution to the local step templates
# under .azuredevops/templates/.
# The checkout template and global variables come from the ROCm/ROCm repository,
# referenced here under the alias "pipelines_repo".
resources:
repositories:
- repository: pipelines_repo
type: github
endpoint: ROCm
name: ROCm/ROCm
variables:
- group: common
- template: /.azuredevops/variables-global.yml@pipelines_repo
# Push triggers are disabled; the pipeline runs for pull requests targeting develop.
trigger: none
# autoCancel cancels an in-progress run when the PR receives new commits.
# Changes confined to the excluded paths (CI metadata, docs, markdown, license
# files) do not start a run, and draft PRs are ignored (drafts: false).
pr:
autoCancel: true
branches:
include:
- develop
paths:
exclude:
- .github
- .jenkins
- docs
- '*.md'
- LICENSE.txt
- NOTICES.txt
drafts: false
stages:
- stage: rcclStage
displayName: 'RCCL develop PR'
jobs:
# Deployment job bound to the "rccl" environment; per its display name it serves
# as the manual-approval gate for the run.
# NOTE(review): the "rccl" job below declares no dependsOn on this job - confirm
# the environment check gates the build as intended.
- deployment: rccl_pr_approval
displayName: "CI Run Requires Approval"
environment: rccl
# Build-and-test job on the rocm-ci_rccl_slurm_pool agent pool, limited to 240
# minutes, with a fully cleaned workspace.
- job: rccl
timeoutInMinutes: 240
pool: rocm-ci_rccl_slurm_pool
workspace:
clean: all
steps:
# Delete any files matching '**/*' before checking out (no SourceFolder is given,
# so the task's default folder applies).
- task: DeleteFiles@1
inputs:
Contents: '**/*'
# Check out the sources, including submodules recursively.
- template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml@pipelines_repo
parameters:
submoduleBehaviour: recursive
# Repository-local step templates, run in order: build, unit tests, rccl-tests.
- template: templates/build.yml
- template: templates/test_rccl-UnitTests.yml
- template: templates/test_rccl-tests.yml
================================================
FILE: .azuredevops/rocm-ci.yml
================================================
# Main ROCm CI entry point for RCCL. All job logic lives in the rccl.yml component
# template of the ROCm/ROCm repository ("pipelines_repo"); this file only wires up
# repository references, triggers and the parameters forwarded to that template.
variables:
- group: common
- template: /.azuredevops/variables-global.yml@pipelines_repo
# pipelinesRepoRef / systemsRepoRef select which git ref of each external
# repository is used (see "resources" below). The remaining two parameters are
# passed through unchanged to the rccl.yml template.
parameters:
- name: pipelinesRepoRef
type: string
default: refs/heads/develop
- name: systemsRepoRef
type: string
default: refs/heads/develop
- name: systemsSparseCheckoutDir
type: string
default: 'projects/rocprofiler-sdk'
- name: triggerDownstreamJobs
type: boolean
default: true
# External repositories: ROCm/ROCm for the pipeline templates and
# ROCm/rocm-systems, exposed under the alias "systems_repo".
resources:
repositories:
- repository: pipelines_repo
type: github
endpoint: ROCm
name: ROCm/ROCm
ref: ${{ parameters.pipelinesRepoRef }}
- repository: systems_repo
type: github
endpoint: ROCm
name: ROCm/rocm-systems
ref: ${{ parameters.systemsRepoRef }}
# CI trigger for pushes to develop and mainline. "batch: true" folds pushes that
# arrive during a run into one follow-up run. Changes confined to the excluded
# paths (CI metadata, docs, dot-prefixed YAML files, markdown, license files) do
# not start a run.
trigger:
batch: true
branches:
include:
- develop
- mainline
paths:
exclude:
- .github
- .jenkins
- docs
- '.*.y*ml'
- '*.md'
- LICENSE.txt
- NOTICES.txt
# PR trigger for the same branches and the same path exclusions. autoCancel
# cancels an in-progress run when the PR receives new commits; draft PRs are ignored.
pr:
autoCancel: true
branches:
include:
- develop
- mainline
paths:
exclude:
- .github
- .jenkins
- docs
- '.*.y*ml'
- '*.md'
- LICENSE.txt
- NOTICES.txt
drafts: false
# Single stage whose jobs are defined entirely by the shared rccl.yml component template.
stages:
- stage: rccl
jobs:
- template: ${{ variables.CI_COMPONENT_PATH }}/rccl.yml@pipelines_repo
parameters:
sparseCheckoutDir: ''
systemsRepo: systems_repo
systemsSparseCheckoutDir: ${{ parameters.systemsSparseCheckoutDir }}
triggerDownstreamJobs: ${{ parameters.triggerDownstreamJobs }}
================================================
FILE: .azuredevops/slurm/build.sh
================================================
#!/bin/bash
#SBATCH --job-name=rccl-build
#SBATCH --output=rccl-build-%j.out
#SBATCH --error=rccl-build-%j.out
#SBATCH --time=60
#SBATCH --nodes=1
#SBATCH --exclusive
#SBATCH --partition=gt

# Slurm batch script: configure, build and install RCCL (with unit tests), build
# the RcclReplayer tool, then clone, build and install rccl-tests with MPI support.
#
# Environment read by this script:
#   BINARIES_DIR      install prefix for both RCCL and rccl-tests
#   ROCM_PATH         ROCm root handed to CMake/make (presumably exported by the
#                     rocm module loaded below - confirm)
#   MPI_HOME          MPI installation used by RcclReplayer and rccl-tests
#   SLURM_SUBMIT_DIR  source checkout root; when unset, the directory the script
#                     was started from is used instead

short_id=$(hostname | cut -d'.' -f1 | cut -d'-' -f3-)
echo "Node identifier: $short_id"

# Resolve the source root exactly once, before any `cd`. Re-evaluating
# "${SLURM_SUBMIT_DIR:-$PWD}" later would make the fallback point at whatever
# directory the script happens to be in at that moment.
SRC_ROOT="${SLURM_SUBMIT_DIR:-$PWD}"

source /etc/profile.d/lmod.sh
module load rocm/6.4.1

# Setup local binary path
export PATH="$HOME/.local/bin:$PATH"
mkdir -p "$HOME/.local/bin"

# Install Ninja if not already available
if ! command -v ninja &>/dev/null; then
    echo "Ninja not found. Installing locally..."
    wget -q https://github.com/ninja-build/ninja/releases/download/v1.11.1/ninja-linux.zip -O /tmp/ninja.zip
    unzip -q /tmp/ninja.zip -d "$HOME/.local/bin"
    chmod +x "$HOME/.local/bin/ninja"
fi

echo "Using Ninja at: $(which ninja)"
ninja --version

# Define GPU target
export GPU_TARGETS="gfx942"

cd "$SRC_ROOT" || { echo "ERROR: cannot enter source root '$SRC_ROOT'" >&2; exit 1; }

## Building RCCL
mkdir -p build
cd build || { echo "ERROR: cannot enter RCCL build directory" >&2; exit 1; }
cmake -G Ninja -DCMAKE_INSTALL_PREFIX="$BINARIES_DIR" -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS="${GPU_TARGETS}" -DBUILD_TESTS=ON -DROCM_PATH="$ROCM_PATH" ..
cmake --build .
cmake --build . --target install

# Building RCCL Replayer
# Runs in a subshell so a missing replayer directory or a failed make cannot
# leave the script in an unexpected working directory; as before, a replayer
# failure does not stop the rccl-tests build.
(
    { cd ../tools/RcclReplayer 2>/dev/null || cd ../RcclReplayer; } &&
        RCCL_DIR="../../build" ROCM_DIR="$ROCM_PATH" MPI_DIR="$MPI_HOME" make
) || echo "WARNING: RcclReplayer build failed or its directory was not found" >&2

cd "$SRC_ROOT" || { echo "ERROR: cannot return to source root '$SRC_ROOT'" >&2; exit 1; }

## Building RCCL-Tests
git clone https://github.com/ROCm/rccl-tests
# Abort if the checkout is missing: continuing would `cd build` into RCCL's own
# build tree and reconfigure it with the rccl-tests options.
cd rccl-tests || { echo "ERROR: rccl-tests checkout not available" >&2; exit 1; }
mkdir -p build
cd build || { echo "ERROR: cannot enter rccl-tests build directory" >&2; exit 1; }
cmake -DCMAKE_PREFIX_PATH="$BINARIES_DIR;$MPI_HOME" -DUSE_MPI=ON -DCMAKE_INSTALL_PREFIX="$BINARIES_DIR" -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS="${GPU_TARGETS}" -DROCM_PATH="$ROCM_PATH" ..
cmake --build .
cmake --build . --target install
================================================
FILE: .azuredevops/slurm/test_rccl-UnitTests.sh
================================================
#!/bin/bash
#SBATCH --job-name=rccl-UnitTests
#SBATCH --output=%x-%j.out
#SBATCH --error=%x-%j.out
#SBATCH --time=180
#SBATCH --nodes=1
#SBATCH --exclusive
#SBATCH --partition=gt

# Slurm batch script: runs the installed rccl-UnitTests binary on one exclusive
# node and writes a gtest XML report into $PIPELINE_WORKSPACE.
#
# Environment read by this script:
#   BINARIES_DIR        install prefix holding bin/rccl-UnitTests and lib/
#   PIPELINE_WORKSPACE  directory that receives rccl-UnitTests_output.xml

# Log which node the job landed on (hostname without domain, third
# dash-separated field onward).
node_tag=$(hostname | cut -d'.' -f1 | cut -d'-' -f3-)
printf 'Node identifier: %s\n' "$node_tag"

source /etc/profile.d/lmod.sh
module load rocm/6.4.1

cd "$BINARIES_DIR/bin"

# Runtime environment for the test binary: use the installed RCCL library,
# enable INFO-level NCCL logging and the RCCL signal handler.
export LD_LIBRARY_PATH="$BINARIES_DIR/lib:$LD_LIBRARY_PATH"
export NCCL_DEBUG=INFO
export RCCL_ENABLE_SIGNALHANDLER=1
export HSA_NO_SCRATCH_RECLAIM=1

# Last command of the script, so its exit status becomes the job's exit status.
./rccl-UnitTests \
    --gtest_output=xml:$PIPELINE_WORKSPACE/rccl-UnitTests_output.xml \
    --gtest_color=yes
================================================
FILE: .azuredevops/slurm/test_rccl-tests.sh
================================================
#!/bin/bash
#SBATCH --job-name=rccl-tests
#SBATCH --output=%x-%j.out
#SBATCH --error=%x-%j.out
#SBATCH --time=60
#SBATCH --nodes=1
#SBATCH --exclusive
#SBATCH --partition=gt

# Slurm batch script: runs the rccl-tests performance binaries (<coll>_perf) on a
# single node with 8 MPI ranks, once per collective and data type. Each run gets
# its own log file and a JSON result file under
# $PIPELINE_WORKSPACE/TestResults/rccl-tests_logs.
#
# Environment read by this script:
#   PIPELINE_WORKSPACE  root under which TestResults/ is created
#   BINARIES_DIR        install prefix holding bin/<coll>_perf and lib/
#   MPI_HOME            MPI installation providing bin/mpirun
#
# Exit status: 0 when every run succeeded, 1 when at least one run failed or the
# results directory could not be entered.

short_id=$(hostname | cut -d'.' -f1 | cut -d'-' -f3-)
echo "Node identifier: $short_id"

source /etc/profile.d/lmod.sh
module load rocm/6.4.1

# Create the log directory (and TestResults/ with it) BEFORE changing into it, so
# the script also works when TestResults/ does not exist yet.
export WORKDIR=${PIPELINE_WORKSPACE}/TestResults/rccl-tests_logs
mkdir -p "${WORKDIR}"
cd "${PIPELINE_WORKSPACE}/TestResults" || { echo "ERROR: cannot enter ${PIPELINE_WORKSPACE}/TestResults" >&2; exit 1; }

export PATH="$BINARIES_DIR/bin:$PATH"
export LD_LIBRARY_PATH="$BINARIES_DIR/lib:$LD_LIBRARY_PATH"

### create hostlist
#nodelist=($(scontrol show hostnames))
#echo "SLURM nodes:"
#echo ${nodelist[@]}
#echo ""
#
#hosts_8ppn=()
#for node in "${nodelist[@]}"
#do
# hosts_8ppn+=("${node}:8")
#done
#echo ${hosts_8ppn[@]}

# Collects "<coll>/<dtype> (exit <rc>)" for every run whose mpirun exited non-zero.
failed_runs=()

### Run multi- and single-node RCCL-Tests
## Run single-node RCCL-Tests
for n in 1
do
    # 8 ranks per node, one GPU per rank (-g 1 below).
    total=$((n*8))
    #h_8ppn=`echo ${hosts_8ppn[@]:0:${n}} | tr ' ' ','`
    for coll in all_reduce all_gather reduce_scatter alltoall alltoallv broadcast gather reduce scatter sendrecv
    do
        for dtype in float bfloat16 half fp8_e5m2
        do
            out_filename="${WORKDIR}/rccl-tests_${coll}_1KB-16GB_nodes${n}_gpus${total}_${dtype}.log"
            #cmd="${MPI_HOME}/bin/mpirun -np ${total} --host ${h_8ppn} -mca pml ob1 -mca btl ^openib -mca oob_tcp_if_exclude docker,lo -mca btl_tcp_if_exclude docker,lo -x PATH -x LD_LIBRARY_PATH -x NCCL_DEBUG=VERSION -x NCCL_IB_HCA=bnxt_re0,bnxt_re1,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re6,bnxt_re7 -x NCCL_IGNORE_CPU_AFFINITY=1 -x HSA_NO_SCRATCH_RECLAIM=1 -x NCCL_IB_GID_INDEX=3 ${BINARIES_DIR}/bin/${coll}_perf -b 1K -e 16G -f 2 -g 1 -n 100 -w 50 -d ${dtype} -Z json -x ${WORKDIR}/rccl-tests_${coll}_nodes${n}_gpus${total}_${dtype}.json"
            # Message sizes sweep 1K..16G in steps of x2 (-b/-e/-f), 100 timed
            # iterations after 50 warm-up iterations (-n/-w).
            cmd="${MPI_HOME}/bin/mpirun -np ${total} -mca pml ^ucx -mca osc ^ucx -mca btl ^openib -mca oob_tcp_if_exclude docker,lo -mca btl_tcp_if_exclude docker,lo -x PATH -x LD_LIBRARY_PATH -x NCCL_DEBUG=VERSION -x NCCL_IGNORE_CPU_AFFINITY=1 -x HSA_NO_SCRATCH_RECLAIM=1 ${BINARIES_DIR}/bin/${coll}_perf -b 1K -e 16G -f 2 -g 1 -n 100 -w 50 -d ${dtype} -Z json -x ${WORKDIR}/rccl-tests_${coll}_nodes${n}_gpus${total}_${dtype}.json"
            echo "Running ${coll}" 2>&1 | tee "${out_filename}"
            echo "Run cmd: ${cmd}" 2>&1 | tee -a "${out_filename}"
            eval "${cmd}" 2>&1 | tee -a "${out_filename}"
            # The pipeline's own status is tee's; PIPESTATUS[0] is the status of
            # the mpirun command, which is what decides pass/fail.
            rc=${PIPESTATUS[0]}
            if [ "${rc}" -ne 0 ]; then
                echo "FAILED: ${coll} (${dtype}) exited with status ${rc}" 2>&1 | tee -a "${out_filename}"
                failed_runs+=("${coll}/${dtype} (exit ${rc})")
            fi
            sleep 2
        done
    done
done

## To add
### Summarize results
### Convert to junit

# Every combination is still executed; failures are only reported at the end so
# one broken collective does not hide the results of the others.
if [ "${#failed_runs[@]}" -gt 0 ]; then
    echo "${#failed_runs[@]} rccl-tests run(s) failed:" >&2
    printf '  %s\n' "${failed_runs[@]}" >&2
    exit 1
fi
================================================
FILE: .azuredevops/templates/build.yml
================================================
# Small subset of installed files whose presence decides build pass/fail.
parameters:
- name: expectedInstallFiles
  type: object
  default:
    - bin/rccl-UnitTests
    - include/rccl/rccl.h
    - lib/cmake/rccl/rccl-config.cmake
    - lib/librccl.so
    - share/doc/rccl/LICENSE.txt
    - share/rccl/msccl-algorithms
    - share/rccl/msccl-unit-test-algorithms

steps:
# Submits the Slurm build job, waits for it to leave the queue, reads its final
# state from sacct, then verifies that the expected install files exist.
- task: Bash@3
  displayName: Build Job
  env:
    BINARIES_DIR: $(Build.BinariesDirectory)
  inputs:
    targetType: inline
    script: |
      echo "##[section]Starting build job..."
      # Clear any previous contents of the install directory.
      rm -rf $(Build.BinariesDirectory)/*
      echo "Submitting build job..."
      mkdir -p $(Build.BinariesDirectory)
      BUILD_JOB_ID=$(sbatch --export=ALL --parsable $(Build.SourcesDirectory)/.azuredevops/slurm/build.sh)
      # Without a job ID the squeue/sacct queries below would not be scoped to
      # our job, so fail right away instead of polling.
      if [ -z "$BUILD_JOB_ID" ]; then
        echo "##vso[task.logissue type=error]sbatch did not return a job ID; build job was not submitted"
        exit 1
      fi
      echo "Submitted build job: $BUILD_JOB_ID"
      # Publish the job ID so the later "Build Logs" step can locate the log file.
      echo "##vso[task.setvariable variable=BUILD_JOB_ID]$BUILD_JOB_ID"
      echo "Waiting for build job to start..."
      # Poll once a minute for as long as squeue still lists the job.
      while squeue -j "$BUILD_JOB_ID" 2>/dev/null | grep -q "$BUILD_JOB_ID"; do
        echo "##[section]Build job $BUILD_JOB_ID is still running..."
        sleep 60
      done
      echo "Waiting for final status via sacct..."
      LOOP_COUNT=0
      MAX_LOOPS=30 # Maximum of 30 loops (30 minutes)
      while true; do
        # State column of the first sacct row whose JobID ends in ".batch";
        # xargs trims surrounding whitespace.
        STATE=$(sacct -j "$BUILD_JOB_ID" --format=JobID,State --noheader | awk '$1 ~ /\.batch$/ { print $2; exit }' | xargs)
        echo "##[section]Build job state: $STATE"
        if [[ "$STATE" == "COMPLETED" ]]; then
          break
        elif [[ "$STATE" =~ ^(FAILED|CANCELLED|TIMEOUT)$ ]]; then
          # Not fatal by itself: the install-file check below decides the result.
          echo "Build failed with state $STATE"
          break
        fi
        sleep 60
        LOOP_COUNT=$((LOOP_COUNT + 1))
        if [ $LOOP_COUNT -ge $MAX_LOOPS ]; then
          echo "Time limit reached while waiting for final status."
          exit 1 # Exit with an error code if time limit is reached
        fi
      done
      echo "Checking for expected installed files..."
      MISSING_FILES=0
      # The template expression expands to a single space-separated string.
      expectedFiles="${{ join(' ', parameters.expectedInstallFiles) }}"
      read -r -a expectedList <<< "$expectedFiles"
      for relpath in "${expectedList[@]}"; do
        fullpath="$BINARIES_DIR/$relpath"
        if [ ! -e "$fullpath" ]; then
          echo "##vso[task.logissue type=error]Missing expected file: $fullpath"
          MISSING_FILES=1
        fi
      done
      if [ "$MISSING_FILES" -eq 1 ]; then
        echo "One or more expected files are missing from the install directory."
        exit 1
      else
        echo "All expected files are present in the install directory."
      fi
# Always print the Slurm job log, even when the build step failed.
- task: Bash@3
  displayName: Build Logs
  condition: always()
  inputs:
    targetType: inline
    script: |
      cat "rccl-build-${BUILD_JOB_ID}.out" || echo "No log found"
================================================
FILE: .azuredevops/templates/test_rccl-UnitTests.yml
================================================
steps:
# Submits the rccl-UnitTests Slurm job, waits for it to leave the queue, reads
# its final state from sacct, then scans the gtest XML report for failures.
- task: Bash@3
  displayName: RCCL UnitTests
  env:
    BINARIES_DIR: $(Build.BinariesDirectory)
    PIPELINE_WORKSPACE: $(Pipeline.Workspace)
  inputs:
    targetType: inline
    script: |
      echo "Submitting test job..."
      TEST_JOB_ID=$(sbatch --export=ALL --parsable $(Build.SourcesDirectory)/.azuredevops/slurm/test_rccl-UnitTests.sh)
      # Without a job ID the squeue/sacct queries below would not be scoped to
      # our job, so fail right away instead of polling.
      if [ -z "$TEST_JOB_ID" ]; then
        echo "##vso[task.logissue type=error]sbatch did not return a job ID; test job was not submitted"
        echo "##vso[task.complete result=Failed;]DONE"
        exit 1
      fi
      echo "Submitted test job: $TEST_JOB_ID"
      # Publish the job ID so the later "Test Logs" step can locate the log file.
      echo "##vso[task.setvariable variable=TEST_JOB_ID]$TEST_JOB_ID"
      echo "Waiting for test job to start..."
      # Poll once a minute for as long as squeue still lists the job.
      while squeue -j "$TEST_JOB_ID" 2>/dev/null | grep -q "$TEST_JOB_ID"; do
        echo "##[section]Test job $TEST_JOB_ID is still running..."
        sleep 60
      done
      echo "Waiting for final status via sacct..."
      LOOP_COUNT=0
      MAX_LOOPS=120 # Maximum of 120 loops (120 minutes)
      while true; do
        # State column of the first sacct row whose JobID ends in ".batch";
        # xargs trims surrounding whitespace.
        STATE=$(sacct -j "$TEST_JOB_ID" --format=JobID,State --noheader | awk '$1 ~ /\.batch$/ { print $2; exit }' | xargs)
        echo "##[section]Test job state: $STATE"
        if [[ "$STATE" == "COMPLETED" ]]; then
          break
        elif [[ "$STATE" =~ ^(FAILED|CANCELLED|TIMEOUT)$ ]]; then
          # Not fatal by itself: the XML report check below decides the result.
          echo "Test failed with state $STATE"
          break
        fi
        sleep 60
        LOOP_COUNT=$((LOOP_COUNT + 1))
        if [ $LOOP_COUNT -ge $MAX_LOOPS ]; then
          echo "Time limit reached while waiting for final status."
          exit 1 # Exit with an error code if time limit is reached
        fi
      done
      echo "Checking test result XML for failures..."
      TEST_XML=$(find "$(Pipeline.Workspace)" -name 'rccl-UnitTests_output.xml' | head -n1)
      if [ -z "$TEST_XML" ]; then
        # TEST_XML is empty on this branch, so name the expected file explicitly.
        echo "##vso[task.logissue type=error]No rccl-UnitTests_output.xml file found under $(Pipeline.Workspace)"
        echo "##vso[task.complete result=Failed;]DONE"
        exit 1
      fi
      # Matches any failures="..." attribute whose value does not start with 0.
      if grep -q 'failures="[^0]' "$TEST_XML"; then
        echo "##vso[task.logissue type=error]Test failures detected in $TEST_XML"
        echo "##vso[task.complete result=Failed;]DONE"
        exit 1
      else
        echo "No test failures detected."
      fi
# Always print the Slurm job log, even when the test step failed.
- task: Bash@3
  displayName: Test Logs
  condition: always()
  inputs:
    targetType: inline
    script: |
      cat "rccl-UnitTests-${TEST_JOB_ID}.out" || echo "No log found"
- task: PublishTestResults@2
  displayName: 'Publish Results'
  condition: succeededOrFailed()
  inputs:
    searchFolder: $(Pipeline.Workspace)
    testResultsFormat: JUnit
    testResultsFiles: '**/rccl-UnitTests_output.xml'
================================================
FILE: .azuredevops/templates/test_rccl-tests.yml
================================================
steps:
# Submits the rccl-tests Slurm job, waits for it to leave the queue, reads its
# final state from sacct, then checks that JSON result files were produced.
- task: Bash@3
  displayName: RCCL-Tests
  env:
    BINARIES_DIR: $(Build.BinariesDirectory)
    PIPELINE_WORKSPACE: $(Pipeline.Workspace)
  inputs:
    targetType: inline
    script: |
      echo "Submitting test job..."
      TEST_JOB_ID=$(sbatch --export=ALL --parsable $(Build.SourcesDirectory)/.azuredevops/slurm/test_rccl-tests.sh)
      # Without a job ID the squeue/sacct queries below would not be scoped to
      # our job, so fail right away instead of polling.
      if [ -z "$TEST_JOB_ID" ]; then
        echo "##vso[task.logissue type=error]sbatch did not return a job ID; test job was not submitted"
        echo "##vso[task.complete result=Failed;]DONE"
        exit 1
      fi
      echo "Submitted test job: $TEST_JOB_ID"
      # Publish the job ID so the later "Test Logs" step can locate the log file.
      echo "##vso[task.setvariable variable=TEST_JOB_ID]$TEST_JOB_ID"
      echo "Waiting for test job to start..."
      # Poll once a minute for as long as squeue still lists the job.
      while squeue -j "$TEST_JOB_ID" 2>/dev/null | grep -q "$TEST_JOB_ID"; do
        echo "##[section]Test job $TEST_JOB_ID is still running..."
        sleep 60
      done
      echo "Waiting for final status via sacct..."
      LOOP_COUNT=0
      MAX_LOOPS=120 # Maximum of 120 loops (120 minutes)
      while true; do
        # State column of the first sacct row whose JobID ends in ".batch";
        # xargs trims surrounding whitespace.
        STATE=$(sacct -j "$TEST_JOB_ID" --format=JobID,State --noheader | awk '$1 ~ /\.batch$/ { print $2; exit }' | xargs)
        echo "##[section]Test job state: $STATE"
        if [[ "$STATE" == "COMPLETED" ]]; then
          break
        elif [[ "$STATE" =~ ^(FAILED|CANCELLED|TIMEOUT)$ ]]; then
          # Not fatal by itself: the JSON presence check below decides the result.
          echo "Test failed with state $STATE"
          break
        fi
        sleep 60
        LOOP_COUNT=$((LOOP_COUNT + 1))
        if [ $LOOP_COUNT -ge $MAX_LOOPS ]; then
          echo "Time limit reached while waiting for final status."
          exit 1 # Exit with an error code if time limit is reached
        fi
      done
      echo "Checking test result json for failures..."
      TEST_JSON=$(find "$(Pipeline.Workspace)" -name 'rccl-tests*.json')
      if [ -z "$TEST_JSON" ]; then
        # TEST_JSON is empty on this branch, so name the expected pattern explicitly.
        echo "##vso[task.logissue type=error]No rccl-tests*.json file(s) found under $(Pipeline.Workspace)"
        echo "##vso[task.complete result=Failed;]DONE"
        exit 1
      fi
      #echo "Checking test result XML for failures..."
      #TEST_XML=$(find "$(Pipeline.Workspace)" -name 'rccl-tests_output.xml' | head -n1)
      #if [ -z "$TEST_XML" ]; then
      # echo "##vso[task.logissue type=error]No $TEST_XML file found"
      # echo "##vso[task.complete result=Failed;]DONE"
      # exit 1
      #fi
      #if grep -q 'failures="[^0]' "$TEST_XML"; then
      # echo "##vso[task.logissue type=error]Test failures detected in $TEST_XML"
      # echo "##vso[task.complete result=Failed;]DONE"
      # exit 1
      #else
      # echo "No test failures detected."
      #fi
# Always print the Slurm job log, even when the test step failed.
- task: Bash@3
  displayName: Test Logs
  condition: always()
  inputs:
    targetType: inline
    script: |
      cat "rccl-tests-${TEST_JOB_ID}.out" || echo "No log found"
# - task: PublishTestResults@2
#   displayName: 'Publish Results'
#   condition: succeededOrFailed()
#   inputs:
#     searchFolder: $(Pipeline.Workspace)
#     testResultsFormat: JUnit
#     testResultsFiles: '**/rccl-tests_output.xml'
================================================
FILE: .azuredevops/tests/pytest/HelloWorld.py
================================================
import pytest
def test_HelloWorld():
greeting = "Hello, World!"
assert greeting == "Hello, World!"
================================================
FILE: .clang-format
================================================
# Style file for MLSE Libraries based on the modified rocBLAS style
# Common settings
BasedOnStyle: WebKit
TabWidth: 4
IndentWidth: 4
UseTab: Never
ColumnLimit: 100
UseCRLF: false
# Other languages JavaScript, Proto
---
Language: Json
DisableFormat: true
---
Language: Cpp
# http://releases.llvm.org/6.0.1/tools/clang/docs/ClangFormatStyleOptions.html#disabling-formatting-on-a-piece-of-code
# int formatted_code;
# // clang-format off
# void unformatted_code ;
# // clang-format on
# void formatted_code_again;
DisableFormat: false
Standard: Cpp11
AccessModifierOffset: -4
AlignAfterOpenBracket: true
AlignArrayOfStructures: Right
AlignConsecutiveAssignments: true
AlignConsecutiveDeclarations: true
AlignEscapedNewlines: Left
AlignOperands: true
AlignTrailingComments: false
AllowAllArgumentsOnNextLine: false
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortBlocksOnASingleLine: Never
AllowShortCaseLabelsOnASingleLine: true
AllowShortFunctionsOnASingleLine: Empty
AllowShortIfStatementsOnASingleLine: false
AllowShortLoopsOnASingleLine: false
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: false
AlwaysBreakTemplateDeclarations: Yes
BinPackArguments: false
BinPackParameters: false
BitFieldColonSpacing: Both
# Configure each individual brace in BraceWrapping
BreakBeforeBraces: Custom
# Control of individual brace wrapping cases
BraceWrapping:
AfterCaseLabel: true
AfterClass: true
AfterControlStatement: Always
AfterEnum: true
AfterExternBlock: false
AfterFunction: true
AfterNamespace: true
AfterStruct: true
AfterUnion: true
BeforeCatch: true
BeforeElse: true
BeforeLambdaBody: true
BeforeWhile: true
IndentBraces: false
SplitEmptyFunction: false
SplitEmptyRecord: false
SplitEmptyNamespace: false
BreakBeforeBinaryOperators: All
BreakBeforeTernaryOperators: true
BreakConstructorInitializers: BeforeComma
BreakInheritanceList: BeforeComma
BreakStringLiterals: true
CommentPragmas: '^ IWYU pragma:'
CompactNamespaces: false
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
DeriveLineEnding: false
DerivePointerAlignment: false
EmptyLineAfterAccessModifier: Never
EmptyLineBeforeAccessModifier: Always
ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: true
ForEachMacros: []
IfMacros: []
IncludeBlocks: Preserve
IndentAccessModifiers: false
IndentCaseBlocks: true
IndentCaseLabels: true
IndentExternBlock: NoIndent
IndentPPDirectives: BeforeHash
IndentWrappedFunctionNames: true
KeepEmptyLinesAtTheStartOfBlocks: true
LambdaBodyIndentation: Signature
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
PPIndentWidth: -1
PackConstructorInitializers: NextLine
PenaltyBreakBeforeFirstCallParameter: 19
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 60
PointerAlignment: Left
QualifierAlignment: Leave
ReferenceAlignment: Pointer
ReflowComments: false
ShortNamespaceLines: 0
SortIncludes: CaseSensitive
SortUsingDeclarations: true
SpaceAfterCStyleCast: false
SpaceAfterLogicalNot: false
SpaceAfterTemplateKeyword: false
SpaceAroundPointerQualifiers: Default
SpaceBeforeAssignmentOperators: true
SpaceBeforeCaseColon: false
SpaceBeforeCpp11BracedList: false
SpaceBeforeCtorInitializerColon: true
SpaceBeforeInheritanceColon: true
SpaceBeforeParens: Never
SpaceBeforeRangeBasedForLoopColon: true
SpaceBeforeSquareBrackets: false
SpaceInEmptyBlock: false
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 1
SpacesInAngles: Never
SpacesInCStyleCastParentheses: false
SpacesInConditionalStatement: false
SpacesInContainerLiterals: true
SpacesInParentheses: false
SpacesInSquareBrackets: false
---
================================================
FILE: .github/CODEOWNERS
================================================
* @ROCm/rccl-reviewers
# Documentation files
docs/ @ROCm/rocm-documentation
*.md @ROCm/rocm-documentation
*.rst @ROCm/rocm-documentation
.readthedocs.yaml @ROCm/rocm-documentation
src/include/api_trace.h @ROCm/ROCM-DevTools-Team
================================================
FILE: .github/PULL_REQUEST_TEMPLATE.md
================================================
## Details
___Do not mention proprietary info or link to internal work items in this PR.___
**Work item:** _"Internal", or link to GitHub issue (if applicable)._
**What were the changes?**
_One sentence describing the work done._
**Why were the changes made?**
_Explain the motivation behind the work. Provide any publicly-available historical context._
**How was the outcome achieved?**
_Technical details behind the work. Explain any publicly-available hardware peculiarities._
**Additional Documentation:**
_What else should the reviewer know?_
## Approval Checklist
___Do not approve until these items are satisfied.___
- [ ] Verify the CHANGELOG has been updated, if
- there are any NCCL API version changes,
- any changes impact library users, and/or
- any changes impact any other ROCm library.
================================================
FILE: .github/dependabot.yml
================================================
================================================
FILE: .github/scripts/therock_configure_ci.py
================================================
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
# SPDX-License-Identifier: MIT
import fnmatch
import json
import os
from pathlib import Path
import subprocess
import sys
from typing import Iterable, Optional, Mapping
def gha_set_output(vars: Mapping[str, str | Path]):
"""Sets values in a step's output parameters.
This appends to the file located at the $GITHUB_OUTPUT environment variable.
See
* https://docs.github.com/en/actions/reference/workflow-commands-for-github-actions#setting-an-output-parameter
* https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/passing-information-between-jobs
"""
print(f"Setting github output:\n{vars}")
step_output_file = os.getenv("GITHUB_OUTPUT")
if not step_output_file:
print(" Warning: GITHUB_OUTPUT env var not set, can't set github outputs")
return
with open(step_output_file, "a") as f:
f.writelines(f"{k}={str(v)}" + "\n" for k, v in vars.items())
def get_modified_paths(base_ref: str) -> Optional[Iterable[str]]:
    """Returns the paths of modified files relative to the base reference.

    Runs ``git diff --name-only <base_ref>`` with a 60 second timeout.

    Args:
        base_ref: Git revision to diff the working tree against.

    Returns:
        The list of changed paths (one per output line), or ``None`` when the
        git command timed out.

    Raises:
        subprocess.CalledProcessError: If git exits with a non-zero status
            (``check=True``).
    """
    try:
        completed = subprocess.run(
            ["git", "diff", "--name-only", base_ref],
            stdout=subprocess.PIPE,
            check=True,
            text=True,
            timeout=60,
        )
    # subprocess.run() signals an expired timeout with TimeoutExpired, which is
    # not a subclass of the builtin TimeoutError.
    except subprocess.TimeoutExpired:
        print(
            "Computing modified files timed out. Not using PR diff to determine"
            " jobs to run.",
            file=sys.stderr,
        )
        return None
    return completed.stdout.splitlines()
# Glob patterns, relative to .github/workflows/, of the workflow files that
# count as CI-related.
GITHUB_WORKFLOWS_CI_PATTERNS = [
    "therock*.yml",
]


def is_path_workflow_file_related_to_ci(path: str) -> bool:
    """Returns true if `path` matches a CI workflow pattern under .github/workflows/."""
    for pattern in GITHUB_WORKFLOWS_CI_PATTERNS:
        if fnmatch.fnmatch(path, ".github/workflows/" + pattern):
            return True
    return False
def check_for_workflow_file_related_to_ci(paths: Optional[Iterable[str]]) -> bool:
    """Returns true if at least one path is a CI-related workflow file.

    `None` (no path list available) yields False.
    """
    if paths is not None:
        for candidate in paths:
            if is_path_workflow_file_related_to_ci(candidate):
                return True
    return False
# A path that matches any pattern below is treated as having no effect on the
# build or test workflows. When every path modified by a commit/PR matches one
# of these patterns, the related jobs can be skipped.
SKIPPABLE_PATH_PATTERNS = [
    "docs/*",
    "*.gitignore",
    "*.md",
    "*LICENSE*",
    "*NOTICES*",
    ".github/CODEOWNERS",
    ".github/*.md",
    ".github/dependabot.yml",
    ".azuredevops*",
]


def is_path_skippable(path: str) -> bool:
    """Determines if a given relative path to a file matches any skippable patterns."""
    for pattern in SKIPPABLE_PATH_PATTERNS:
        if fnmatch.fnmatch(path, pattern):
            return True
    return False
def check_for_non_skippable_path(paths: Optional[Iterable[str]]) -> bool:
    """Returns true if at least one path is not in the skippable set.

    `None` (no path list available) yields False.
    """
    if paths is not None:
        for candidate in paths:
            if not is_path_skippable(candidate):
                return True
    return False
def should_ci_run_given_modified_paths(paths: Optional[Iterable[str]]) -> bool:
    """Returns true if CI workflows should run given a list of modified paths.

    Paths under ``.github/workflows`` enable CI only when they match a
    CI-related workflow pattern. Every other path enables CI unless it matches
    a skippable pattern.

    Args:
        paths: Modified paths, or ``None`` when no path list is available.

    Returns:
        True when the CI jobs should be enabled, False otherwise.
    """
    if paths is None:
        print("No files were modified, skipping TheRock CI jobs")
        return False

    # Materialise once: `paths` may be a one-shot iterable, so every later
    # computation must work from this set rather than iterate `paths` again.
    paths_set = set(paths)
    github_workflows_paths = {
        p for p in paths_set if p.startswith(".github/workflows")
    }
    other_paths = paths_set - github_workflows_paths

    related_to_ci = check_for_workflow_file_related_to_ci(github_workflows_paths)
    contains_other_non_skippable_files = check_for_non_skippable_path(other_paths)

    print("should_ci_run_given_modified_paths findings:")
    print(f" contains_other_non_skippable_files: {contains_other_non_skippable_files}")

    if related_to_ci:
        print("Enabling build jobs since a related workflow file was modified")
        return True
    elif contains_other_non_skippable_files:
        print("Enabling TheRock CI jobs since a non-skippable path was modified")
        return True
    else:
        print(
            "Only unrelated and/or skippable paths were modified, skipping TheRock CI jobs"
        )
        return False
def main(args):
    """Decide whether TheRock CI should run and publish the decision.

    Args:
        args: Mapping with a ``"base_ref"`` entry naming the git revision to
            diff against.

    Writes the step output ``enable_therock_ci`` as a JSON boolean
    (``true``/``false``).
    """
    base_ref = args.get("base_ref")
    modified_paths = get_modified_paths(base_ref)
    if modified_paths is None:
        # get_modified_paths() may return None; slicing None would raise
        # TypeError before any output gets written.
        print("modified_paths: unavailable (diff could not be computed)")
    else:
        print("modified_paths (max 200):", modified_paths[:200])
    enable_jobs = should_ci_run_given_modified_paths(modified_paths)
    output = {
        "enable_therock_ci": json.dumps(enable_jobs)
    }
    gha_set_output(output)


if __name__ == "__main__":
    # BASE_REF is read from the environment; default to the first parent of HEAD.
    args = {}
    args["base_ref"] = os.environ.get("BASE_REF", "HEAD^1")
    main(args)
================================================
FILE: .github/workflows/therock-ci-linux.yml
================================================
name: TheRock CI Linux
on:
workflow_call:
inputs:
amdgpu_families:
type: string
artifact_group:
type: string
extra_cmake_options:
type: string
permissions:
contents: read
jobs:
therock-build-linux:
name: Build Linux Packages
runs-on: azure-linux-scale-rocm
permissions:
id-token: write
container:
image: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:1f1ce0ab151146c7f86ee4345be74c42d8ca83200d9d26843e8a71df01ecad4e
options: -v /runner/config:/home/awsconfig/
env:
AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }}
TEATIME_FORCE_INTERACTIVE: 0
AWS_SHARED_CREDENTIALS_FILE: /home/awsconfig/credentials.ini
CACHE_DIR: ${{ github.workspace }}/.container-cache
# The ccache.conf will be written by setup_ccache.py before this gets used.
CCACHE_CONFIGPATH: ${{ github.workspace }}/.ccache/ccache.conf
steps:
- name: Checkout TheRock repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
repository: "ROCm/TheRock"
ref: ff46daa79b4c826c4f4676893d0d6586de567dfa # 2026-01-12 commit
- name: Checkout rccl repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
repository: "ROCm/rccl"
path: rccl
- name: Checkout rccl-tests repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
repository: "ROCm/rccl-tests"
path: rccl-tests
- name: Install python deps
run: |
pip install -r requirements.txt
# safe.directory must be set before Runner Health Status
- name: Adjust git config
run: |
git config --global --add safe.directory $PWD
git config fetch.parallel 10
- name: Setup ccache
run: |
./build_tools/setup_ccache.py \
--config-preset "github-oss-presubmit" \
--dir "$(dirname $CCACHE_CONFIGPATH)" \
--local-path "$CACHE_DIR/ccache"
- name: Runner health status
run: |
./build_tools/health_status.py
- name: Fetch sources
run: |
./build_tools/fetch_sources.py --jobs 12
- name: Configure Projects
env:
amdgpu_families: ${{ env.AMDGPU_FAMILIES }}
package_version: ADHOCBUILD
extra_cmake_options: ${{ inputs.extra_cmake_options }}
BUILD_DIR: build
run: |
python3 build_tools/github_actions/build_configure.py
- name: Build therock-dist
run: cmake --build build
- name: Build therock-archives
run: cmake --build build --target therock-archives
- name: Report
#if: ${{ !cancelled() }}
run: |
echo "Full SDK du:"
echo "------------"
du -h -d 1 build/dist/rocm
echo "Artifact Archives:"
echo "------------------"
ls -lh build/artifacts/*.tar.xz
echo "Artifacts:"
echo "----------"
du -h -d 1 build/artifacts
echo "CCache Stats:"
echo "-------------"
ccache -s -v
tail -v -n +1 .ccache/compiler_check_cache/* > build/logs/ccache_compiler_check_cache.log
- name: Configure AWS Credentials for non-forked repos
if: ${{ always() && !github.event.pull_request.head.repo.fork }}
uses: aws-actions/configure-aws-credentials@7474bc4690e29a8392af63c5b98e7449536d5c3a # v4.3.1
with:
aws-region: us-east-2
role-to-assume: arn:aws:iam::692859939525:role/therock-artifacts-external
- name: Post Build Upload
if: always()
run: |
python3 build_tools/github_actions/post_build_upload.py \
--run-id ${{ github.run_id }} \
--artifact-group ${{ env.AMDGPU_FAMILIES }} \
--build-dir build \
--upload
therock-test-linux-multi-node:
name: "Test multi-node"
if: ${{ inputs.amdgpu_families == 'gfx950-dcgpu' }}
permissions:
contents: read
id-token: write
needs: [therock-build-linux]
uses: ./.github/workflows/therock-test-packages-multi-node.yml
with:
amdgpu_families: ${{ inputs.amdgpu_families }}
artifact_group: ${{ inputs.artifact_group }}
test_runs_on: nova-linux-slurm-scale-runner
artifact_run_id: ${{ github.run_id }}
therock-test-linux-single-node:
name: "Test single-node"
if: ${{ inputs.amdgpu_families == 'gfx94X-dcgpu' }}
needs: [therock-build-linux]
uses: ./.github/workflows/therock-test-packages-single-node.yml
with:
amdgpu_families: ${{ inputs.amdgpu_families }}
artifact_group: ${{ inputs.artifact_group }}
test_runs_on: linux-mi325-4gpu-ossci-rocm
artifact_run_id: ${{ github.run_id }}
================================================
FILE: .github/workflows/therock-ci.yml
================================================
name: TheRock CI for rccl
on:
push:
branches:
- develop
pull_request:
types:
- labeled
- opened
- synchronize
workflow_dispatch:
permissions:
contents: read
concurrency:
# A PR number if a pull request and otherwise the commit hash. This cancels
# queued and in-progress runs for the same PR (presubmit) or commit
# (postsubmit). The workflow name is prepended to avoid conflicts between
# different workflows.
group: ${{ github.workflow }}-${{ github.event.number || github.sha }}
cancel-in-progress: true
jobs:
setup:
runs-on: ubuntu-24.04
env:
# The commit being checked out is the merge commit for a PR. Its first
# parent will be the tip of the base branch.
BASE_REF: HEAD^
outputs:
enable_therock_ci: ${{ steps.configure.outputs.enable_therock_ci }}
steps:
- name: "Checking out repository"
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
with:
# We need the parent commit to do a diff
fetch-depth: 2
- name: "Configuring CI options"
id: configure
run: python .github/scripts/therock_configure_ci.py
therock-ci-linux:
name: TheRock CI Linux
needs: setup
if: ${{ needs.setup.outputs.enable_therock_ci == 'true' }}
permissions:
contents: read
id-token: write
strategy:
fail-fast: false
matrix:
amdgpu_family: [gfx94X-dcgpu, gfx950-dcgpu]
uses: ./.github/workflows/therock-ci-linux.yml
secrets: inherit
with:
amdgpu_families: ${{ matrix.amdgpu_family }}
artifact_group: ${{ matrix.amdgpu_family }}
extra_cmake_options: >
-DTHEROCK_ENABLE_ALL=OFF
-DTHEROCK_BUILD_TESTING=ON
-DTHEROCK_BUNDLE_SYSDEPS=ON
-DTHEROCK_ENABLE_COMM_LIBS=ON
-DTHEROCK_ENABLE_ROCPROFV3=ON
-DTHEROCK_USE_EXTERNAL_RCCL=ON
-DTHEROCK_USE_EXTERNAL_RCCL_TESTS=ON
-DTHEROCK_RCCL_SOURCE_DIR=./rccl
-DTHEROCK_RCCL_TESTS_SOURCE_DIR=./rccl-tests
-DTHEROCK_ENABLE_MPI=ON
therock_ci_summary:
name: TheRock CI Summary
if: always()
needs:
- setup
- therock-ci-linux
runs-on: ubuntu-24.04
steps:
- name: Output failed jobs
run: |
echo '${{ toJson(needs) }}'
FAILED_JOBS="$(echo '${{ toJson(needs) }}' \
| jq --raw-output \
'map_values(select(.result!="success" and .result!="skipped")) | keys | join(",")' \
)"
if [[ "${FAILED_JOBS}" != "" ]]; then
echo "The following jobs failed: ${FAILED_JOBS}"
exit 1
fi
================================================
FILE: .github/workflows/therock-test-packages-multi-node.yml
================================================
name: TheRock Test Packages multi-node
on:
workflow_call:
inputs:
amdgpu_families:
type: string
artifact_group:
type: string
test_runs_on:
type: string
artifact_run_id:
type: string
workflow_dispatch:
inputs:
amdgpu_families:
type: string
artifact_group:
type: string
test_runs_on:
type: string
artifact_run_id:
type: string
permissions:
contents: read
id-token: write
jobs:
test_rccl_multi_node:
name: 'Test multi-node'
runs-on: ${{ inputs.test_runs_on }}
defaults:
run:
shell: bash
permissions:
contents: read
id-token: write
env:
VENV_DIR: ${{ github.workspace }}/.venv
ARTIFACT_RUN_ID: "${{ inputs.artifact_run_id }}"
OUTPUT_ARTIFACTS_DIR: /apps/cvs_tests/dist_new/dist/rocm
THEROCK_BIN_DIR: "./build/bin"
AWS_SHARED_CREDENTIALS_FILE: /apps/cvs_tests/awsconfig/credentials.ini
steps:
- name: Checkout Repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
repository: "ROCm/TheRock"
ref: ff46daa79b4c826c4f4676893d0d6586de567dfa # 2026-01-12 commit
- name: Run setup test environment workflow
uses: './.github/actions/setup_test_environment'
with:
ARTIFACT_RUN_ID: ${{ env.ARTIFACT_RUN_ID }}
AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }}
ARTIFACT_GROUP: ${{ inputs.artifact_group }}
OUTPUT_ARTIFACTS_DIR: ${{ env.OUTPUT_ARTIFACTS_DIR }}
VENV_DIR: ${{ env.VENV_DIR }}
FETCH_ARTIFACT_ARGS: "--rccl --tests"
IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }}
# The following step uses Slurm to run the multi-node RCCL tests on the Slurm MI350X cluster.
# `sbatch --wait -N4` submits the batch script on 4 nodes and blocks until the job has finished; Slurm releases the nodes when the job ends.
# The sbatch script runs the rccl_heatmap_cvs script, which validates and generates a bandwidth heatmap file for different RCCL collectives.
- name: Test gfx950
if: ${{ inputs.amdgpu_families == 'gfx950-dcgpu' }}
run: |
SETUP_NODES=1 sbatch --wait -N4 /apps/cvs_tests/cvs-sbatch/sbatch/default.sbatch
- name: Configure AWS Credentials for non-forked repos
if: ${{ always() && !github.event.pull_request.head.repo.fork }}
uses: aws-actions/configure-aws-credentials@7474bc4690e29a8392af63c5b98e7449536d5c3a # v4.3.1
with:
aws-region: us-east-2
role-to-assume: arn:aws:iam::692859939525:role/therock-artifacts-external
- name: Post test report upload
if: always()
working-directory: ${{ github.workspace }}
run: |
export PYTHONPATH="${PYTHONPATH}:${{ github.workspace }}/build_tools"
python3 build_tools/github_actions/upload_test_report_script.py \
--run-id "${{ github.run_id }}" \
--amdgpu-family "${{ inputs.amdgpu_families }}" \
--report-path "/apps/cvs_tests/test_reports" \
--log-destination "/logs/gfx950-dcgpu" \
--index-file-name "index_rccl_test_report.html"
================================================
FILE: .github/workflows/therock-test-packages-single-node.yml
================================================
name: TheRock Test Packages single-node
on:
workflow_call:
inputs:
amdgpu_families:
type: string
artifact_group:
type: string
test_runs_on:
type: string
artifact_run_id:
type: string
workflow_dispatch:
inputs:
amdgpu_families:
type: string
artifact_group:
type: string
test_runs_on:
type: string
artifact_run_id:
type: string
permissions:
contents: read
jobs:
test_rccl_single_node:
name: 'Test single-node'
runs-on: ${{ inputs.test_runs_on }}
container:
image: ghcr.io/rocm/no_rocm_image_ubuntu24_04@sha256:4150afe4759d14822f0e3f8930e1124f26e11f68b5c7b91ec9a02b20b1ebbb98
options: --ipc host
--group-add video
--device /dev/kfd
--device /dev/dri
--group-add 110
--ulimit memlock=-1:-1
--security-opt seccomp=unconfined
--env-file /etc/podinfo/gha-gpu-isolation-settings
--user 0:0
defaults:
run:
shell: bash
env:
VENV_DIR: ${{ github.workspace }}/.venv
ARTIFACT_RUN_ID: "${{ inputs.artifact_run_id }}"
OUTPUT_ARTIFACTS_DIR: "./build"
THEROCK_BIN_DIR: "./build/bin"
steps:
- name: Checkout Repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
repository: "ROCm/TheRock"
ref: ff46daa79b4c826c4f4676893d0d6586de567dfa # 2026-01-12 commit
- name: Run setup test environment workflow
uses: './.github/actions/setup_test_environment'
with:
ARTIFACT_RUN_ID: ${{ env.ARTIFACT_RUN_ID }}
AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }}
ARTIFACT_GROUP: ${{ inputs.artifact_group }}
OUTPUT_ARTIFACTS_DIR: ${{ env.OUTPUT_ARTIFACTS_DIR }}
VENV_DIR: ${{ env.VENV_DIR }}
FETCH_ARTIFACT_ARGS: "--rccl --tests"
IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }}
- name: Test
timeout-minutes: 15
# Currently, TheRock CI in RCCL always builds with MPI support enabled, which causes the
# RCCL correctness tests to fail on the mi325 runners, which don't have MPI pre-installed.
# TODO (geomin12): Rebuild rccl-tests without MPI to enable RCCL correctness tests.
run: |
pytest ./build_tools/github_actions/test_executable_scripts/test_rccl.py -v -s \
-k "not test_rccl_correctness_tests" \
--log-cli-level=info
================================================
FILE: .gitignore
================================================
# Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
*.gcov
/coverage/
build/
ext/
src/transport/net_ib_rocm.cc
# Visual Studio Code
.vscode
================================================
FILE: .gitmodules
================================================
[submodule "ext-src/mscclpp"]
path = ext-src/mscclpp
url = https://github.com/microsoft/mscclpp.git
ignore = dirty
shallow = true
[submodule "ext-src/json"]
path = ext-src/json
url = https://github.com/nlohmann/json.git
ignore = dirty
shallow = true
[submodule "ext-src/rocSHMEM"]
path = ext-src/rocSHMEM
url = https://github.com/ROCm/rocSHMEM.git
branch = develop
================================================
FILE: .readthedocs.yaml
================================================
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
version: 2
build:
os: ubuntu-22.04
tools:
python: "3.10"
sphinx:
configuration: docs/conf.py
formats: [htmlzip, pdf, epub]
python:
install:
- requirements: docs/sphinx/requirements.txt
================================================
FILE: CHANGELOG.md
================================================
# Changelog for RCCL
Full documentation for RCCL is available at [https://rccl.readthedocs.io](https://rccl.readthedocs.io)
## Unreleased - RCCL 2.28.3 for ROCm 7.11
### Known issues
* AllGather regression for small message sizes (less than 1 MB) due to the Direct algorithm.
* ROCTx feature needs to be verified.
* Profiler plugin needs to be verified.
### Changed
* Compatibility with NCCL 2.28.3.
* The MSCCL feature is now disabled by default. The `--disable-msccl-kernel` build flag is replaced with `--enable-msccl-kernel` in the `rccl/install.sh` script.
* MSCCL and NPKIT are deprecated and will be removed in a future release of RCCL.
## Unreleased - RCCL 2.27.7 for ROCm 7.2.0
### Changed
* RCCL error messages have been made more verbose in several cases. RCCL now prints out fatal error messages by default. Fatal error messages can be suppressed by setting `NCCL_DEBUG=NONE`.
* Disabled `reduceCopyPacks` pipelining for `gfx950`.
* Experimental support for traffic shaping using warp specialization (also known as WarpSpeed) is now available for the Ring algorithm.
* Enabling WarpSpeed in auto mode using `RCCL_WARP_SPEED_AUTO` optimizes performance and reduces the CU count by 50% on a single node for AllReduce, AllGather from 64MB, and ReduceScatter from 256MB.
* The following configuration knobs control WarpSpeed behavior for debugging purposes: `RCCL_WARP_SPEED_ENABLE`, `RCCL_UNROLL_FACTOR`, `RCCL_WARP_SPEED_CU_COUNT`, and `RCCL_THREADS_PER_BLOCK`. Note that the effective unroll factor is calculated as 2 raised to the value of `RCCL_UNROLL_FACTOR`.
### Known issues
* AllToAllv and AllToAll hang when run on a single GPU.
## Unreleased - RCCL 2.27.7 for ROCm 7.1.1
### Changed
* Enabling P2P batching with `RCCL_P2P_BATCH_ENABLE=1` is only applicable up to 32 nodes.
### Resolved Issues
* Fixed crash when using the librccl-profiler plugin with the all-to-all collective after the 2.27 update.
## RCCL 2.27.7 for ROCm 7.1.0
### Added
* Added `RCCL_IB_QPS_PER_P2P` to set the number of QPs per connection for P2P operations. When set (≥1), P2P operations (Send/Recv) use `RCCL_IB_QPS_PER_P2P`, while other collective operations continue to use `NCCL_IB_QPS_PER_CONNECTION`. When not set, `NCCL_IB_QPS_PER_CONNECTION` applies to all operations.
* Added `RCCL_FORCE_ENABLE_DMABUF` as a debugging feature if the user wants to explicitly enable DMABUF and forego system/kernel checks.
* Added `RCCL_P2P_BATCH_THRESHOLD` to set the message size limit for batching P2P operations. This mainly affects small message performance for alltoall at a large scale but also applies to alltoallv.
* Added `RCCL_P2P_BATCH_ENABLE` to enable batching P2P operations to receive performance gains for smaller messages up to 4MB for alltoall when the workload requires it. This is to avoid performance dips for larger messages.
* Added `RCCL_CHANNEL_TUNING_ENABLE` to enable channel tuning that overrides RCCL's internal adjustments based on threadThreshold.
### Changed
* The MSCCL++ feature is now disabled by default. The `--disable-mscclpp` build flag is replaced with `--enable-mscclpp` in the `rccl/install.sh` script.
* Compatibility with NCCL 2.27.7.
### Optimized
* Enabled and optimized batched P2P operations to improve small message performance for AllToAll and AllGather.
* Optimized channel count selection to improve efficiency for small to medium message sizes in ReduceScatter.
* Changed code inlining to improve latency for small message sizes for AllReduce, AllGather, and ReduceScatter.
### Known issues
* Symmetric memory kernels are currently disabled due to ongoing CUMEM enablement work.
* When running this version of RCCL using ROCm versions earlier than 6.4.0, the user must set the environment flag `HSA_NO_SCRATCH_RECLAIM=1`.
## RCCL 2.26.6 for ROCm 7.0.0
### Resolved issues
* Resolved an issue when using more than 64 channels when multiple collectives are used in the same `ncclGroup()` call.
* Fixed unit test failures in tests ending with `ManagedMem` and `ManagedMemGraph` suffixes.
* Fixed a suboptimal algorithmic switching point for AllReduce on MI300X.
* Fixed the known issue "When splitting a communicator using `ncclCommSplit` in some GPU configurations, MSCCL initialization can cause a segmentation fault." with a design change to use `comm` instead of `rank` for `mscclStatus`. The Global map for `comm` to `mscclStatus` is still not thread safe but should be explicitly handled by mutexes for read writes. This is tested for correctness, but there is a plan to use a thread-safe map data structure in upcoming changes.
* Fixed broken functionality within the LL protocol on gfx950 by disabling inlining of LLGenericOp kernels.
### Added
* Added new GPU target `gfx950`.
* Added support for `unroll=1` in device-code generation to improve performance.
* Set a default of 112 channels for a single node with `8 * gfx950`.
* Enabled LL128 protocol on `gfx950`.
* Added MSCCL support for AllGather multinode gfx942/gfx950 (i.e., 16 and 32 GPUs). To enable, set the environment variable `RCCL_MSCCL_FORCE_ENABLE=1`. Max message size for MSCCL AllGather usage is `12292 * sizeof(datatype) * nGPUs`.
* Thread thresholds for LL/LL128 are selected in Tuning Models for the MI300X. This impacts the number of channels used for AG and RS. The channel tuning model is bypassed if `NCCL_THREAD_THRESHOLDS`, `NCCL_MIN_NCHANNELS`, or `NCCL_MAX_NCHANNELS` is set.
* Multi-node tuning for AllGather, AllReduce, and ReduceScatter that leverages LL/LL64/LL128 protocol to use nontemporal vector load/store for tunable message size ranges.
* LL/LL128 usage ranges for AR, AG, and RS are part of the tuning models, which enable architecture-specific tuning in conjunction with the existing Rome Models scheme in RCCL.
* Two new APIs are exposed as part of an initiative to separate RCCL code. These APIs are `rcclGetAlgoInfo` and `rcclFuncMaxSendRecvCount`. However, user-level invocation requires that RCCL be built with `RCCL_EXPOSE_STATIC` enabled.
* Enabled double-buffering in `reduceCopyPacks` to trigger pipelining, especially to overlap `bf16` arithmetic and bridge the gap between `fp32` performance and `bf16` for both `gfx942` and `gfx950`. Pipelining has been made tunable via `rcclSetPipelining`, similar to algorithms/protocols so that regression is avoided in certain message sizes.
* Added a direct allgather algorithm. This is enabled by default for multi-node if there are 16 nodes or fewer. The message size threshold is 4MB.
* Added `RCCL_OVERRIDE_PROTO` and `RCCL_OVERRIDE_ALGO` to allow direct replacement of protocol and algorithm choices. Unlike `NCCL_PROTO` and `NCCL_ALGO`, which re-run the model across enabled combinations and may not guarantee the intended override, these new options enforce the specified selections explicitly.
### Changed
* Compatibility with NCCL 2.23.4.
* Compatibility with NCCL 2.24.3.
* Compatibility with NCCL 2.25.1.
* Compatibility with NCCL 2.26.6.
### Optimized
* Improved the performance of the `FP8` Sum operation by upcasting to `FP16`.
### Known Issues
* When running this version of RCCL using ROCm versions earlier than 6.4.0, the user must set the environment flag `HSA_NO_SCRATCH_RECLAIM=1`.
## RCCL 2.22.3 for ROCm 6.4.2
### Added
* Added support for the LL128 protocol on gfx942.
## RCCL 2.22.3 for ROCm 6.4.1
### Resolved issues
* Fixed the accuracy issue for MSCCLPP `allreduce7` kernel in graph mode.
* Fixed IntraNet performance.
* Fixed an issue where, in rare circumstances, the application could stop responding due to a proxy thread synchronization issue.
### Known issues
* When splitting a communicator using `ncclCommSplit` in some GPU configurations, MSCCL initialization can cause a segmentation fault.
The recommended workaround is to disable MSCCL with `export RCCL_MSCCL_ENABLE=0`.
* Within the RCCL-UnitTests test suite, failures occur in tests ending with the `ManagedMem` and `ManagedMemGraph` suffixes. These failures only affect the test results and do not affect the RCCL component itself. This issue will be resolved in the next major release.
## RCCL 2.22.3 for ROCm 6.4.0
### Added
* `RCCL_SOCKET_REUSEADDR` and `RCCL_SOCKET_LINGER` environment parameters.
* Setting `NCCL_DEBUG=TRACE NCCL_DEBUG_SUBSYS=VERBS` will generate traces for fifo and data `ibv_post_sends`.
* Added `--log-trace` flag to enable traces through the install.sh script (e.g. `./install.sh --log-trace`).
### Changed
* Compatibility with NCCL 2.22.3
* Added support for the rail-optimized tree algorithm for the MI300 series. This feature requires the use of all eight GPUs within
each node. It limits NIC traffic to use only GPUs of the same index across nodes and should not impact performance
on non-rail-optimized network topologies. The original method of building trees can be enabled by setting the
environment variable `RCCL_DISABLE_RAIL_TREES=1`.
* Additional debug information about how the trees are built can be logged to the GRAPH logging subsys by setting
`RCCL_OUTPUT_TREES=1`.
* Added documentation about the NPS4 and CPX partition modes performance benefits on the MI300X.
## RCCL 2.21.5 for ROCm 6.3.1
### Added
### Changed
* Enhanced user documentation
### Resolved issues
* Corrected user help strings in `install.sh`
## RCCL 2.21.5 for ROCm 6.3.0
### Added
* MSCCL++ integration for AllReduce and AllGather on gfx942
* Performance collection to rccl_replayer
* Tuner Plugin example for MI300
* Tuning table for large number of nodes
* Support for amdclang++
* Allow NIC ID remapping using `NCCL_RINGS_REMAP` environment variable
### Changed
* Compatibility with NCCL 2.21.5
* Increased channel count for MI300X multi-node
* Enabled MSCCL for single-process multi-threaded contexts
* Enabled gfx12
* Enabled CPX mode for MI300X
* Enabled tracing with rocprof
* Improved version reporting
* Enabled GDRDMA for Linux kernel 6.4.0+
### Resolved issues
* Fixed model matching with PXN enable
## RCCL 2.20.5 for ROCm 6.2.1
### Fixed
- GDR support flag now set with DMABUF
### Known issues
- On systems running Linux kernel 6.8.0, such as Ubuntu 24.04, Direct Memory Access (DMA) transfers between the GPU and NIC are disabled, which impacts multi-node RCCL performance.
- This issue was reproduced with RCCL 2.20.5 (ROCm 6.2.0 and 6.2.1) on systems with Broadcom Thor-2 NICs and affects other systems with RoCE networks using Linux 6.8.0 or newer.
- Older RCCL versions are also impacted.
- This issue will be addressed in a future ROCm release.
## RCCL 2.20.5 for ROCm 6.2.0
### Changed
- Compatibility with NCCL 2.20.5
- Compatibility with NCCL 2.19.4
- Performance tuning for some collective operations on MI300
- Enabled NVTX code in RCCL
- Replaced rccl_bfloat16 with hip_bfloat16
- NPKit updates:
- Warm-up iteration removal is no longer enabled by default; it must now be opted into
- Doubled the size of buffers to accommodate for more channels
- Modified rings to be rail-optimized topology friendly
- Replaced ROCmSoftwarePlatform links with ROCm links
### Added
- Support for fp8 and rccl_bfloat8
- Support for using HIP contiguous memory
- Implemented ROC-TX for host-side profiling
- Enabled static build
- Added new rome model
- Added fp16 and fp8 cases to unit tests
- New unit test for main kernel stack size
- New -n option for topo_expl to override # of nodes
- Improved debug messages of memory allocations
### Fixed
- Bug when configuring RCCL for only LL128 protocol
- Scratch memory allocation after API change for MSCCL
## RCCL 2.18.6 for ROCm 6.1.0
### Changed
- Compatibility with NCCL 2.18.6
## RCCL 2.18.3 for ROCm 6.0.0
### Changed
- Compatibility with NCCL 2.18.3
## RCCL 2.17.1-1 for ROCm 5.7.0
### Changed
- Compatibility with NCCL 2.17.1-1
- Performance tuning for some collective operations
### Added
- Minor improvements to MSCCL codepath
- NCCL_NCHANNELS_PER_PEER support
- Improved compilation performance
- Support for gfx94x
### Fixed
- Potential race-condition during ncclSocketClose()
## RCCL 2.16.2 for ROCm 5.6.0
### Changed
- Compatibility with NCCL 2.16.2
### Fixed
- Remove workaround and use indirect function call
## RCCL 2.15.5 for ROCm 5.5.0
### Changed
- Compatibility with NCCL 2.15.5
- Unit test executable renamed to rccl-UnitTests
### Added
- HW-topology aware binary tree implementation
- Experimental support for MSCCL
- New unit tests for hipGraph support
- NPKit integration
### Fixed
- rocm-smi ID conversion
- Support for HIP_VISIBLE_DEVICES for unit tests
- Support for p2p transfers to non (HIP) visible devices
### Removed
- Removed TransferBench from tools. Exists in standalone repo: https://github.com/ROCm/TransferBench
## RCCL-2.13.4 for ROCm 5.4.0
### Changed
- Compatibility with NCCL 2.13.4
- Improvements to RCCL when running with hipGraphs
- RCCL_ENABLE_HIPGRAPH environment variable is no longer necessary to enable hipGraph support
- Minor latency improvements
### Fixed
- Resolved potential memory access error due to asynchronous memset
## RCCL-2.12.10 for ROCm 5.3.0
### Changed
- Improvements to LL128 algorithms
### Added
- Adding initial hipGraph support via opt-in environment variable RCCL_ENABLE_HIPGRAPH
- Integrating with NPKit (https://github.com/microsoft/NPKit) profiling code
## RCCL-2.12.10 for ROCm 5.2.3
### Added
- Compatibility with NCCL 2.12.10
- Packages for test and benchmark executables on all supported OSes using CPack.
- Adding custom signal handler - opt-in with RCCL_ENABLE_SIGNALHANDLER=1
- Additional details provided if Binary File Descriptor library (BFD) is pre-installed
- Adding support for reusing ports in NET/IB channels
- Opt-in with NCCL_IB_SOCK_CLIENT_PORT_REUSE=1 and NCCL_IB_SOCK_SERVER_PORT_REUSE=1
- When "Call to bind failed : Address already in use" error happens in large-scale AlltoAll
(e.g., >=64 MI200 nodes), users are suggested to opt-in either one or both of the options
to resolve the massive port usage issue
- Avoid using NCCL_IB_SOCK_SERVER_PORT_REUSE when NCCL_NCHANNELS_PER_NET_PEER is tuned >1
### Removed
- Removed experimental clique-based kernels
## RCCL-2.11.4 for ROCm 5.2.0
### Changed
- Unit testing framework rework
- Minor bug fixes
### Known issues
- Managed memory is not currently supported for clique-based kernels
## RCCL-2.11.4 for ROCm 5.1.0
### Added
- Compatibility with NCCL 2.11.4
### Known issues
- Managed memory is not currently supported for clique-based kernels
## RCCL-2.10.3 for ROCm 5.0.0
### Added
- Compatibility with NCCL 2.10.3
### Known issues
- Managed memory is not currently supported for clique-based kernels
## RCCL-2.9.9 for ROCm 4.5.0
### Changed
- Packaging split into a runtime package called rccl and a development package called rccl-devel. The development package depends on runtime. The runtime package suggests the development package for all supported OSes except CentOS 7 to aid in the transition. The suggests feature in packaging is introduced as a deprecated feature and will be removed in a future rocm release.
### Added
- Compatibility with NCCL 2.9.9
### Known issues
- Managed memory is not currently supported for clique-based kernels
## [RCCL-2.8.4 for ROCm 4.3.0]
### Added
- Ability to select the number of channels to use for clique-based all reduce (RCCL_CLIQUE_ALLREDUCE_NCHANNELS). This can be adjusted to tune for performance when computation kernels are being executed in parallel.
### Optimizations
- Additional tuning for clique-based kernel AllReduce performance (still requires opt in with RCCL_ENABLE_CLIQUE=1)
- Modification of default values for number of channels / byte limits for clique-based all reduce based on device architecture
### Changed
- Replaced RCCL_FORCE_ENABLE_CLIQUE with RCCL_CLIQUE_IGNORE_TOPO
- Clique-based kernels can now be enabled on topologies where all active GPUs are XGMI-connected
- Topologies not normally supported by clique-based kernels require RCCL_CLIQUE_IGNORE_TOPO=1
### Fixed
- Install script '-r' flag invoked alone no longer incorrectly deletes any existing builds.
### Known issues
- Managed memory is not currently supported for clique-based kernels
## [RCCL-2.8.4 for ROCm 4.2.0]
### Added
- Compatibility with NCCL 2.8.4
### Optimizations
- Additional tuning for clique-based kernels
- Enabling GPU direct RDMA read from GPU
- Fixing potential memory leak issue when re-creating multiple communicators within same process
- Improved topology detection
### Known issues
- None
## [RCCL-2.7.8 for ROCm 4.1.0]
### Added
- Experimental support for clique-based kernels (opt in with RCCL_ENABLE_CLIQUE=1)
- Clique-based kernels may offer better performance for smaller input sizes
- Clique-based kernels are currently only enabled for AllReduce under a certain byte limit (controlled via RCCL_CLIQUE_ALLREDUCE_BYTE_LIMIT)
### Optimizations
- Performance improvements for Rome-based systems
### Known issues
- Clique-based kernels are currently experimental and have not been fully tested on all topologies. By default, clique-based kernels are disabled if the detected topology is not supported (override with RCCL_FORCE_ENABLE_CLIQUE)
- Clique-based kernels may hang if there are differences between environment variables set across ranks.
- Clique-based kernels may fail if the input / output device pointers are not the base device pointers returned by hipMalloc.
## [RCCL-2.7.8 for ROCm 3.9.0]
### Added
- Adding support for alltoallv RCCL kernel
### Optimizations
- Modifications to topology based on XGMI links
### Known issues
- None
## [RCCL-2.7.6 for ROCm 3.8.0]
### Added
- Support for static library builds
### Known issues
- None
## [RCCL-2.7.6 for ROCm 3.7.0]
### Added
- Updated to RCCL API version of 2.7.6
- Added gather, scatter and all-to-all collectives
## [RCCL-2.7.0 for ROCm 3.6.0]
### Added
- Updated to RCCL API version of 2.6.4
## [RCCL-2.7.0 for ROCm 3.5.0]
### Added
- Compatibility with NCCL 2.6
- Network interface improvements with API v3
### Optimizations
- Fixed issues and improved build time for hip-clang
- Network topology detection
- Improved CPU type detection
- Infiniband adaptive routing support
### Changed
- Switched to hip-clang as default compiler
### Deprecated
- Deprecated hcc build
================================================
FILE: CMakeLists.txt
================================================
# Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
# Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
# CMake version minimum requirements
#==================================================================================================
cmake_minimum_required(VERSION 3.16)
# CMake Toolchain file to define compilers and path to ROCm
#==================================================================================================
# When no toolchain file was supplied by the caller, default to the toolchain-linux.cmake located
# next to this file. This happens before project(), so the choice is in place when the project and
# its CXX language are set up.
if (NOT CMAKE_TOOLCHAIN_FILE)
set(CMAKE_TOOLCHAIN_FILE "${CMAKE_CURRENT_SOURCE_DIR}/toolchain-linux.cmake")
message(STATUS "CMAKE_TOOLCHAIN_FILE: ${CMAKE_TOOLCHAIN_FILE}")
endif()
# RCCL project
#==================================================================================================
# Only CXX is enabled here; C is enabled later with enable_language(C) when BUILD_BFD is ON.
project(rccl CXX)
# Build options
#==================================================================================================
# User-facing switches (override with -D<NAME>=ON|OFF). The third argument is the help text shown
# by cmake-gui / ccmake, the fourth is the default used when the option is not already set.
option(BUILD_ADDRESS_SANITIZER      "Enable address sanitizer"                        OFF)
option(BUILD_BFD                    "Enable custom backtrace (if bfd.h exists)"       OFF)
option(BUILD_LOCAL_GPU_TARGET_ONLY  "Build only for GPUs detected on this machine"    OFF)
option(BUILD_SHARED_LIBS            "Build as shared library"                         ON)
option(BUILD_TESTS                  "Build unit test programs"                        OFF)
option(COLLTRACE                    "Collective Trace Option"                         ON)
option(DUMP_ASM                     "Disassemble and dump"                            OFF)
option(ENABLE_CODE_COVERAGE         "Enable code coverage"                            OFF)
option(ENABLE_MSCCL_KERNEL          "Enable MSCCL while compiling"                    OFF)
option(ENABLE_MSCCLPP               "Enable MSCCL++"                                  OFF)
option(ENABLE_MSCCLPP_CLIP          "Enable MSCCL++ CLIP"                             OFF)
option(ENABLE_MSCCLPP_EXECUTOR      "Enable MSCCL++ Executor"                         OFF)
option(ENABLE_MSCCLPP_FORMAT_CHECKS "Enable formatting checks in MSCCL++"             OFF)
option(ENABLE_NPKIT                 "Enable NPKit"                                    OFF)
option(ENABLE_IFC                   "Enable indirect function call"                   OFF)
option(GENERATE_SYM_KERNELS         "Generate symmetric memory kernels"               OFF)
option(INSTALL_DEPENDENCIES         "Force install dependencies"                      OFF)
option(REPORT_KERNEL_RESOURCE_USE   "Append -Rpass-analysis=kernel to CXX flags"      OFF)
option(ROCTX                        "Enable ROCTX"                                    ON)
option(PROFILE                      "Enable profiling"                                OFF)
option(TIMETRACE                    "Enable time-trace during compilation"            OFF)
option(TRACE                        "Enable additional tracing"                       OFF)
option(FAULT_INJECTION              "Enable fault injection"                          ON)
# Help text typo fixed ("Supress" -> "Suppress"); option name and default are unchanged.
option(QUIET_WARNINGS               "Suppress compiler warnings"                      OFF)
option(ENABLE_ROCSHMEM              "Enable rocSHMEM support in RCCL"                 OFF)
# Default GPU architectures to build
#==================================================================================================
# Seed value for the GPU_TARGETS cache entry; it may be replaced by rocm_local_targets() when
# BUILD_LOCAL_GPU_TARGET_ONLY is ON. Entries are grouped by name prefix, list order is unchanged.
set(DEFAULT_GPUS
  gfx906 gfx908 gfx90a
  gfx942 gfx950
  gfx1030
  gfx1100 gfx1101 gfx1102
  gfx1200 gfx1201)
# Load CMake modules
#==================================================================================================
# Standard check modules. The three Check*CXX* modules are included explicitly because this file
# calls check_cxx_compiler_flag(), check_cxx_source_compiles() and check_include_file_cxx()
# further down; relying on another module to pull them in transitively is fragile. Including a
# module that is already loaded is harmless.
include(CheckCXXCompilerFlag)
include(CheckCXXSourceCompiles)
include(CheckIncludeFileCXX)
include(CheckIncludeFiles)
include(CheckSymbolExists)
include(cmake/Dependencies.cmake) # GTest, rocm-cmake, rocm_local_targets
include(cmake/CheckSymbolExistsNoWarn.cmake)

# Include rocSHMEM build module only if enabled
if(ENABLE_ROCSHMEM)
  include(cmake/ROCSHMEM.cmake)
endif()

# Make the project's own cmake/ directory searchable for later include()/find_package() calls.
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
# Build only for local GPU architecture
# When requested, let rocm_local_targets() overwrite DEFAULT_GPUS; if that command is not
# available, DEFAULT_GPUS is left untouched and a warning is printed.
if(BUILD_LOCAL_GPU_TARGET_ONLY)
  message(STATUS "Building only for local GPU target")
  if(NOT COMMAND rocm_local_targets)
    message(WARNING "Unable to determine local GPU targets. Falling back to default GPUs.")
  else()
    rocm_local_targets(DEFAULT_GPUS)
  endif()
endif()

# Determine which GPU architectures to build for
# A GPU_TARGETS value already present in the cache (e.g. from -DGPU_TARGETS=...) is kept.
set(GPU_TARGETS
  "${DEFAULT_GPUS}"
  CACHE STRING "Target default GPUs if GPU_TARGETS is not defined.")
# ROCM NetIB patch
include(cmake/rocmIb.cmake)

# Modify GPU architectures for Address Sanitizer builds by appending "xnack+"
# Every non-empty entry of GPU_TARGETS is copied to amdgpu_targets; entries that do not already
# contain ":xnack+" get that suffix appended. GPU_TARGETS is then replaced by the rewritten list.
if(BUILD_ADDRESS_SANITIZER)
  set(amdgpu_targets "")
  foreach(gpu_arch IN LISTS GPU_TARGETS)
    # Skip empty list elements
    if(gpu_arch STREQUAL "")
      continue()
    endif()
    string(FIND "${gpu_arch}" ":xnack+" HAS_XNACK_SUFFIX)
    if(NOT HAS_XNACK_SUFFIX EQUAL -1)
      # Already carries the suffix: keep as is
      list(APPEND amdgpu_targets "${gpu_arch}")
    else()
      list(APPEND amdgpu_targets "${gpu_arch}:xnack+")
    endif()
  endforeach()
  set(GPU_TARGETS "${amdgpu_targets}")
endif()
# Check if clang compiler can offload to GPU_TARGETS
# rocm_check_target_ids() is not defined in this file; presumably it is provided by rocm-cmake
# (loaded through cmake/Dependencies.cmake) -- TODO confirm. Its result is stored in SUPPORTED_GPUS.
if (COMMAND rocm_check_target_ids)
message(STATUS "Checking for ROCm support for GPU targets: " "${GPU_TARGETS}")
rocm_check_target_ids(SUPPORTED_GPUS TARGETS ${GPU_TARGETS})
else()
# NOTE(review): this fallback takes DEFAULT_GPUS rather than the current GPU_TARGETS, so a
# user-supplied GPU_TARGETS and any ":xnack+" suffixes added for Address Sanitizer builds are
# dropped on this path -- confirm that is intended.
message(WARNING "Unable to check for supported GPU targets. Falling back to default GPUs.")
set(SUPPORTED_GPUS ${DEFAULT_GPUS})
endif()
# From here on the normal variable GPU_TARGETS holds the filtered/fallback list.
set(GPU_TARGETS "${SUPPORTED_GPUS}")
message(STATUS "Compiling for ${GPU_TARGETS}")
## NOTE: Reload rocm-cmake in order to update GPU_TARGETS
include(cmake/Dependencies.cmake) # Reloading to use desired GPU_TARGETS instead of defaults
# Try to establish ROCM_PATH (for find_package)
#==================================================================================================
# Use the ROCM_PATH CMake variable when it is defined, otherwise assume /opt/rocm. The final value
# is also exported into this CMake process' environment.
if(DEFINED ROCM_PATH)
  message(STATUS "ROCM_PATH found: ${ROCM_PATH}")
else()
  # Guess default location
  set(ROCM_PATH "/opt/rocm")
  message(WARNING "Unable to find ROCM_PATH: Falling back to ${ROCM_PATH}")
endif()
set(ENV{ROCM_PATH} ${ROCM_PATH})
# Classify CMAKE_CXX_COMPILER and record how to query its version:
#   COMPILER_EXE_NAME    - executable name looked up later with find_program()
#   COMPILER_GREP_STRING - marker of the `--version` output line that carries the version
#   COMPILER_AWK_CMD     - awk pipeline extracting the version field from that line
# amdclang++ and clang++ share the same grep/awk settings, so those are set up front and only
# overridden for hipcc. The pattern order (amdclang++, clang++, hipcc) is significant.
set(COMPILER_GREP_STRING "AMD clang version")
set(COMPILER_AWK_CMD "awk -F\" \" '{ printf $4}'")
if("${CMAKE_CXX_COMPILER}" MATCHES ".*amdclang\\+\\+")
  message(STATUS "Compiling with amdclang++")
  set(COMPILER_EXE_NAME amdclang++)
elseif("${CMAKE_CXX_COMPILER}" MATCHES ".*clang\\+\\+")
  message(STATUS "Compiling with clang++")
  set(COMPILER_EXE_NAME clang++)
elseif("${CMAKE_CXX_COMPILER}" MATCHES ".*hipcc$")
  message(STATUS "Compiling with hipcc")
  set(COMPILER_EXE_NAME hipcc)
  set(COMPILER_GREP_STRING "HIP version")
  set(COMPILER_AWK_CMD "awk -F\" \" '{ printf $3}' | awk -F\"-\" '{ printf $1}'")
else()
  # Configuration stops here, so the defaults set above are never observed on this path.
  message(FATAL_ERROR "RCCL can be built only with hipcc or amdclang++")
endif()
# Set CMAKE flags
#==================================================================================================
# C++17 without GNU extensions: the project uses C++17 features, and disabling extensions makes
# CMake pass -std=c++17 instead of -std=gnu++17.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_EXTENSIONS OFF)

# Default install location is the ROCm tree (only takes effect if not already in the cache).
set(CMAKE_INSTALL_PREFIX "${ROCM_PATH}" CACHE PATH "")

# Add ROCM_PATH and its hip/ and llvm/ subdirectories to the CMake search paths (for finding HIP / HSA)
if(ROCM_PATH)
  list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH} ${ROCM_PATH}/hip ${ROCM_PATH}/llvm)
endif()
# Check for required dependencies
#==================================================================================================
## Check for Threads
# Ask the Threads find module to prefer the -pthread flag.
set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED)
## Check for HIP
find_package(hip REQUIRED)
# HIP_COMPILER / HIP_RUNTIME are not set anywhere in this file; presumably they are exported by
# the hip package configuration -- TODO confirm.
message(STATUS "HIP compiler: ${HIP_COMPILER}")
message(STATUS "HIP runtime: ${HIP_RUNTIME}")
# Abort unless the reported HIP compiler name contains "clang".
if(NOT "${HIP_COMPILER}" MATCHES "clang")
message(FATAL_ERROR "RCCL requires clang-based compiler (amdclang++ or hipcc)")
endif()
## Check for compiler version
# NOTE(review): the executable is located by name with find_program() instead of being taken from
# CMAKE_CXX_COMPILER, so the reported version may belong to a different binary than the one that
# actually compiles the project -- confirm this is acceptable.
find_program(compiler_executable ${COMPILER_EXE_NAME})
message(STATUS "${COMPILER_EXE_NAME} executable: ${compiler_executable}")
# Runs `<compiler> --version`, keeps the line containing COMPILER_GREP_STRING and extracts the
# version field with COMPILER_AWK_CMD (both chosen during compiler detection above).
execute_process(
COMMAND bash "-c" "${compiler_executable} --version | grep \"${COMPILER_GREP_STRING}\" | ${COMPILER_AWK_CMD}"
OUTPUT_VARIABLE compiler_version_string)
message(STATUS "${COMPILER_EXE_NAME} version: ${compiler_version_string}")
## Check for HIP version
find_program(hipconfig_executable hipconfig)
message(STATUS "hipconfig executable: ${hipconfig_executable}")
# `hipconfig -v` output is cut at the first "-"; hip_version_string is compared against minimum
# HIP versions further down (indirect function call and LL128 checks).
execute_process(
COMMAND bash "-c" "${hipconfig_executable} -v | awk -F\"-\" '{ printf $1 }'"
OUTPUT_VARIABLE hip_version_string)
message(STATUS "${COMPILER_EXE_NAME} HIP version: ${hip_version_string}")
## Check for ROCm version
# The version string comes from EXPLICIT_ROCM_VERSION when that is set, otherwise from
# <ROCMCORE_PATH>/.info/version (ROCMCORE_PATH defaults to ROCM_PATH). On success this defines
# ROCM_MAJOR_VERSION / ROCM_MINOR_VERSION / ROCM_PATCH_VERSION and the integer
# ROCM_VERSION = 10000 * major + 100 * minor + patch, which is also passed to the compiler as
# -DROCM_VERSION=<n>.
set(EXPLICIT_ROCM_VERSION "" CACHE STRING "Explicit ROCM version to compile to (auto detect if empty)")
if(NOT DEFINED ROCMCORE_PATH)
  set(ROCMCORE_PATH "${ROCM_PATH}" CACHE PATH "Path to ROCm core")
endif()

if(EXPLICIT_ROCM_VERSION)
  set(rocm_version_string "${EXPLICIT_ROCM_VERSION}")
elseif(ROCMCORE_PATH AND EXISTS "${ROCMCORE_PATH}/.info/version")
  message(STATUS "Reading ROCM version from ${ROCMCORE_PATH}/.info/version")
  file(READ "${ROCMCORE_PATH}/.info/version" rocm_version_string)
else()
  # Reached when the version file is missing; checking EXISTS first gives this actionable message
  # instead of a low-level file(READ) failure.
  message(FATAL_ERROR "Could not determine ROCM version: ${ROCMCORE_PATH}/.info/version not found (set EXPLICIT_ROCM_VERSION or set ROCM_PATH to a valid installation)")
endif()

# The input is quoted so that an empty or list-valued string cannot change the number of arguments
# passed to string(REGEX MATCH); an unparsable value then reaches the warning branch below.
string(REGEX MATCH "([0-9]+)\\.([0-9]+)\\.([0-9]+)" rocm_version_matches "${rocm_version_string}")
if(rocm_version_matches)
  set(ROCM_MAJOR_VERSION ${CMAKE_MATCH_1})
  set(ROCM_MINOR_VERSION ${CMAKE_MATCH_2})
  set(ROCM_PATCH_VERSION ${CMAKE_MATCH_3})
  message(STATUS "ROCm version: ${ROCM_MAJOR_VERSION}.${ROCM_MINOR_VERSION}.${ROCM_PATCH_VERSION}")
  # Convert the version components to int for comparison
  math(EXPR ROCM_VERSION "(10000 * ${ROCM_MAJOR_VERSION}) + (100 * ${ROCM_MINOR_VERSION}) + ${ROCM_PATCH_VERSION}")
  add_definitions("-DROCM_VERSION=${ROCM_VERSION}")
else()
  # ROCM_VERSION stays undefined on this path.
  message(WARNING "Failed to extract ROCm version.")
endif()
### Required for checking HIP device symbols when building with amdclang++
# Applies to the check_symbol_exists() calls below until it is unset again.
set(CMAKE_REQUIRED_LIBRARIES hip::device)
# Each check stores its result in the named variable (HIP_UNCACHED_MEMORY,
# HIP_HOST_UNCACHED_MEMORY, HIP_CONTIGUOUS_MEMORY); where those are consumed is outside this section.
### Check for hipDeviceMallocUncached support
check_symbol_exists("hipDeviceMallocUncached" "hip/hip_runtime_api.h" HIP_UNCACHED_MEMORY)
### Check for hipHostMallocUncached support
check_symbol_exists("hipHostMallocUncached" "hip/hip_runtime_api.h" HIP_HOST_UNCACHED_MEMORY)
### Check for hipDeviceMallocContiguous support
check_symbol_exists("hipDeviceMallocContiguous" "hip/hip_runtime_api.h" HIP_CONTIGUOUS_MEMORY)
# Clear the setting so later checks do not link against hip::device.
unset(CMAKE_REQUIRED_LIBRARIES)
### Check for indirect function call support
# IFC_ENABLED starts OFF and is switched ON only when the user asked for it (ENABLE_IFC) and the
# detected HIP version is new enough.
set(IFC_ENABLED OFF)
if(ENABLE_IFC)
  if("${hip_version_string}" VERSION_GREATER_EQUAL "5.5.30201")
    set(IFC_ENABLED ON)
    message(STATUS "Indirect function call enabled")
  else()
    message(WARNING "Indirect function call disabled - requires HIP version >= 5.5.30201")
  endif()
endif()

## Check for LL128 support
# LL128_ENABLED is set only on the success path; it is left untouched otherwise.
if("${hip_version_string}" VERSION_LESS "6.1.33591")
  message(STATUS "RCCL LL128 protocol disabled - requires HIP version >= 6.1.33591")
else()
  set(LL128_ENABLED ON)
  message(STATUS "RCCL LL128 protocol enabled")
endif()
## Check for hsa-runtime64
find_package(hsa-runtime64 REQUIRED)
# Only used for the status message below.
get_target_property(HSA_INCLUDE_PATH hsa-runtime64::hsa-runtime64 INTERFACE_INCLUDE_DIRECTORIES)
message(STATUS "HSA runtime: ${HSA_INCLUDE_PATH}")
## Check for amd-smi if ROCm 7.11.0 or newer
# ROCM_VERSION is the integer 10000 * major + 100 * minor + patch computed earlier, so 71100
# corresponds to 7.11.0.
if(ROCM_VERSION VERSION_GREATER_EQUAL "71100")
# Not REQUIRED: when amd_smi is missing, USE_AMDSMI stays unset and the rocm-smi path below runs.
find_package(amd_smi PATHS ${ROCM_PATH}/lib/cmake/amd_smi)
if(amd_smi_FOUND)
message(STATUS "amd_smi_INCLUDE_DIR: ${amd_smi_INCLUDE_DIR}")
message(STATUS "amd_smi_LIB_DIR: ${amd_smi_LIB_DIR}")
# NOTE(review): these are INTERNAL cache entries, so they persist across re-configures; nothing in
# this section resets USE_AMDSMI if amd_smi is no longer found on a later run -- verify.
set(SMI_INCLUDE_DIR "${amd_smi_INCLUDE_DIR}" CACHE INTERNAL "amd-smi include directory")
set(SMI_LIB_DIR "${amd_smi_LIB_DIR}" CACHE INTERNAL "amd-smi library directory")
set(SMI_LIB_NAME "amd-smi-lib" CACHE INTERNAL "amd-smi-lib for packaging")
# Sanity check: both reported directories must exist on disk.
if(NOT EXISTS "${SMI_INCLUDE_DIR}" OR NOT EXISTS "${SMI_LIB_DIR}")
message(FATAL_ERROR "amd_smi not found in ${SMI_INCLUDE_DIR}")
endif()
message(STATUS "Found amd_smi at ${SMI_INCLUDE_DIR}")
set(SMI_LIBRARIES amd_smi)
set(USE_AMDSMI ON CACHE INTERNAL "Use amd-smi instead of rocm-smi")
endif()
endif()
if(NOT USE_AMDSMI)
  ## Fallback to rocm-smi if amd-smi not found or ROCm < 7.11.0
  # Warn only when amd_smi was actually searched for (ROCm >= 7.11.0). For older ROCm versions
  # rocm_smi is the expected choice, so a plain status message is printed instead of a warning.
  if(ROCM_VERSION VERSION_GREATER_EQUAL "71100")
    message(WARNING "Could not find amd_smi. Falling back to rocm_smi.")
  else()
    message(STATUS "amd_smi is only used with ROCm 7.11.0 or newer. Using rocm_smi.")
  endif()

  find_package(rocm_smi PATHS ${ROCM_PATH}/lib/cmake/rocm_smi)
  if(rocm_smi_FOUND)
    set(SMI_INCLUDE_DIR "${rocm_smi_INCLUDE_DIR}" CACHE INTERNAL "rocm-smi include directory")
    set(SMI_LIB_DIR "${rocm_smi_LIB_DIR}" CACHE INTERNAL "rocm-smi library directory")
  else()
    # No package config available: try the legacy <ROCM_PATH>/rocm_smi layout.
    message(WARNING "CMake could not find rocm-smi. Checking old include directory structure for rocm_smi")
    set(SMI_INCLUDE_DIR "${ROCM_PATH}/rocm_smi/include")
    set(SMI_LIB_DIR "${ROCM_PATH}/rocm_smi/lib")
  endif()

  # Whichever layout was chosen, both directories must exist on disk.
  if(NOT EXISTS "${SMI_INCLUDE_DIR}" OR NOT EXISTS "${SMI_LIB_DIR}")
    message(FATAL_ERROR "rocm_smi not found in ${SMI_INCLUDE_DIR}")
  endif()
  message(STATUS "Found rocm_smi at ${SMI_INCLUDE_DIR}")
  set(SMI_LIB_NAME "rocm-smi-lib" CACHE INTERNAL "rocm-smi-lib for packaging")
  set(SMI_LIBRARIES rocm_smi64)

  check_include_file_cxx("${SMI_INCLUDE_DIR}/rocm_smi/rocm_smi64Config.h" HAVE_ROCM_SMI64CONFIG)

  ### Check for RSMI_INIT_FLAG_THRAD_ONLY_MUTEX support
  # The header text is searched for the identifier (spelled "THRAD" here, exactly as searched).
  file(READ "${SMI_INCLUDE_DIR}/rocm_smi/rocm_smi.h" rocm_smi_incl)
  string(FIND "${rocm_smi_incl}" "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX" matchres)
  if(matchres EQUAL -1)
    message(STATUS "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX not supported")
  else()
    message(STATUS "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX supported")
    set(HAVE_ROCM_SMI_THREAD_ONLY_MUTEX True)
  endif()
endif()
## Check for BFD library if custom backtrace is requested
if(BUILD_BFD)
# The project was declared with CXX only; C is enabled here before the bfd.h header check
# (presumably because check_include_files() uses the C compiler -- TODO confirm).
enable_language(C)
check_include_files(bfd.h HAVE_BFD)
if (HAVE_BFD)
message(STATUS "-- Found BFD support")
### Required for checking HIP device symbols when building with amdclang++
set(CMAKE_REQUIRED_LIBRARIES hip::device)
# Check for specific BFD feature support
# The results record which accessor declarations this bfd.h provides.
CHECK_SYMBOL_EXISTS(bfd_get_section_flags "bfd.h" HAVE_DECL_BFD_GET_SECTION_FLAGS)
CHECK_SYMBOL_EXISTS(bfd_get_section_vma "bfd.h" HAVE_DECL_BFD_GET_SECTION_VMA)
# Compile probe: succeeds only if bfd_section_size() can be called with two arguments.
CHECK_CXX_SOURCE_COMPILES(
"#include <bfd.h>
int main (int argc, char **argv){
bfd_size_type size;
bfd abfd;
asection sec;
size = bfd_section_size(&abfd, &sec);
return (int)(size);
}"
HAVE_TWO_ARG_BFD_SECTION_SIZE)
unset(CMAKE_REQUIRED_LIBRARIES)
# Check for iberty support
# HAVE_IBERTY receives the library path when found; no message is printed when it is missing.
find_library(HAVE_IBERTY iberty PATHS /usr/lib64 /usr/lib/ PATH_SUFFIXES x86_64-linux-gnu)
if(HAVE_IBERTY)
message(STATUS "iberty found @ ${HAVE_IBERTY}")
endif()
# Check for demangle support
# A missing demangle.h only produces a warning; configuration continues.
find_path(DEMANGLE_DIR demangle.h PATHS /usr/include PATH_SUFFIXES libiberty)
if(NOT DEMANGLE_DIR)
message(WARNING "Could not find demangle.h ${DEMANGLE_DIR}")
else()
message(STATUS "Found demangle.h in ${DEMANGLE_DIR}")
endif()
else()
message(WARNING "bfd.h header not found - Disabling custom backtrace")
endif()
endif()
# Probe optional compiler flags; each result variable is only reported here

## -mllvm --amdgpu-kernarg-preload-count
check_cxx_compiler_flag(
  "-mllvm --amdgpu-kernarg-preload-count=16"
  HAVE_KERNARG_PRELOAD
)
if(HAVE_KERNARG_PRELOAD)
  message(STATUS "Kernarg preloading to SGPR enabled")
endif()

## -parallel-jobs
check_cxx_compiler_flag(
  "-parallel-jobs=12"
  HAVE_PARALLEL_JOBS
)
if(HAVE_PARALLEL_JOBS)
  message(STATUS "Parallel jobs enabled")
endif()
## MSCCL++ is switched off whenever the build environment cannot support it.
## Currently MSCCL++ is supported only on gfx942 and gfx950, and only on Ubuntu and CentOS
set(MSCCLPP_SUPPORTED_ARCHS "gfx942" "gfx942:xnack-" "gfx942:xnack+" "gfx950" "gfx950:xnack-" "gfx950:xnack+")

# Keep only the requested GPU_TARGETS that MSCCL++ supports
set(MSCCLPP_GPU_TARGETS "")
foreach(TARGET_ARCH IN LISTS GPU_TARGETS)
  list(FIND MSCCLPP_SUPPORTED_ARCHS "${TARGET_ARCH}" supported_idx)
  if(supported_idx GREATER -1)
    list(APPEND MSCCLPP_GPU_TARGETS "${TARGET_ARCH}")
  endif()
endforeach()

# ARCH_MATCH_FOUND records whether at least one requested target is supported
list(LENGTH MSCCLPP_GPU_TARGETS matched_count)
if(matched_count GREATER 0)
  set(ARCH_MATCH_FOUND ON)
else()
  set(ARCH_MATCH_FOUND OFF)
endif()

set(MSCCLPP_GPU_TARGETS "${MSCCLPP_GPU_TARGETS}" CACHE STRING "GPU Targets supported by MSCCL++" FORCE)

if(ENABLE_MSCCLPP AND NOT ARCH_MATCH_FOUND)
  set(ENABLE_MSCCLPP OFF)
  message(WARNING "Can only build MSCCL++ for supported GPU_TARGETS: ${MSCCLPP_SUPPORTED_ARCHS}; current GPU_TARGETS: ${GPU_TARGETS}; so disabling MSCCL++ build")
endif()

# MSCCL++ is only supported on ROCm 6.2.0 or newer
if(ENABLE_MSCCLPP AND ROCM_VERSION VERSION_LESS "60200")
  set(ENABLE_MSCCLPP OFF)
  message(WARNING "MSCCL++ integration only supported on ROCm 6.2.0 or greater; disabling MSCCL++ build")
endif()
## Disable WARP_SPEED if the build environment is invalid
## WARP_SPEED is only built when at least one entry of GPU_TARGETS is in the supported list below.
set(WARP_SPEED_SUPPORTED_ARCHS "gfx942" "gfx942:xnack-" "gfx942:xnack+" "gfx950" "gfx950:xnack-" "gfx950:xnack+")

# ARCH_MATCH_FOUND is reset here because the MSCCL++ check reuses the same variable name
set(ARCH_MATCH_FOUND OFF)
foreach(ARCH IN LISTS GPU_TARGETS)
  if(ARCH IN_LIST WARP_SPEED_SUPPORTED_ARCHS)
    set(ARCH_MATCH_FOUND ON)
  endif()
endforeach()

if(NOT ARCH_MATCH_FOUND)
  # Only warn when the feature was actually requested (mirrors the MSCCL++ check);
  # the variable is still forced OFF unconditionally so its final value is unchanged.
  if(ENABLE_WARP_SPEED)
    message(WARNING "Can only build WARP_SPEED for supported GPU_TARGETS: ${WARP_SPEED_SUPPORTED_ARCHS}; current GPU_TARGETS: ${GPU_TARGETS}; so disabling WARP_SPEED build")
  endif()
  set(ENABLE_WARP_SPEED OFF)
endif()
# Detect the host distribution from /etc/os-release.
# HOST_OS_ID receives the value of the ID= line and HOST_OS_FAMILY the value of the
# ID_LIKE= line (surrounding double quotes removed). Either one is empty when the
# file or the key is missing.
# cmake_host_system_information(RESULT HOST_OS_ID QUERY DISTRIB_ID) ## Requires cmake 3.22
execute_process(
  COMMAND bash -c "grep '^ID=' /etc/os-release | cut -d'=' -f2 | cut -d'\"' -f2"
  OUTPUT_VARIABLE HOST_OS_ID
  OUTPUT_STRIP_TRAILING_WHITESPACE
)
execute_process(
  COMMAND bash -c "grep '^ID_LIKE=' /etc/os-release | cut -d'=' -f2 | cut -d'\"' -f2"
  OUTPUT_VARIABLE HOST_OS_FAMILY
  OUTPUT_STRIP_TRAILING_WHITESPACE
)

# The operands are quoted so that an empty HOST_OS_ID compares as an empty string
# instead of vanishing from the condition and turning it into a configure error.
if(ENABLE_MSCCLPP AND NOT ("${HOST_OS_ID}" STREQUAL "ubuntu" OR "${HOST_OS_ID}" STREQUAL "centos"))
  set(ENABLE_MSCCLPP OFF)
  message(WARNING "MSCCL++ integration not supported on this OS (${HOST_OS_ID}); disabling MSCCL++ build")
endif()
# ROCTX support: enabled only when both the roctx64 library and roctracer/roctx.h are located
if(ROCTX)
  find_library(ROCTX_LIB NAMES roctx64)
  find_path(ROCTRACER_INCLUDE_DIR "roctracer/roctx.h")

  if(NOT ROCTX_LIB OR NOT ROCTRACER_INCLUDE_DIR)
    message(WARNING "ROCTX library not found. Skipping ROCTX linking.")
  else()
    set(ROCTX_ENABLE ON)
    message(STATUS "ROCTX include directory found: ${ROCTRACER_INCLUDE_DIR}")
    message(STATUS "ROCTX library found: ${ROCTX_LIB}")
  endif()
endif()
# Determine version from makefiles/version.mk and fill in templates
#==================================================================================================
## parse version from Makefile NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH must exist
## NCCL_SUFFIX is optional
## NCCL_VERSION formatting is ((X) * 1000 + (Y) * 100 + (Z)) so we must first detect one or two digits first
file(READ makefiles/version.mk version_mk_text)

## Required numeric fields: each must be assigned with ":=" and carry at least one digit.
## ("[0-9]+" rather than "[0-9]*", so a key that is present but has no value is reported
## as a parse failure instead of silently producing an empty variable.)
foreach(VERSION_FIELD NCCL_MAJOR NCCL_MINOR NCCL_PATCH)
  if("${version_mk_text}" MATCHES "${VERSION_FIELD} *:= *([0-9]+)")
    set(${VERSION_FIELD} ${CMAKE_MATCH_1})
  else()
    message(FATAL_ERROR "Failed to parse ${VERSION_FIELD}")
  endif()
endforeach()

## Optional suffix: left unset when absent or empty
if("${version_mk_text}" MATCHES "NCCL_SUFFIX *:= *([0-9]*)")
  set(NCCL_SUFFIX ${CMAKE_MATCH_1})
else()
  set(NCCL_SUFFIX)
endif()

if("${version_mk_text}" MATCHES "PKG_REVISION *:= *([0-9]+)")
  set(PKG_REVISION ${CMAKE_MATCH_1})
else()
  message(FATAL_ERROR "Failed to parse PKG_REVISION")
endif()

## Build the numeric version code by concatenation, padding a one-digit patch level with a 0
## NOTE(review): a one-digit NCCL_MINOR is not padded, so the result then has fewer digits;
## confirm this matches the NCCL_VERSION() encoding expected by src/nccl.h.in.
if("${NCCL_PATCH}" MATCHES "[0-9][0-9]")
  set(NCCL_VERSION "${NCCL_MAJOR}${NCCL_MINOR}${NCCL_PATCH}")
else()
  set(NCCL_VERSION "${NCCL_MAJOR}${NCCL_MINOR}0${NCCL_PATCH}")
endif()

## Setup VERSION
set(VERSION_STRING "${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}")
rocm_setup_version(VERSION ${VERSION_STRING})

## Fill in version information for main header file
configure_file(src/nccl.h.in ${PROJECT_BINARY_DIR}/include/rccl/rccl.h) # For external linking
configure_file(src/nccl.h.in ${PROJECT_BINARY_DIR}/include/nccl.h) # Used by some internal files
# Collect list of all source files
#==================================================================================================
# E.g: find src -type f \( -name "*.cc" -o -name "*.h" -o -name "*.hpp" \) | sort
# Every path below is relative to CMAKE_SOURCE_DIR. The hipify loop further down aborts
# the configure step if a listed file does not exist, and creates a hipified copy of each
# entry in the build tree. Entries that are commented out are not part of the list.
set(SRC_FILES
# --- src/ (top level) ---
src/allocator.cc
src/bootstrap.cc
src/ce_coll.cc
src/channel.cc
src/collectives.cc
src/commDump.cc
src/debug.cc
src/dev_runtime.cc
src/enqueue.cc
src/group.cc
src/init.cc
src/init_nvtx.cc
src/mnnvl.cc
src/msccl.cc
src/proxy.cc
src/rccl_wrap.cc
src/sym_kernels.cc
src/transport.cc
# --- src/device ---
src/device/all_gather.h
src/device/all_reduce.h
src/device/alltoall_pivot.h
src/device/alltoall_gda.h
src/device/broadcast.h
src/device/common.h
src/device/common_kernel.h
src/device/op128.h
src/device/primitives.h
src/device/prims_ll128.h
src/device/prims_ll.h
src/device/prims_simple.h
src/device/reduce.h
src/device/reduce_kernel.h
src/device/reduce_scatter.h
src/device/rccl_metadata.h
src/device/rccl_ptr.h
src/device/sendrecv.h
src/device/common.cu
src/device/onerank.cu
src/device/network/unpack/unpack_defs.h
src/device/network/unpack/unpack.h
src/device/symmetric/all_gather.cuh
src/device/symmetric/all_reduce.cuh
src/device/symmetric/kernel.cuh
src/device/symmetric/primitives.cuh
src/device/symmetric/reduce_scatter.cuh
# --- src/graph ---
src/graph/connect.cc
src/graph/paths.cc
src/graph/rings.cc
src/graph/rings.h
src/graph/rome_models.cc
src/graph/rome_models.h
src/graph/search.cc
src/graph/topo.cc
src/graph/topo.h
src/graph/trees.cc
src/graph/tuning.cc
src/graph/xml.cc
src/graph/xml.h
# --- src/include ---
src/include/alloc.h
src/include/allocator.h
src/include/alt_rsmi.h
src/include/archinfo.h
src/include/api_trace.h
src/include/argcheck.h
src/include/BfdBacktrace.hpp
src/include/bitops.h
src/include/bootstrap.h
src/include/ce_coll.h
src/include/channel.h
src/include/checks.h
src/include/collectives.h
src/include/coll_net.h
src/include/comm.h
src/include/core.h
src/include/cpuset.h
# src/include/cudawrap.h
src/include/debug.h
src/include/dev_runtime.h
src/include/device.h
src/include/enqueue.h
src/include/gdrwrap.h
src/include/git_version.h
src/include/graph.h
src/include/group.h
src/include/hip_rocm_version_info.h
src/include/ibvcore.h
src/include/ibvsymbols.h
src/include/ibvwrap.h
src/include/info.h
src/include/ipcsocket.h
src/include/mnnvl.h
src/include/nccl_common.h
src/include/nccl_device.h
src/include/net_device.h
src/include/net.h
src/include/nvmlwrap.h
src/include/nvtx.h
src/include/nvtx_payload_schemas.h
src/include/nvtx_stub.h
src/include/p2p.h
src/include/param.h
src/include/profiler.h
src/include/proxy.h
src/include/ras.h
src/include/rccl_common.h
src/include/rccl_vars.h
src/include/register.h
src/include/register_inline.h
src/include/rccl_float8.h
src/include/rocmwrap.h
src/include/roctx.h
src/include/recorder.h
src/include/scheduler.h
src/include/shm.h
src/include/shmutils.h
src/include/signals.h
src/include/socket.h
src/include/strongstream.h
src/include/sym_kernels.h
src/include/timer.h
src/include/transport.h
src/include/trees.h
src/include/tuner.h
src/include/utils.h
src/include/mlx5/mlx5dvcore.h
src/include/mlx5/mlx5dvsymbols.h
src/include/mlx5/mlx5dvwrap.h
src/include/ionic/ionicdvcore.h
src/include/ionic/ionicdvsymbols.h
src/include/ionic/ionicdvwrap.h
src/include/msccl/msccl_lifecycle.h
src/include/msccl/msccl_parser.h
src/include/msccl/msccl_scheduler.h
src/include/msccl/msccl_setup.h
src/include/msccl/msccl_status.h
src/include/msccl/msccl_struct.h
src/include/nccl_device/comm.h
src/include/nccl_device/coop.h
src/include/nccl_device/core.h
src/include/nccl_device/ll_a2a.h
src/include/nccl_device/mem_barrier.h
src/include/nccl_device/ptr.h
src/include/nccl_device/utility.h
src/include/nccl_device/impl/comm__funcs.h
src/include/nccl_device/impl/comm__types.h
src/include/nccl_device/impl/core__funcs.h
src/include/nccl_device/impl/core__types.h
src/include/nccl_device/impl/ll_a2a__funcs.h
src/include/nccl_device/impl/ll_a2a__types.h
src/include/nccl_device/impl/mem_barrier__funcs.h
src/include/nccl_device/impl/mem_barrier__types.h
src/include/nccl_device/impl/ptr__funcs.h
src/include/nccl_device/impl/ptr__types.h
src/include/npkit/npkit.h
src/include/npkit/npkit_event.h
src/include/npkit/npkit_struct.h
src/include/nvtx3/nvToolsExt.h
src/include/nvtx3/nvToolsExtCounters.h
src/include/nvtx3/nvToolsExtCuda.h
src/include/nvtx3/nvToolsExtCudaRt.h
src/include/nvtx3/nvToolsExtMem.h
src/include/nvtx3/nvToolsExtMemCudaRt.h
src/include/nvtx3/nvToolsExtOpenCL.h
src/include/nvtx3/nvToolsExtPayload.h
src/include/nvtx3/nvToolsExtPayloadHelper.h
src/include/nvtx3/nvToolsExtSemanticsCounters.h
src/include/nvtx3/nvToolsExtSemanticsScope.h
src/include/nvtx3/nvToolsExtSync.h
src/include/nvtx3/nvtx3.hpp
src/include/nvtx3/nvtxDetail/nvtxExtHelperMacros.h
src/include/nvtx3/nvtxDetail/nvtxExtImpl.h
src/include/nvtx3/nvtxDetail/nvtxExtImplCounters_v1.h
src/include/nvtx3/nvtxDetail/nvtxExtImplMem_v1.h
src/include/nvtx3/nvtxDetail/nvtxExtImplMemCudaRt_v1.h
src/include/nvtx3/nvtxDetail/nvtxExtImplPayload_v1.h
src/include/nvtx3/nvtxDetail/nvtxExtInit.h
src/include/nvtx3/nvtxDetail/nvtxExtPayloadHelperInternal.h
src/include/nvtx3/nvtxDetail/nvtxExtPayloadTypeInfo.h
src/include/nvtx3/nvtxDetail/nvtxExtTypes.h
src/include/nvtx3/nvtxDetail/nvtxImpl.h
src/include/nvtx3/nvtxDetail/nvtxImplCore.h
src/include/nvtx3/nvtxDetail/nvtxImplCuda_v3.h
src/include/nvtx3/nvtxDetail/nvtxImplCudaRt_v3.h
src/include/nvtx3/nvtxDetail/nvtxImplOpenCL_v3.h
src/include/nvtx3/nvtxDetail/nvtxImplSync_v3.h
src/include/nvtx3/nvtxDetail/nvtxInit.h
src/include/nvtx3/nvtxDetail/nvtxInitDecls.h
src/include/nvtx3/nvtxDetail/nvtxInitDefs.h
src/include/nvtx3/nvtxDetail/nvtxLinkOnce.h
src/include/nvtx3/nvtxDetail/nvtxTypes.h
src/include/proxy_trace/proxy_trace.h
src/include/plugin/nccl_net.h
src/include/plugin/nccl_profiler.h
src/include/plugin/nccl_tuner.h
src/include/plugin/plugin.h
src/include/plugin/net/net_v6.h
src/include/plugin/net/net_v7.h
src/include/plugin/net/net_v8.h
src/include/plugin/net/net_v9.h
src/include/plugin/net/net_v10.h
src/include/plugin/net/net_v11.h
src/include/plugin/profiler/net_ib_v1.h
src/include/plugin/profiler/net_ib.h
src/include/plugin/profiler/net_socket_v1.h
src/include/plugin/profiler/net_socket.h
src/include/plugin/profiler/profiler_v1.h
src/include/plugin/profiler/profiler_v2.h
src/include/plugin/profiler/profiler_v3.h
src/include/plugin/profiler/profiler_v4.h
src/include/plugin/profiler/profiler_v5.h
src/include/plugin/tuner/tuner_v2.h
src/include/plugin/tuner/tuner_v3.h
src/include/plugin/tuner/tuner_v4.h
src/include/plugin/tuner/tuner_v5.h
# --- src/misc ---
src/misc/alt_rsmi.cc
src/misc/archinfo.cc
src/misc/argcheck.cc
src/misc/api_trace.c
src/misc/api_trace.cc
# src/misc/cudawrap.cc
# src/misc/gdrwrap.cc
src/misc/ibvsymbols.cc
src/misc/ibvwrap.cc
src/misc/ipcsocket.cc
src/misc/mlx5dvsymbols.cc
src/misc/mlx5dvwrap.cc
src/misc/ionicdvsymbols.cc
src/misc/ionicdvwrap.cc
src/misc/npkit.cc
# src/misc/nvmlwrap.cc
src/misc/nvmlwrap_stub.cc
src/misc/param.cc
src/misc/rocmwrap.cc
src/misc/roctx.cc
src/misc/recorder.cc
src/misc/shmutils.cc
src/misc/signals.cc
src/misc/socket.cc
src/misc/strongstream.cc
src/misc/utils.cc
src/misc/msccl/msccl_lifecycle.cc
src/misc/msccl/msccl_parser.cc
src/misc/msccl/msccl_setup.cc
src/misc/msccl/msccl_status.cc
src/misc/proxy_trace/proxy_trace.cc
# --- src/nccl_device ---
src/nccl_device/core.cc
src/nccl_device/ll_a2a.cc
src/nccl_device/mem_barrier.cc
# --- src/plugin ---
src/plugin/net.cc
src/plugin/plugin_open.cc
src/plugin/profiler.cc
src/plugin/tuner.cc
src/plugin/net/net_v6.cc
src/plugin/net/net_v7.cc
src/plugin/net/net_v8.cc
src/plugin/net/net_v9.cc
src/plugin/net/net_v10.cc
src/plugin/net/net_v11.cc
src/plugin/profiler/profiler_v1.cc
src/plugin/profiler/profiler_v2.cc
src/plugin/profiler/profiler_v3.cc
src/plugin/profiler/profiler_v4.cc
src/plugin/profiler/profiler_v5.cc
src/plugin/tuner/tuner_v2.cc
src/plugin/tuner/tuner_v3.cc
src/plugin/tuner/tuner_v4.cc
src/plugin/tuner/tuner_v5.cc
# --- src/ras ---
src/ras/client.cc
src/ras/client_support.cc
src/ras/collectives.cc
src/ras/peers.cc
src/ras/ras.cc
src/ras/ras_internal.h
src/ras/rasnet.cc
# --- src/register ---
src/register/coll_reg.cc
src/register/register.cc
src/register/sendrecv_reg.cc
# --- src/scheduler ---
src/scheduler/symmetric_sched.cc
# --- src/transport ---
src/transport/coll_net.cc
src/transport/generic.cc
src/transport/net.cc
src/transport/net_ib.cc
src/transport/net_ib_rocm.cc
src/transport/net_socket.cc
src/transport/nvls.cc
src/transport/p2p.cc
src/transport/profiler.cc
src/transport/shm.cc
# --- latency_profiler (headers and sources) ---
src/include/latency_profiler/CollTrace.h
src/include/latency_profiler/CollTraceEvent.h
src/include/latency_profiler/CollTraceFunc.h
src/include/latency_profiler/CollTraceUtils.h
src/include/latency_profiler/EventQueue.h
src/misc/latency_profiler/CollTrace.cc
src/misc/latency_profiler/CollTraceEvent.cc
src/misc/latency_profiler/CollTraceFunc.cc
src/misc/latency_profiler/CollTraceUtils.cc
)
# Sources that depend on the configuration are appended to SRC_FILES below

## SMI wrapper: pick the amd-smi or the rocm-smi flavour (header + implementation)
if(USE_AMDSMI)
  set(smi_wrap_name "amdsmi_wrap")
else()
  set(smi_wrap_name "rocm_smi_wrap")
endif()
set(SMI_SOURCES
  src/include/${smi_wrap_name}.h
  src/misc/${smi_wrap_name}.cc
)
list(APPEND SRC_FILES ${SMI_SOURCES})

## MSCCL kernel headers
if(ENABLE_MSCCL_KERNEL)
  set(MSCCL_KERNEL_SOURCES src/device/msccl_kernel_impl.h src/include/msccl/msccl_kernel.h)
  list(APPEND SRC_FILES ${MSCCL_KERNEL_SOURCES})
endif()

## MSCCL++ shim
if(ENABLE_MSCCLPP)
  set(MSCCLPP_SOURCES src/include/mscclpp/mscclpp_nccl.h src/misc/mscclpp/mscclpp_nccl.cc)
  list(APPEND SRC_FILES ${MSCCLPP_SOURCES})
endif()
# Hipify source files (copy of source generated into hipify directory)
#==================================================================================================
find_program(hipify-perl_executable hipify-perl)
if(NOT hipify-perl_executable)
  message(FATAL_ERROR "hipify-perl not found")
endif()

set(HIPIFY_DIR "${CMAKE_CURRENT_BINARY_DIR}/hipify")

## One custom command per listed source file produces its hipified counterpart
foreach(SRC_FILE ${SRC_FILES})
  # Refuse to continue when the list references a file that is not on disk
  if(NOT EXISTS ${CMAKE_SOURCE_DIR}/${SRC_FILE})
    message(FATAL_ERROR "Unable to find file listed in CMakeLists.txt: ${CMAKE_SOURCE_DIR}/${SRC_FILE}")
  endif()

  # Location of the hipified copy inside the build tree
  set(HIP_FILE "${HIPIFY_DIR}/${SRC_FILE}")
  get_filename_component(HIP_FILE_DIR ${HIP_FILE} DIRECTORY)

  # Make sure the file name is unique and there is no duplicate
  add_file_unique(HIP_SOURCES ${HIP_FILE})

  # Rename .cuh -> .h and .cu -> .cu.cpp so that the outputs get processed properly
  string(REPLACE "\.cuh" "\.h" HIP_FILE ${HIP_FILE})
  string(REPLACE "\.cu" "\.cu.cpp" HIP_FILE ${HIP_FILE})
  list(APPEND HIP_SOURCES ${HIP_FILE})

  # Optional trailing step: run add_faults.sh only for FAULT_INJECTION builds
  set(FAULT_INJECTION_STEP)
  if(FAULT_INJECTION)
    set(FAULT_INJECTION_STEP
      && ${CMAKE_COMMAND} -E env bash ${CMAKE_CURRENT_SOURCE_DIR}/cmake/scripts/add_faults.sh ${HIP_FILE})
  endif()

  # mkdir -> hipify-perl -> add_unroll.sh [-> add_faults.sh], chained with && in one command
  add_custom_command(
    OUTPUT ${HIP_FILE}
    COMMAND mkdir -p ${HIP_FILE_DIR}
            && ${hipify-perl_executable} -quiet-warnings ${CMAKE_SOURCE_DIR}/${SRC_FILE} -o ${HIP_FILE}
            && ${CMAKE_COMMAND} -E env bash ${CMAKE_CURRENT_SOURCE_DIR}/cmake/scripts/add_unroll.sh ${HIP_FILE}
            ${FAULT_INJECTION_STEP}
    MAIN_DEPENDENCY ${SRC_FILE}
    COMMENT "Hipifying ${SRC_FILE} -> ${HIP_FILE}"
  )
endforeach()

# Adding custom target to hipify all the source files
# This is required to make sure that all the hipified source files are
# available before compiling the unit tests executable(s)
add_custom_target(hipify_all DEPENDS ${HIP_SOURCES})
# Generate device/host tables and all the collective functions that are going to be in librccl.so
#==================================================================================================
find_package(Python3 COMPONENTS Interpreter REQUIRED)
if(NOT Python3_FOUND)
  message(FATAL_ERROR "RCCL requires Python3 for generating host/device tables")
endif()

## Output locations of the generators
set(GEN_DIR "${HIPIFY_DIR}/gensrc")
set(GEN_SYM_DIR "${GEN_DIR}/symmetric")

if(ONLY_FUNCS)
  message(WARNING "Using ONLY_FUNCS = ${ONLY_FUNCS}. Not meant for release builds.")
endif()

## Run src/device/generate.py at configure time; a non-zero exit code aborts the configuration
execute_process(
  COMMAND ${Python3_EXECUTABLE} ${CMAKE_SOURCE_DIR}/src/device/generate.py ${GEN_DIR} ${IFC_ENABLED} ${COLLTRACE} ${ENABLE_MSCCL_KERNEL} ${BUILD_LOCAL_GPU_TARGET_ONLY} ${ONLY_FUNCS}
  WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
  RESULT_VARIABLE gen_py_result
  ERROR_VARIABLE gen_py_error
)
if(gen_py_result)
  message(SEND_ERROR "Error: ${gen_py_error}")
  message(FATAL_ERROR "${CMAKE_SOURCE_DIR}/src/device/generate.py failed")
endif()

## Optionally run the symmetric memory kernel generator the same way
if(GENERATE_SYM_KERNELS)
  execute_process(
    COMMAND ${Python3_EXECUTABLE} ${CMAKE_SOURCE_DIR}/src/device/symmetric/generate.py ${GEN_SYM_DIR}
    WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
    RESULT_VARIABLE gen_sym_py_result
    ERROR_VARIABLE gen_sym_py_error
  )
  if(gen_sym_py_result)
    message(SEND_ERROR "Error: ${gen_sym_py_error}")
    message(FATAL_ERROR "${CMAKE_SOURCE_DIR}/src/device/symmetric/generate.py failed")
  endif()
endif()

## Everything found below GEN_DIR becomes part of the library sources
file(GLOB_RECURSE GENERATED_FILES "${GEN_DIR}/*")
list(APPEND HIP_SOURCES ${GENERATED_FILES})
# Create an initial git_version.cpp file (that will be updated with latest git version)
#==================================================================================================
## An empty placeholder is written at configure time
file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/git_version.cpp "")

## update_git_version is part of ALL and runs cmake/scripts/git_version.cmake on every build;
## git_version.cpp is declared as its byproduct
add_custom_target(
  update_git_version ALL
  COMMAND
    ${CMAKE_COMMAND}
    -DRCCL_SOURCE_DIR=${CMAKE_CURRENT_SOURCE_DIR}
    -DRCCL_BINARY_DIR=${CMAKE_CURRENT_BINARY_DIR}
    -P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/scripts/git_version.cmake
  BYPRODUCTS ${CMAKE_CURRENT_BINARY_DIR}/git_version.cpp
  COMMENT "Updating git version information"
  VERBATIM
)

## The file is compiled together with the other library sources
list(APPEND HIP_SOURCES ${CMAKE_CURRENT_BINARY_DIR}/git_version.cpp)
# Set up RCCL library
#==================================================================================================
## Set RCCL source files
add_library(rccl ${HIP_SOURCES})

## Set RCCL dependencies
## Ensure git version is updated before building rccl
add_dependencies(rccl update_git_version)

## Set RCCL include directories (order preserved; empty variables simply contribute nothing)
target_include_directories(rccl PRIVATE
  ${PROJECT_BINARY_DIR}/include             # for generated rccl.h header
  ${HIPIFY_DIR}/src                         # for hipfied headers
  ${HIPIFY_DIR}/src/device
  ${HIPIFY_DIR}/src/device/network/unpack
  ${HIPIFY_DIR}/src/include
  ${HIPIFY_DIR}/src/include/mlx5
  ${HIPIFY_DIR}/src/include/nccl_device
  ${HIPIFY_DIR}/src/include/ionic
  ${HIPIFY_DIR}/src/include/plugin
  ${HIPIFY_DIR}/gensrc
  ${HSA_INCLUDE_PATH}
  ${ROCM_SMI_INCLUDE_DIR}                   # legacy variable name, kept for compatibility
  # The SMI detection above stores its validated include directory in SMI_INCLUDE_DIR,
  # so that directory is added as well.
  # NOTE(review): ROCM_SMI_INCLUDE_DIR is not assigned in the visible part of this file - confirm
  # whether it is still set anywhere.
  ${SMI_INCLUDE_DIR}
  ${ROCMCORE_PATH}/include
)

## Optional include directories discovered by the BFD and ROCTX checks
if(DEMANGLE_DIR)
  target_include_directories(rccl PRIVATE ${DEMANGLE_DIR})
endif()
if(ROCTX_ENABLE)
  target_include_directories(rccl PRIVATE ${ROCTRACER_INCLUDE_DIR})
endif()
## Set RCCL compile definitions
## Each enabled feature switch contributes one private preprocessor definition

if(COLLTRACE)
  target_compile_definitions(rccl PRIVATE ENABLE_COLLTRACE)
endif()

if(ENABLE_MSCCL_KERNEL)
  message(WARNING "MSCCL is deprecated and will be removed in a future version of RCCL.")
  target_compile_definitions(rccl PRIVATE COMPILE_MSCCL_KERNEL)
endif()

if(ENABLE_MSCCLPP)
  target_compile_definitions(rccl PRIVATE ENABLE_MSCCLPP)
endif()

## SMI backend: either USE_AMDSMI, or the optional rocm-smi capability macros
if(USE_AMDSMI)
  target_compile_definitions(rccl PRIVATE USE_AMDSMI)
endif()
if(NOT USE_AMDSMI AND HAVE_ROCM_SMI64CONFIG)
  target_compile_definitions(rccl PRIVATE USE_ROCM_SMI64CONFIG)
endif()
if(NOT USE_AMDSMI AND HAVE_ROCM_SMI_THREAD_ONLY_MUTEX)
  target_compile_definitions(rccl PRIVATE USE_ROCM_SMI_THREAD_ONLY_MUTEX)
endif()

if(ENABLE_WARP_SPEED)
  target_compile_definitions(rccl PRIVATE ENABLE_WARP_SPEED)
endif()

if(ENABLE_ROCSHMEM)
  target_compile_definitions(rccl PRIVATE ENABLE_ROCSHMEM)
endif()
# ==== rocSHMEM integration (optional) ====
if(ENABLE_ROCSHMEM)
  add_rocshmem_targets()

  # When the rocshmem_ext target exists, rccl must wait for it so that rocSHMEM
  # is fully built/installed before rccl is compiled
  if(TARGET rocshmem_ext)
    add_dependencies(rccl rocshmem_ext)
  endif()

  if(ROCSHMEM_INCLUDE_DIR)
    target_include_directories(rccl PRIVATE ${ROCSHMEM_INCLUDE_DIR})
  endif()

  # Linking of the rocSHMEM library itself was moved to where the MSCCL targets are linked:
  ## target_link_libraries(rccl PRIVATE ${ROCSHMEM_LIBRARY})
  target_link_libraries(rccl PRIVATE ${IBVERBS})
endif()
# NPKit flags
## May be better to move these to a separate file
## All NPKit macros are private definitions of rccl and are added in one call, in a fixed order
if(ENABLE_NPKIT)
  message(WARNING "NPKit is deprecated and will be removed in a future version of RCCL. Please consider using alternative profiling tools.")
  target_compile_definitions(rccl PRIVATE
    ENABLE_NPKIT
    ENABLE_NPKIT_EVENT_TIME_SYNC_GPU
    ENABLE_NPKIT_EVENT_TIME_SYNC_CPU
    ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_ENTRY
    ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_EXIT
    ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_ENTRY
    ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_EXIT
    ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_ENTRY
    ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_EXIT
    ENABLE_NPKIT_EVENT_COPY_SEND_ENTRY
    ENABLE_NPKIT_EVENT_COPY_SEND_EXIT
    ENABLE_NPKIT_EVENT_DIRECT_COPY_SEND_ENTRY
    ENABLE_NPKIT_EVENT_DIRECT_COPY_SEND_EXIT
    ENABLE_NPKIT_EVENT_DIRECT_RECV_ENTRY
    ENABLE_NPKIT_EVENT_DIRECT_RECV_EXIT
    ENABLE_NPKIT_EVENT_DIRECT_RECV_COPY_SEND_ENTRY
    ENABLE_NPKIT_EVENT_DIRECT_RECV_COPY_SEND_EXIT
    ENABLE_NPKIT_EVENT_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY
    ENABLE_NPKIT_EVENT_DIRECT_RECV_REDUCE_COPY_SEND_EXIT
    ENABLE_NPKIT_EVENT_DIRECT_SEND_ENTRY
    ENABLE_NPKIT_EVENT_DIRECT_SEND_EXIT
    ENABLE_NPKIT_EVENT_DIRECT_SEND_FROM_OUTPUT_ENTRY
    ENABLE_NPKIT_EVENT_DIRECT_SEND_FROM_OUTPUT_EXIT
    ENABLE_NPKIT_EVENT_RECV_ENTRY
    ENABLE_NPKIT_EVENT_RECV_EXIT
    ENABLE_NPKIT_EVENT_RECV_COPY_SEND_ENTRY
    ENABLE_NPKIT_EVENT_RECV_COPY_SEND_EXIT
    ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_ENTRY
    ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_EXIT
    ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_SEND_ENTRY
    ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_SEND_EXIT
    ENABLE_NPKIT_EVENT_RECV_REDUCE_SEND_ENTRY
    ENABLE_NPKIT_EVENT_RECV_REDUCE_SEND_EXIT
    ENABLE_NPKIT_EVENT_SEND_ENTRY
    ENABLE_NPKIT_EVENT_SEND_EXIT
    ENABLE_NPKIT_EVENT_SEND_FROM_OUTPUT_ENTRY
    ENABLE_NPKIT_EVENT_SEND_FROM_OUTPUT_EXIT
    ENABLE_NPKIT_EVENT_PRIM_SIMPLE_WAIT_PEER_ENTRY
    ENABLE_NPKIT_EVENT_PRIM_SIMPLE_WAIT_PEER_EXIT
    ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY
    ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT
    ENABLE_NPKIT_EVENT_PRIM_LL_WAIT_SEND_ENTRY
    ENABLE_NPKIT_EVENT_PRIM_LL_WAIT_SEND_EXIT
    ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY
    ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT
    ENABLE_NPKIT_EVENT_PRIM_LL128_WAIT_SEND_ENTRY
    ENABLE_NPKIT_EVENT_PRIM_LL128_WAIT_SEND_EXIT
    ENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_ENTRY
    ENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_EXIT
    ENABLE_NPKIT_EVENT_NET_SEND_ENTRY
    ENABLE_NPKIT_EVENT_NET_SEND_EXIT
    ENABLE_NPKIT_EVENT_NET_TEST_ENTRY
    ENABLE_NPKIT_EVENT_NET_TEST_EXIT
    ENABLE_NPKIT_EVENT_NET_RECV_ENTRY
    ENABLE_NPKIT_EVENT_NET_RECV_EXIT
    ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_SEND_ENTRY
    ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_SEND_EXIT
    ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_ENTRY
    ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_EXIT
    ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY
    ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_EXIT
    ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_ENTRY
    ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_EXIT
    ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_ENTRY
    ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_EXIT
    ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_ENTRY
    ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_EXIT
    ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_ENTRY
    ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_EXIT
    ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_ENTRY
    ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_EXIT
    ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_ENTRY
    ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_EXIT
    ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_ENTRY
    ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_EXIT
    ENABLE_NPKIT_EVENT_SEND_RECV_LOCAL_COPY_ENTRY
    ENABLE_NPKIT_EVENT_SEND_RECV_LOCAL_COPY_EXIT
    ENABLE_NPKIT_EVENT_SEND_RECV_SEND_ENTRY
    ENABLE_NPKIT_EVENT_SEND_RECV_SEND_EXIT
    ENABLE_NPKIT_EVENT_SEND_RECV_RECV_ENTRY
    ENABLE_NPKIT_EVENT_SEND_RECV_RECV_EXIT
    ENABLE_NPKIT_EVENT_ALL_GATHER_RING_ENTRY
    ENABLE_NPKIT_EVENT_ALL_GATHER_RING_EXIT
    ENABLE_NPKIT_EVENT_ALL_GATHER_RING_SEND_ENTRY
    ENABLE_NPKIT_EVENT_ALL_GATHER_RING_SEND_EXIT
    ENABLE_NPKIT_EVENT_ALL_GATHER_RING_RECV_COPY_SEND_ENTRY
    ENABLE_NPKIT_EVENT_ALL_GATHER_RING_RECV_COPY_SEND_EXIT
    ENABLE_NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_ENTRY
    ENABLE_NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_EXIT
    ENABLE_NPKIT_EVENT_MSCCL_GENERIC_OP_ENTRY
    ENABLE_NPKIT_EVENT_MSCCL_GENERIC_OP_EXIT
    ENABLE_NPKIT_EVENT_MSCCL_REDUCE_ENTRY
    ENABLE_NPKIT_EVENT_MSCCL_REDUCE_EXIT
    ENABLE_NPKIT_EVENT_MSCCL_SEND_ENTRY
    ENABLE_NPKIT_EVENT_MSCCL_SEND_EXIT
    ENABLE_NPKIT_EVENT_MSCCL_RECV_ENTRY
    ENABLE_NPKIT_EVENT_MSCCL_RECV_EXIT
    ENABLE_NPKIT_EVENT_MSCCL_RUN_ENTRY
    ENABLE_NPKIT_EVENT_MSCCL_RUN_EXIT
    ENABLE_NPKIT_EVENT_MSCCL_RECV_REDUCE_COPY_ENTRY
    ENABLE_NPKIT_EVENT_MSCCL_RECV_REDUCE_COPY_EXIT
    ENABLE_NPKIT_EVENT_MSCCL_INIT_ENTRY
    ENABLE_NPKIT_EVENT_MSCCL_INIT_EXIT
    ENABLE_NPKIT_EVENT_BROADCAST_RING_ENTRY
    ENABLE_NPKIT_EVENT_BROADCAST_RING_EXIT
    ENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_ENTRY
    ENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_EXIT
    ENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_SEND_ENTRY
    ENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_SEND_EXIT
    ENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_RECV_REDUCE_SEND_ENTRY
    ENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_RECV_REDUCE_SEND_EXIT
    ENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_RECV_REDUCE_COPY_ENTRY
    ENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_RECV_REDUCE_COPY_EXIT
    ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME
  )
endif()
# Optional instrumentation switches; each one maps onto preprocessor
# definitions that are private to the rccl target.
if(PROFILE)
  target_compile_definitions(rccl PRIVATE ENABLE_PROFILING)
endif()

# Without ROCTX both NVTX opt-out macros are defined; with ROCTX only
# ROCTX_ENABLE is defined.
if(NOT ROCTX_ENABLE)
  target_compile_definitions(rccl PRIVATE NVTX_NO_IMPL NVTX_DISABLE)
else()
  target_compile_definitions(rccl PRIVATE ROCTX_ENABLE)
endif()

if(TRACE)
  target_compile_definitions(rccl PRIVATE ENABLE_TRACE)
endif()
# HIP memory-related feature macros. Every branch reports its decision in the
# configure log.

# Contiguous memory: controlled by the value of HIP_CONTIGUOUS_MEMORY.
if(${HIP_CONTIGUOUS_MEMORY})
  target_compile_definitions(rccl PRIVATE HIP_CONTIGUOUS_MEMORY)
  message(STATUS "HIP_CONTIGUOUS_MEMORY enabled")
else()
  message(STATUS "HIP_CONTIGUOUS_MEMORY disabled")
endif()

# Uncached memory: enabled purely from the detected HIP version.
if("${hip_version_string}" VERSION_GREATER_EQUAL "5.7.31920")
  target_compile_definitions(rccl PRIVATE HIP_UNCACHED_MEMORY)
  message(STATUS "HIP_UNCACHED_MEMORY enabled")
else()
  message(STATUS "HIP_UNCACHED_MEMORY disabled - requires HIP version >= 5.7.31920")

  # keep --hipcc-func-supp on older HIP and compiler
  if(NOT IFC_ENABLED)
    target_compile_options(rccl PRIVATE --hipcc-func-supp)
    message(STATUS "--hipcc-func-supp enabled")
  else()
    message(STATUS "--hipcc-func-supp disabled")
  endif()
endif()

# Host uncached memory: controlled by HIP_HOST_UNCACHED_MEMORY.
if(HIP_HOST_UNCACHED_MEMORY)
  target_compile_definitions(rccl PRIVATE HIP_HOST_UNCACHED_MEMORY)
  message(STATUS "HIP_HOST_UNCACHED_MEMORY enabled")
else()
  message(STATUS "HIP_HOST_UNCACHED_MEMORY disabled")
endif()
# When BUILD_BFD is on, forward each BFD probe result that is set as a
# same-named preprocessor definition.
if(BUILD_BFD)
  foreach(bfd_probe
          HAVE_BFD
          HAVE_DECL_BFD_GET_SECTION_FLAGS
          HAVE_DECL_BFD_GET_SECTION_VMA
          HAVE_TWO_ARG_BFD_SECTION_SIZE)
    if(${bfd_probe})
      target_compile_definitions(rccl PRIVATE ${bfd_probe})
    endif()
  endforeach()
endif()

# Indirect function call support.
if(IFC_ENABLED)
  target_compile_definitions(rccl PRIVATE USE_INDIRECT_FUNCTION_CALL)
endif()

# Demangling helpers are advertised whenever DEMANGLE_DIR is set.
if(DEMANGLE_DIR)
  target_compile_definitions(rccl PRIVATE
    "HAVE_CPLUS_DEMANGLE=1"
    "HAVE_DECL_BASENAME=1")
endif()

# LL128 support.
if(LL128_ENABLED)
  target_compile_definitions(rccl PRIVATE ENABLE_LL128)
endif()
## Set RCCL compile options

# Parallel device compilation, only when the compiler probe succeeded.
if(HAVE_PARALLEL_JOBS)
  target_compile_options(rccl PRIVATE -parallel-jobs=12)
endif()

# Offload compression is gated on ROCM_VERSION >= 60200.
if(ROCM_VERSION VERSION_GREATER_EQUAL "60200")
  target_compile_options(rccl PRIVATE --offload-compress) # Compress GPU code at compile time.
  target_link_libraries(rccl PRIVATE --offload-compress)  # Compress GPU code at link time.
  message(STATUS "--offload-compress enabled - ROCm version >= 6.2.0")
else()
  message(STATUS "--offload-compress disabled - ROCm version < 6.2.0")
endif()

# Baseline diagnostics plus relocatable device code.
target_compile_options(rccl PRIVATE
  -Werror=uninitialized
  -Werror=sometimes-uninitialized
  -Wall
  -Werror=deprecated-copy-with-user-provided-copy
  -Wno-format-nonliteral
  -Wno-unused-function
  -fgpu-rdc)

# Extra warning suppressions requested through QUIET_WARNINGS.
if(QUIET_WARNINGS)
  target_compile_options(rccl PRIVATE
    -Wno-invalid-offsetof
    -Wno-unused-result
    -Wno-macro-redefined
    -Wno-unused-label
    -Wno-unused-variable
    -Wno-unused-private-field
    -Wno-null-conversion
    -Wno-missing-braces)
endif()
## Set RCCL compile and linker options for unit tests and code coverage
if(ENABLE_CODE_COVERAGE)
  # Coverage is refused unless the build type matches "Debug".
  if(NOT CMAKE_BUILD_TYPE MATCHES "Debug")
    message(FATAL_ERROR "Code coverage is enabled, but the build type is '${CMAKE_BUILD_TYPE}'. "
                        "Code coverage requires 'Debug' build types to expose internal symbols. "
                        "Please set CMAKE_BUILD_TYPE to 'Debug' and reconfigure.")
  endif()
  message(STATUS "Code coverage is enabled with build type '${CMAKE_BUILD_TYPE}'.")

  # Default visibility plus host-side instrumentation flags.
  target_compile_options(rccl PRIVATE
    -fvisibility=default -Xarch_host -fprofile-instr-generate -Xarch_host -fcoverage-mapping)

  # Linker flag sets: one with an rpath of $ORIGIN, one with $ORIGIN/../lib.
  set(COVERAGE_SHARED_LINKER_FLAGS
    -fprofile-generate
    -Wl,--enable-new-dtags,--build-id=sha1,--rpath,$ORIGIN
  )
  set(COVERAGE_EXE_LINKER_FLAGS
    -fprofile-generate
    -Wl,--enable-new-dtags,--build-id=sha1,--rpath,$ORIGIN/../lib
  )

  # NOTE(review): both flag sets, including the one named for executables,
  # are applied to the rccl library target here - confirm this is intended.
  target_link_options(rccl PRIVATE ${COVERAGE_SHARED_LINKER_FLAGS})
  target_link_options(rccl PRIVATE ${COVERAGE_EXE_LINKER_FLAGS})
else()
  # Without coverage, symbols stay hidden unless this is a test build on
  # ROCM_VERSION >= 60400 with a build type matching "Debug".
  if(BUILD_TESTS AND ROCM_VERSION VERSION_GREATER_EQUAL "60400" AND CMAKE_BUILD_TYPE MATCHES "Debug")
    target_compile_options(rccl PRIVATE -fvisibility=default)
  else()
    target_compile_options(rccl PRIVATE -fvisibility=hidden)
  endif()
endif()
# Kernel-argument preload count passed to the LLVM backend at compile time.
if(HAVE_KERNARG_PRELOAD)
  target_compile_options(rccl PRIVATE -mllvm --amdgpu-kernarg-preload-count=16)
endif()

# Optional kernel resource usage remarks, requested at link time.
if(REPORT_KERNEL_RESOURCE_USE)
  target_link_options(rccl PRIVATE -Rpass-analysis=kernel-resource-usage)
endif()

# Save temporary files from kernel compilation
if(DUMP_ASM)
  message(STATUS "Disassembling librccl.so to asm")

  # Maintain symbols but without changing code. Keep additional data in dwarf section of binary.
  target_compile_options(rccl PRIVATE -gline-tables-only)

  set(OBJ_DUMP ${ROCM_PATH}/llvm/bin/llvm-objdump)

  # Post-build step 1: run llvm-objdump --offload-fatbin on librccl.so.
  add_custom_command(
    TARGET rccl POST_BUILD
    COMMENT "Disassembling RCCL library"
    COMMAND /bin/bash -c "${OBJ_DUMP} --offload-fatbin librccl.so"
    VERBATIM
  )

  # Post-build step 2: one disassembly listing per entry of GPU_TARGETS.
  foreach(GPUARCH ${GPU_TARGETS})
    add_custom_command(
      TARGET rccl POST_BUILD
      COMMENT "Disassembling RCCL library to dump assembly for ${GPUARCH}"
      COMMAND /bin/bash -c "${OBJ_DUMP} -d -l --source --symbolize-operands librccl.so.0.hipv4-amdgcn-amd-amdhsa--${GPUARCH} > librccl.${GPUARCH}.s"
      VERBATIM
    )
  endforeach()
endif()
## NOTE: This is currently being handled by rocm-cmake, however may need to be re-enabled in the future
#foreach(target ${GPU_TARGETS})
#  target_compile_options(rccl PRIVATE --offload-arch=${target})
#endforeach()

# AddressSanitizer instrumentation using the shared ASan runtime.
if(BUILD_ADDRESS_SANITIZER)
  target_compile_options(rccl PRIVATE
    -fsanitize=address
    -shared-libasan)
endif()

# Per-translation-unit time traces.
if(TIMETRACE)
  target_compile_options(rccl PRIVATE -ftime-trace)
endif()

# Fault injection support is compiled in only on request.
if(FAULT_INJECTION)
  target_compile_definitions(rccl PRIVATE ENABLE_FAULT_INJECTION)
  message(STATUS "Fault injection enabled")
endif()
## Set RCCL linked library directories
target_link_directories(rccl PRIVATE ${SMI_LIB_DIR})

# rocprofiler-register support is offered (default ON) from ROCM_VERSION 60100
# onwards. On older ROCm the option is forced OFF, and a request to enable it
# only produces an author warning.
if(ROCM_VERSION VERSION_GREATER_EQUAL "60100")
  option(RCCL_ROCPROFILER_REGISTER "Enable rocprofiler-register support" ON)
else()
  if(RCCL_ROCPROFILER_REGISTER)
    # The version quoted here must match the gate above (60100 == 6.1).
    message(AUTHOR_WARNING "RCCL_ROCPROFILER_REGISTER is not valid option for ROCm < 6.1. Current ROCm version: ${ROCM_VERSION}")
  endif()
  set(RCCL_ROCPROFILER_REGISTER OFF CACHE BOOL "" FORCE)
endif()

# When enabled, the package is mandatory: define the feature macro and link
# the imported target.
if(RCCL_ROCPROFILER_REGISTER)
  find_package(rocprofiler-register REQUIRED)
  target_compile_definitions(rccl PRIVATE RCCL_ROCPROFILER_REGISTER=1)
  target_link_libraries(
    rccl PRIVATE rocprofiler-register::rocprofiler-register)
endif()
## Set RCCL linked libraries

# libbfd, plus libiberty and zlib when libiberty was detected.
if(HAVE_BFD)
  target_link_libraries(rccl PRIVATE bfd)
  if(HAVE_IBERTY)
    target_link_libraries(rccl PRIVATE iberty z)
  endif()
endif()

if(ROCTX_ENABLE)
  target_link_libraries(rccl PRIVATE ${ROCTX_LIB})
endif()

# Unconditional link items, kept in their original order.
target_link_libraries(rccl PRIVATE   -fgpu-rdc) # Required when linking relocatable device code
target_link_libraries(rccl PRIVATE   Threads::Threads)
target_link_libraries(rccl INTERFACE hip::host)
target_link_libraries(rccl PRIVATE   hip::device)
target_link_libraries(rccl PRIVATE   dl)
target_link_libraries(rccl PRIVATE   ${SMI_LIBRARIES})
target_link_libraries(rccl PRIVATE   fmt::fmt-header-only)

# Optional communication back-ends.
if(ENABLE_MSCCLPP)
  target_link_libraries(rccl PRIVATE mscclpp_nccl)
endif()
if(ENABLE_ROCSHMEM)
  target_link_libraries(rccl PRIVATE
    ${ROCSHMEM_LIBRARY}
    ${IBVERBS})
endif()
## Set RCCL link options
## Find out available memory
# Probe order:
#   1. /sys/fs/cgroup/memory.max (bytes) when it holds a plain number
#   2. the first number printed by `free` (KB)
#   3. cmake_host_system_information(AVAILABLE_PHYSICAL_MEMORY)
# The probe output is quoted inside if() so that an empty result (missing
# file, missing tool) falls through to the next probe instead of producing a
# malformed if() expression. stderr of a failing probe is discarded, and
# trailing whitespace is stripped before the value is used in math().
execute_process(
  COMMAND bash "-c" "cat /sys/fs/cgroup/memory.max"
  OUTPUT_VARIABLE memory_max_string
  OUTPUT_STRIP_TRAILING_WHITESPACE
  ERROR_QUIET)
if("${memory_max_string}" MATCHES "^[0-9]+$")
  math(EXPR memory_in_gb "${memory_max_string} / (1024 * 1024 * 1024)") ## bytes to GB conversion
else()
  execute_process(
    COMMAND bash "-c" "free | grep -o '[[:digit:]]*' | head -1"
    OUTPUT_VARIABLE memory_max_string
    OUTPUT_STRIP_TRAILING_WHITESPACE
    ERROR_QUIET)
  ## memory_max_string holds the first number reported by `free`, in KB
  ## NOTE(review): that is presumably the "total" column rather than "free" - confirm intent
  if("${memory_max_string}" MATCHES "^[0-9]+$")
    math(EXPR memory_in_gb "${memory_max_string} / (1024 * 1024)") ## KB to GB conversion
  else()
    cmake_host_system_information(RESULT memory_max_string QUERY AVAILABLE_PHYSICAL_MEMORY)
    math(EXPR memory_in_gb "${memory_max_string} / 1024")
  endif()
endif()

## Reserve 16GB for each linker job. Limit max number of linker jobs to 16
if(HAVE_PARALLEL_JOBS)
  math(EXPR num_linker_jobs "(${memory_in_gb} + 15) / 16")
  if(num_linker_jobs GREATER_EQUAL 16)
    set(num_linker_jobs "16")
  elseif(num_linker_jobs LESS 1)
    # Less than 1 GB detected would otherwise yield -parallel-jobs=0.
    set(num_linker_jobs "1")
  endif()
  message(STATUS "Use ${num_linker_jobs} jobs for linking")
  target_link_options(rccl PRIVATE -parallel-jobs=${num_linker_jobs}) # Use multiple threads to link
endif()
# Sanitizer builds link with lld.
if(BUILD_ADDRESS_SANITIZER)
  target_link_options(rccl PRIVATE -fuse-ld=lld)
endif()

if(TIMETRACE)
  target_link_options(rccl PRIVATE -ftime-trace)
endif()

# Report which library flavour is being produced.
if(BUILD_SHARED_LIBS)
  message(STATUS "Building shared RCCL library")
else()
  message(STATUS "Building static RCCL library")
endif()

# Link-time counterpart of the kernarg preload compile option.
if(HAVE_KERNARG_PRELOAD)
  target_link_options(rccl PRIVATE
    "SHELL:-Xoffload-linker -mllvm=-amdgpu-kernarg-preload-count=16")
endif()

if(ENABLE_MSCCLPP)
  include(cmake/MSCCLPP.cmake)
endif()

## Track linking time
set_property(TARGET rccl PROPERTY RULE_LAUNCH_LINK "${CMAKE_COMMAND} -E time")

## Setup librccl.so version
rocm_set_soversion(rccl "1.0")
if(NOT BUILD_SHARED_LIBS)
  # To create a static lib with `-fgpu-rdc`, you need `--emit-static-lib` and `--hip-link`.
  # You also need to invoke amdclang++ again to trigger GPU code generation.
  set(static_link_flags ${CXXFLAGS} --hip-link -fgpu-rdc --emit-static-lib)

  # Find all the libraries we need to link at link time to include them in the clang link
  # command line. Only entries that are CMake targets with a LOCATION contribute a flag.
  get_target_property(rccl_libs rccl LINK_LIBRARIES)
  foreach(linked_item ${rccl_libs})
    if(TARGET ${linked_item})
      get_target_property(location ${linked_item} LOCATION)
      if(location)
        list(APPEND static_link_flags -l${location})
      endif()
    endif()
  endforeach()

  # One --offload-arch flag per GPU target.
  foreach(gpu_arch ${GPU_TARGETS})
    list(APPEND static_link_flags --offload-arch=${gpu_arch})
  endforeach()

  list(JOIN static_link_flags " " flags_str)

  # Invoking amdclang++ this way will produce a static archive, so just override ARCHIVE_CREATE.
  set(CMAKE_CXX_ARCHIVE_CREATE "<CMAKE_CXX_COMPILER> ${flags_str} -o <TARGET> <OBJECTS>")
endif()
# Install settings
#==================================================================================================
## Specify install targets
rocm_install_targets(TARGETS rccl)

# Public headers: the generated rccl.h plus the network plugin header.
rocm_install(
  FILES
    ${PROJECT_BINARY_DIR}/include/rccl/rccl.h
    src/include/plugin/nccl_net.h
  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rccl)

# API tracing header goes into the amd_detail subdirectory.
rocm_install(
  FILES src/include/api_trace.h
  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rccl/amd_detail)

# Copy the MSCCL algorithm directories into the build tree at configure time.
file(COPY tools/msccl-algorithms           DESTINATION ${PROJECT_BINARY_DIR})
file(COPY tools/msccl-unit-test-algorithms DESTINATION ${PROJECT_BINARY_DIR})

## Install Algorithm files under share folder
rocm_install(
  DIRECTORY ${PROJECT_BINARY_DIR}/msccl-algorithms
  DESTINATION ${CMAKE_INSTALL_DATADIR}/rccl)
rocm_install(
  DIRECTORY ${PROJECT_BINARY_DIR}/msccl-unit-test-algorithms
  DESTINATION ${CMAKE_INSTALL_DATADIR}/rccl)

# Export the target under the roc:: namespace with a dependency on hip.
rocm_export_targets(
  NAMESPACE roc::
  TARGETS rccl
  DEPENDS hip)
## Set package dependencies
# The HIP runtime package name gets the ASan flavour for sanitizer builds.
set(DEPENDS_HIP_RUNTIME "hip-runtime-amd")
if(BUILD_ADDRESS_SANITIZER)
  set(DEPENDS_HIP_RUNTIME "hip-runtime-amd-asan")
endif()
rocm_package_add_dependencies(
  DEPENDS
    "${DEPENDS_HIP_RUNTIME} >= 4.5.0"
    "${SMI_LIB_NAME}")

# CPack settings: component installs for DEB and RPM, shlibdeps for DEB, and
# paths excluded from the RPM automatic file list.
set(CPACK_DEB_COMPONENT_INSTALL ON)
set(CPACK_DEBIAN_PACKAGE_SHLIBDEPS ON)
set(CPACK_RPM_COMPONENT_INSTALL ON)
set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt" "${ROCM_PATH}")
# Debian-specific packaging metadata.
# find_file() is called with debian_version, debconf.conf and PATHS /etc; the
# files below are generated and installed only when DEBIAN resolves to a
# found path (a *-NOTFOUND result makes if(DEBIAN) false).
find_file (DEBIAN debian_version debconf.conf PATHS /etc)
if(DEBIAN)
# Write copyright file
# The text is written verbatim to <build dir>/copyright and installed under
# ${CMAKE_INSTALL_DATADIR}/rccl.
file(WRITE "${CMAKE_BINARY_DIR}/copyright"
"Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
Upstream-Name: rccl
Source: https://github.com/ROCm/rccl
Files: *
Copyright: (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
Modifications Copyright (c) 2020-2023 Advanced Micro Devices, Inc. All rights reserved.
Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
License: See LICENSE.txt for license information\n")
rocm_install(FILES "${CMAKE_BINARY_DIR}/copyright" DESTINATION ${CMAKE_INSTALL_DATADIR}/rccl)
# Write changelog file
# TIMESTAMP is the raw output of `date -R` captured at configure time.
# NOTE(review): the output is not stripped, so it presumably already ends with
# a newline before the explicit \n below - confirm the resulting layout.
find_program( date_executable date )
execute_process(COMMAND ${date_executable} -R OUTPUT_VARIABLE TIMESTAMP)
file(WRITE "${CMAKE_BINARY_DIR}/changelog"
"rccl (${VERSION_STRING}-1) unstable; urgency=medium
* Initial release.
-- RCCL Maintainer <rccl-maintainer@amd.com> ${TIMESTAMP}\n")
# Compress the changelog with `gzip -9 -c -n`, redirecting its stdout into
# changelog.Debian.gz in the build directory, then install the result.
find_program( gzip_executable gzip )
execute_process(COMMAND bash "-c" "${gzip_executable} -9 -c -n ${CMAKE_BINARY_DIR}/changelog"
WORKING_DIRECTORY ${CMAKE_BINARY_DIR} OUTPUT_FILE "${CMAKE_BINARY_DIR}/changelog.Debian.gz")
rocm_install(FILES "${CMAKE_BINARY_DIR}/changelog.Debian.gz" DESTINATION ${CMAKE_INSTALL_DATADIR}/rccl)
# Two-line package description; the line break is part of the value.
set(CPACK_DEBIAN_PACKAGE_DESCRIPTION "ROCm Communication Collectives Library
Optimized primitives for collective multi-GPU communication")
endif()
## Building RCCL RAS
include(cmake/rcclRAS.cmake)

# Unit tests: register the client packaging components and add the test
# subdirectory.
if(BUILD_TESTS)
  rocm_package_setup_component(clients)
  rocm_package_setup_client_component(tests PACKAGE_NAME unittests)
  add_subdirectory(test)

  # For shared-library test builds, run the metadata extraction script after
  # every rccl link. (A duplicated COMMAND keyword was removed here.)
  if(BUILD_SHARED_LIBS)
    add_custom_command(TARGET rccl POST_BUILD
      COMMENT "Extracting metadata from librccl.so"
      COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/scripts/extract_metadata.cmake
      VERBATIM
    )
  endif()
endif()

# Create the rccl package; LDCONFIG is passed through to rocm_create_package.
rocm_create_package(
  NAME rccl
  DESCRIPTION "ROCm Communication Collectives Library"
  MAINTAINER "RCCL Maintainer <rccl-maintainer@amd.com>"
  LDCONFIG)
================================================
FILE: CppCheckSuppressions.txt
================================================
arrayIndexThenCheck:src/bootstrap.cc:304
arrayIndexThenCheck:src/debug.cc:88
arrayIndexThenCheck:src/graph/search.cc:844
arrayIndexThenCheck:src/graph/search.cc:916
arrayIndexThenCheck:src/graph/search.cc:927
clarifyCalculation:src/graph/topo.cc:702
clarifyCalculation:src/graph/topo.cc:720
clarifyCondition:src/enqueue.cc:416
funcArgNamesDifferent:src/graph/topo.cc:135
funcArgNamesDifferent:src/graph/topo.h:144
nullPointerRedundantCheck:src/misc/utils.cc:102
nullPointerRedundantCheck:src/misc/utils.cc:109
nullPointerRedundantCheck:src/proxy.cc:143
nullPointerRedundantCheck:src/proxy.cc:144
nullPointerRedundantCheck:src/proxy.cc:147
nullPointerRedundantCheck:src/proxy.cc:148
nullPointerRedundantCheck:src/proxy.cc:149
nullPointerRedundantCheck:src/proxy.cc:150
nullPointerRedundantCheck:src/proxy.cc:151
nullPointerRedundantCheck:src/proxy.cc:155
nullPointerRedundantCheck:src/proxy.cc:159
nullPointerRedundantCheck:src/proxy.cc:160
nullPointerRedundantCheck:src/proxy.cc:161
nullPointerRedundantCheck:src/proxy.cc:163
nullPointerRedundantCheck:src/proxy.cc:165
nullPointerRedundantCheck:src/proxy.cc:167
nullPointerRedundantCheck:src/proxy.cc:168
nullPointerRedundantCheck:src/proxy.cc:340
nullPointerRedundantCheck:src/proxy.cc:342
nullPointerRedundantCheck:src/proxy.cc:93
nullPointerRedundantCheck:src/proxy.cc:94
redundantAssignment:src/proxy.cc:161
redundantAssignment:src/proxy.cc:163
redundantCopy:src/graph/rings.cc:16
redundantCopy:src/graph/rings.cc:17
terminateStrncpy:src/misc/utils.cc:99
terminateStrncpy:src/transport/net_socket.cc:245
unreachableCode:src/transport/net.cc:555
unreadVariable:src/graph/tuning.cc:109
unreadVariable:src/graph/tuning.cc:110
unreadVariable:src/graph/tuning.cc:113
unusedFunction:src/graph/topo.cc:37
unusedFunction:src/graph/topo.cc:836
unusedFunction:src/misc/gdrwrap.cc:109
unusedFunction:src/misc/gdrwrap.cc:117
unusedFunction:src/misc/gdrwrap.cc:130
unusedFunction:src/misc/gdrwrap.cc:144
unusedFunction:src/misc/gdrwrap.cc:158
unusedFunction:src/misc/gdrwrap.cc:172
unusedFunction:src/misc/gdrwrap.cc:186
unusedFunction:src/misc/gdrwrap.cc:200
unusedFunction:src/misc/gdrwrap.cc:209
unusedFunction:src/misc/gdrwrap.cc:218
unusedFunction:src/misc/gdrwrap.cc:232
unusedFunction:src/misc/gdrwrap.cc:52
unusedFunction:src/misc/ibvwrap.cc:203
unusedFunction:src/misc/ibvwrap.cc:239
unusedFunction:src/misc/ibvwrap.cc:255
unusedFunction:src/misc/nvmlwrap.cc:112
unusedFunction:src/misc/nvmlwrap_stub.cc:31
unusedFunction:src/misc/nvmlwrap_stub.cc:35
unusedFunction:src/transport.cc:71
unusedLabel:src/bootstrap.cc:349
unusedLabel:src/clique/ShmObject.h:112
unusedLabel:src/clique/ShmObject.h:204
unusedLabel:src/enqueue.cc:108
unusedLabel:src/enqueue.cc:1093
unusedLabel:src/enqueue.cc:989
unusedLabel:src/init.cc:1189
unusedLabel:src/init.cc:1240
unusedLabel:src/init.cc:1267
unusedLabel:src/transport.cc:238
unusedStructMember:src/graph/xml.cc:410
unusedStructMember:src/graph/xml.cc:411
unusedStructMember:src/graph/xml.cc:412
unusedStructMember:src/graph/xml.cc:428
unusedStructMember:src/graph/xml.cc:431
unusedStructMember:src/graph/xml.cc:432
unusedStructMember:src/graph/xml.cc:435
unusedStructMember:src/graph/xml.cc:437
variableScope:src/graph/search.cc:494
variableScope:src/init.cc:240
variableScope:src/transport/net_ib.cc:117
variableScope:src/transport/net_socket.cc:431
================================================
FILE: LICENSE.txt
================================================
Attributions
Contains contributions from NVIDIA.
Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
Modifications Copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved.
Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National
Laboratory, the U.S. Department of Energy, nor the names of their
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
The U.S. Department of Energy funded the development of this software
under subcontract 7078610 with Lawrence Berkeley National Laboratory.
This code also includes files from the NVIDIA Tools Extension SDK project.
See:
https://github.com/NVIDIA/NVTX
for more information and license details.
================================================
FILE: Makefile
================================================
#
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
.PHONY : all clean

# `default` is the first real target, so a bare `make` builds the sources.
default : src.build
install : src.install

# Build output location; ABSBUILDDIR is the absolute form handed to sub-makes.
BUILDDIR ?= $(abspath ./build)
ABSBUILDDIR := $(abspath $(BUILDDIR))
TARGETS := src pkg

# `clean` fans out to <dir>.clean for every entry in TARGETS.
clean: $(addsuffix .clean,$(TARGETS))

test.build: src.build

# License files are copied into the build directory by the `lic` target.
LICENSE_FILES := LICENSE.txt
LICENSE_TARGETS := $(LICENSE_FILES:%=$(BUILDDIR)/%)
lic: $(LICENSE_TARGETS)

$(BUILDDIR)/%.txt: %.txt
	@printf "Copying %-35s > %s\n" $< $@
	mkdir -p $(BUILDDIR)
	cp $< $@

# Pattern rules: `src.<goal>` / `pkg.<goal>` run <goal> in that subdirectory.
src.%:
	$(MAKE) -C src $* BUILDDIR=$(ABSBUILDDIR)

pkg.%:
	$(MAKE) -C pkg $* BUILDDIR=$(ABSBUILDDIR)

# Package preparation steps need the license files in place first.
pkg.debian.prep: lic
pkg.txz.prep: lic
================================================
FILE: NOTICES.txt
================================================
Notices and Licenses file
_______________________________________________________________
Dependencies on nvidia-nccl v2.27.3-1 (BSD3)
Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved.
Modifications Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved.
Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National
Laboratory, the U.S. Department of Energy, nor the names of their
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
The U.S. Department of Energy funded the development of this software
under subcontract 7078610 with Lawrence Berkeley National Laboratory.
This code also includes files from the NVIDIA Tools Extension SDK project.
See:
https://github.com/NVIDIA/NVTX
for more information and license details.
_______________________________________________________________
Dependencies on NPKit (MIT License)
Copyright (c) Microsoft Corporation.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE
_______________________________________________________________
Dependencies on MSCCL++ (MIT License)
Copyright (c) Microsoft Corporation.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE
See:
https://github.com/microsoft/mscclpp
for more information and license details.
_______________________________________________________________
Dependencies on Latency Profiler (MIT License)
Copyright (c) Meta Platforms, Inc. and affiliates.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
See:
src/include/latency_profiler
src/misc/latency_profiler
================================================
FILE: README.md
================================================
# RCCL
> [!CAUTION]
> The rccl repository is retired. Please use the [ROCm/rocm-systems](https://github.com/ROCm/rocm-systems) repository instead.
ROCm Communication Collectives Library
[Azure Pipelines build status](https://dev.azure.com/ROCm-CI/ROCm-CI/_build/latest?definitionId=107&repoName=ROCm%2Frccl&branchName=develop)
[TheRock CI workflow status](https://github.com/ROCm/rccl/actions/workflows/therock-ci.yml)
> **Note:** The published documentation is available at [RCCL](https://rocm.docs.amd.com/projects/rccl/en/latest/index.html) in an organized easy-to-read format that includes a table of contents and search functionality. The documentation source files reside in the [rccl/docs](https://github.com/ROCm/rccl/tree/develop/docs) folder in this repository. As with all ROCm projects, the documentation is open source. For more information, see [Contribute to ROCm documentation](https://rocm.docs.amd.com/en/latest/contribute/contributing.html).
## Introduction
RCCL (pronounced "Rickle") is a stand-alone library of standard collective communication routines for GPUs, implementing all-reduce, all-gather, reduce, broadcast, reduce-scatter, gather, scatter, and all-to-all. There is also initial support for direct GPU-to-GPU send and receive operations. It has been optimized to achieve high bandwidth on platforms using PCIe, xGMI as well as networking using InfiniBand Verbs or TCP/IP sockets. RCCL supports an arbitrary number of GPUs installed in a single node or multiple nodes, and can be used in either single- or multi-process (e.g., MPI) applications.
The collective operations are implemented using ring and tree algorithms and have been optimized for throughput and latency. For best performance, small operations can be either batched into larger operations or aggregated through the API.
## Requirements
1. ROCm supported GPUs
2. ROCm stack installed on the system (HIP runtime & HIP-Clang)
## Quickstart RCCL Build
RCCL directly depends on HIP runtime plus the HIP-Clang compiler, which are part of the ROCm software stack.
For ROCm installation instructions, see https://github.com/ROCm/ROCm.
The root of this repository has a helper script `install.sh` to build and install RCCL with a single command. It hard-codes configurations that can be specified through invoking cmake directly, but it's a great way to get started quickly and can serve as an example of how to build/install RCCL.
### To build the library using the install script:
```shell
./install.sh
```
For more info on build options/flags when using the install script, use `./install.sh --help`
```shell
./install.sh --help
RCCL build & installation helper script
Options:
--address-sanitizer Build with address sanitizer enabled
-c|--enable-code-coverage Enable code coverage
-d|--dependencies Install RCCL dependencies
--debug Build debug library
--enable_backtrace Build with custom backtrace support
--disable-colltrace Build without collective trace
--enable-msccl-kernel Build with MSCCL kernels
--enable-mscclpp Build with MSCCL++ support
--enable-mscclpp-clip Build MSCCL++ with clip wrapper on bfloat16 and half addition routines
--disable-roctx Build without ROCTX logging
-f|--fast Quick-build RCCL (local gpu arch only, no backtrace, and collective trace support)
-h|--help Prints this help message
-i|--install Install RCCL library (see --prefix argument below)
-j|--jobs Specify how many parallel compilation jobs to run ($nproc by default)
-l|--local_gpu_only Only compile for local GPU architecture
--amdgpu_targets Only compile for specified GPU architecture(s). For multiple targets, separate by ';' (builds for all supported GPU architectures by default)
--no_clean Don't delete files if they already exist
--npkit-enable Compile with npkit enabled
--log-trace Build with log trace enabled (i.e. NCCL_DEBUG=TRACE)
--openmp-test-enable Enable OpenMP in rccl unit tests
-p|--package_build Build RCCL package
--prefix Specify custom directory to install RCCL to (default: `/opt/rocm`)
--run_tests_all Run all rccl unit tests (must be built already)
-r|--run_tests_quick Run small subset of rccl unit tests (must be built already)
--static Build RCCL as a static library instead of shared library
-t|--tests_build Build rccl unit tests, but do not run
--time-trace Plot the build time of RCCL (requires `ninja-build` package installed on the system)
--verbose Show compile commands
```
By default, RCCL builds for all GPU targets defined in `DEFAULT_GPUS` in `CMakeLists.txt`. To target specific GPU(s), and potentially reduce build time, use `--amdgpu_targets` as a `;` separated string listing GPU(s) to target.
## Manual build
### To build the library using CMake:
```shell
$ git clone --recursive https://github.com/ROCm/rccl.git
$ cd rccl
$ mkdir build
$ cd build
$ cmake ..
$ make -j 16 # Or some other suitable number of parallel jobs
```
If you have already cloned, you can checkout the external submodules manually.
```shell
$ git submodule update --init --recursive --depth=1
```
You may substitute an installation path of your own choosing by passing `CMAKE_INSTALL_PREFIX`. For example:
```shell
$ cmake -DCMAKE_INSTALL_PREFIX=$PWD/rccl-install -DCMAKE_BUILD_TYPE=Release ..
```
Note: Ensure that rocm-cmake is installed, for example with `apt install rocm-cmake`.
### To build the RCCL package and install package :
Assuming you have already cloned this repository and built the library as shown in the previous section:
```shell
$ cd rccl/build
$ make package
$ sudo dpkg -i *.deb
```
RCCL package install requires sudo/root access because it installs under `/opt/rocm/`. This is an optional step as RCCL can instead be used directly by including the path containing `librccl.so`.
## Docker build
Refer to [docker/README.md](docker/README.md "docker/README.md")
## Tests
The rccl unit tests are implemented with the Googletest framework. They require Googletest 1.11 or higher to build and execute properly (installed with the -d option to install.sh).
To invoke the rccl unit tests, go to the build folder, then the test subfolder, and execute the appropriate rccl unit test executable(s).
rccl unit test names are now of the format:
CollectiveCall.[Type of test]
Filtering of rccl unit tests is done with environment variables and by passing the `--gtest_filter` command line flag, for example:
```shell
UT_DATATYPES=ncclBfloat16 UT_REDOPS=prod ./rccl-UnitTests --gtest_filter="AllReduce.C*"
```
will run only AllReduce correctness tests with the bfloat16 datatype and the `prod` reduction operation. A list of available filtering environment variables appears at the top of every run. See "Running a Subset of the Tests" at https://google.github.io/googletest/advanced.html#running-a-subset-of-the-tests for more information on how to form more advanced filters.
There are also other performance and error-checking tests for RCCL. These are maintained separately at https://github.com/ROCm/rccl-tests.
See the rccl-tests README for more information on how to build and run those tests.
## Library and API Documentation
Please refer to the [RCCL Documentation Site](https://rocm.docs.amd.com/projects/rccl/en/latest/) for current documentation.
### How to build documentation
Run the steps below to build documentation locally.
```shell
cd docs
pip3 install -r sphinx/requirements.txt
python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
```
## Copyright
All source code and accompanying documentation is copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved.
All modifications are copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved.
================================================
FILE: cmake/CheckSymbolExistsNoWarn.cmake
================================================
# MIT License
#
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# These overrides are due to CMake CHECK_SYMBOL_EXISTS modifying CMAKE_CXX_FLAGS to do a test compile,
# while ROCMChecks gives a warning if this variable is modified manually without a target.
# We now choose to disable ROCMChecks for this one case.
# Switch read by the rocm_check_toolchain_var() override below.
# OFF (the default) lets the toolchain-variable check run.
set(DISABLE_ROCM_CHECK OFF)

# Override of rocm_check_toolchain_var(): does nothing while DISABLE_ROCM_CHECK
# is set, otherwise hands all four arguments to _rocm_check_toolchain_var().
function(rocm_check_toolchain_var var access value list_file)
  if(DISABLE_ROCM_CHECK)
    return()
  endif()
  _rocm_check_toolchain_var("${var}" "${access}" "${value}" "${list_file}")
endfunction()
# Override of CHECK_SYMBOL_EXISTS that switches the ROCm toolchain-variable
# check off for the duration of the symbol check and back on afterwards.
# All arguments are forwarded unchanged to _check_symbol_exists().
# NOTE(review): _check_symbol_exists is presumably the original command that
# CMake keeps under an underscore prefix when a command is redefined; this
# requires CheckSymbolExists to be included before this file - confirm.
macro(CHECK_SYMBOL_EXISTS)
# Silence rocm_check_toolchain_var() while CMAKE_CXX_FLAGS is modified for the test compile
set(DISABLE_ROCM_CHECK ON)
_check_symbol_exists(${ARGN})
# Re-enable the check; note this always resets to OFF rather than restoring a prior value
set(DISABLE_ROCM_CHECK OFF)
endmacro()
================================================
FILE: cmake/Dependencies.cmake
================================================
# MIT License
#
# Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Dependencies
# HIP dependency is handled earlier in the project cmake file
# when VerifyCompiler.cmake is included.
# GIT
# Test dependencies
# For downloading, building, and installing required dependencies
include(cmake/DownloadProject.cmake)
include(FetchContent)
# Look for an installed GoogleTest (>= 1.11) unless the user asked for
# dependencies to be installed, in which case a private copy is always built.
if(NOT INSTALL_DEPENDENCIES)
  find_package(GTest 1.11)
endif()

# Build a private GoogleTest when tests are requested but no suitable copy was
# found, or whenever INSTALL_DEPENDENCIES is set.
if((NOT GTest_FOUND AND BUILD_TESTS) OR INSTALL_DEPENDENCIES)
  if(CMAKE_CXX_COMPILER MATCHES ".*/hipcc$")
    # hip-clang cannot compile googletest for some reason, so build it with g++
    set(COMPILER_OVERRIDE "-DCMAKE_CXX_COMPILER=g++")
  endif()

  # unset(GTEST_INCLUDE_DIR CACHE)
  # unset(GTEST_INCLUDE_DIRS CACHE)

  message(STATUS "GTest not found. Downloading and building GTest.")

  # Download, build and install googletest library (static) into the build tree
  set(GTEST_ROOT ${CMAKE_CURRENT_BINARY_DIR}/gtest CACHE PATH "")
  download_project(PROJ googletest
    GIT_REPOSITORY https://github.com/google/googletest.git
    GIT_TAG release-1.12.0
    INSTALL_DIR ${GTEST_ROOT}
    CMAKE_ARGS -DBUILD_GTEST=ON -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR> ${COMPILER_OVERRIDE} -DBUILD_SHARED_LIBS=OFF
    LOG_DOWNLOAD TRUE
    LOG_CONFIGURE TRUE
    LOG_BUILD TRUE
    LOG_INSTALL TRUE
    UPDATE_DISCONNECTED TRUE
  )

  set(GTEST_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/gtest/include CACHE PATH "")
  # NOTE(review): everything above is installed under gtest/, so this gmock/include
  # directory is probably never created - confirm whether gtest/include was intended.
  set(GMOCK_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/gmock/include CACHE PATH "")

  # The install may place the static libraries under lib/ or lib64/
  if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/gtest/lib)
    set(GTEST_BOTH_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/gtest/lib/libgtest.a;${CMAKE_CURRENT_BINARY_DIR}/gtest/lib/libgtest_main.a CACHE PATH "")
    set(GMOCK_BOTH_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/gtest/lib/libgmock.a;${CMAKE_CURRENT_BINARY_DIR}/gtest/lib/libgmock_main.a CACHE PATH "")
  elseif(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/gtest/lib64)
    set(GTEST_BOTH_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/gtest/lib64/libgtest.a;${CMAKE_CURRENT_BINARY_DIR}/gtest/lib64/libgtest_main.a CACHE PATH "")
    set(GMOCK_BOTH_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/gtest/lib64/libgmock.a;${CMAKE_CURRENT_BINARY_DIR}/gtest/lib64/libgmock_main.a CACHE PATH "")
  else()
    # Configuration stops here; the find_package() calls that used to follow
    # this message could never run and have been removed.
    message(FATAL_ERROR "Cannot find gtest library installation path.")
  endif()
elseif(GTest_FOUND AND BUILD_TESTS)
  # Use the imported targets of the GoogleTest package that was found
  set(GTEST_BOTH_LIBRARIES "GTest::gtest;GTest::gtest_main")
  set(GMOCK_BOTH_LIBRARIES "GTest::gmock;GTest::gmock_main")
endif()
# Find or download/install rocm-cmake project.
# An installed rocm-cmake (>= 0.7.3) is preferred; otherwise the archive for
# ${rocm_cmake_tag} is downloaded, unpacked, configured and installed under
# ${PROJECT_EXTERN_DIR}. Each step is checked right after it runs so that a
# failure is reported at the step that caused it.
set( PROJECT_EXTERN_DIR ${CMAKE_CURRENT_BINARY_DIR}/extern )
find_package(ROCM 0.7.3 QUIET CONFIG PATHS /opt/rocm)
if(NOT ROCM_FOUND)
  set(rocm_cmake_tag "master" CACHE STRING "rocm-cmake tag to download")

  # Step 1: download the archive
  file(
    DOWNLOAD https://github.com/ROCm/rocm-cmake/archive/${rocm_cmake_tag}.zip
    ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}.zip
    STATUS rocm_cmake_download_status LOG rocm_cmake_download_log
  )
  list(GET rocm_cmake_download_status 0 rocm_cmake_download_error_code)
  if(rocm_cmake_download_error_code)
    message(FATAL_ERROR "Error: downloading "
      "https://github.com/ROCm/rocm-cmake/archive/${rocm_cmake_tag}.zip failed "
      "error_code: ${rocm_cmake_download_error_code} "
      "log: ${rocm_cmake_download_log} "
    )
  endif()

  # Step 2: unpack it next to the archive
  execute_process(
    COMMAND ${CMAKE_COMMAND} -E tar xzf ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}.zip
    WORKING_DIRECTORY ${PROJECT_EXTERN_DIR}
    RESULT_VARIABLE rocm_cmake_unpack_error_code
  )
  if(rocm_cmake_unpack_error_code)
    message(FATAL_ERROR "Error: unpacking ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}.zip failed")
  endif()

  # Step 3: configure in-source with the private install prefix
  execute_process(
    COMMAND ${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=${PROJECT_EXTERN_DIR}/rocm-cmake .
    WORKING_DIRECTORY ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}
    RESULT_VARIABLE rocm_cmake_configure_error_code
  )
  if(rocm_cmake_configure_error_code)
    message(FATAL_ERROR "Error: configuring rocm-cmake in ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag} failed "
      "error_code: ${rocm_cmake_configure_error_code}")
  endif()

  # Step 4: install into ${PROJECT_EXTERN_DIR}/rocm-cmake
  execute_process(
    COMMAND ${CMAKE_COMMAND} --build rocm-cmake-${rocm_cmake_tag} --target install
    WORKING_DIRECTORY ${PROJECT_EXTERN_DIR}
    RESULT_VARIABLE rocm_cmake_install_error_code
  )
  if(rocm_cmake_install_error_code)
    message(FATAL_ERROR "Error: installing rocm-cmake to ${PROJECT_EXTERN_DIR}/rocm-cmake failed "
      "error_code: ${rocm_cmake_install_error_code}")
  endif()

  find_package( ROCM 0.7.3 REQUIRED CONFIG PATHS ${PROJECT_EXTERN_DIR}/rocm-cmake )
endif()

# Libraries are always installed to lib/, overriding any cached value
set(CMAKE_INSTALL_LIBDIR lib CACHE STRING "Define install directory for libraries" FORCE)
# fmt: use an installed package when one is available, otherwise fetch the
# pinned 10.2.1 sources and add them to this build.
find_package(fmt QUIET)
if(fmt_FOUND)
  message(STATUS "Using system fmt")
  get_target_property(FMT_INCLUDE_DIRS fmt::fmt-header-only INTERFACE_INCLUDE_DIRECTORIES)
  message(STATUS "fmt include directories: ${FMT_INCLUDE_DIRS}")
else()
  message(STATUS "fmt not found, fetching from source...")
  # Keep the fetched fmt out of this project's install rules
  set(FMT_INSTALL OFF)
  FetchContent_Declare(
    fmt
    GIT_REPOSITORY https://github.com/fmtlib/fmt
    GIT_TAG        e69e5f977d458f2650bb346dadf2ad30c5320281 # 10.2.1
  )
  FetchContent_MakeAvailable(fmt)
endif()
# Find available local ROCM targets
# NOTE: This will eventually be part of ROCm-CMake and should be removed at that time
#
# rocm_local_targets(<VARIABLE>)
#   Runs rocm_agent_enumerator and stores the de-duplicated list of reported
#   agents, excluding "gfx000", in <VARIABLE> in the caller's scope.
#   <VARIABLE> is set to "NOTFOUND" when the tool is missing, fails, or
#   reports no usable agent.
function(rocm_local_targets VARIABLE)
  set(${VARIABLE} "NOTFOUND" PARENT_SCOPE)

  # HINTS directories are searched as given, so PATH_SUFFIXES is needed for the
  # tool to be found under $ENV{ROCM_PATH}/bin as well.
  find_program(_rocm_agent_enumerator rocm_agent_enumerator
    HINTS /opt/rocm/bin ENV ROCM_PATH
    PATH_SUFFIXES bin)
  if(NOT _rocm_agent_enumerator)
    return()
  endif()

  execute_process(
    COMMAND "${_rocm_agent_enumerator}"
    RESULT_VARIABLE _found_agents
    OUTPUT_VARIABLE _rocm_agents
    OUTPUT_STRIP_TRAILING_WHITESPACE
    ERROR_QUIET
  )
  if(NOT _found_agents EQUAL 0)
    return()
  endif()

  # One agent per output line
  string(REPLACE "\n" ";" _rocm_agents "${_rocm_agents}")
  unset(result)
  foreach(agent IN LISTS _rocm_agents)
    string(STRIP "${agent}" agent)
    # Skip blank lines and the gfx000 entry
    if(NOT agent STREQUAL "" AND NOT agent STREQUAL "gfx000")
      list(APPEND result "${agent}")
    endif()
  endforeach()

  if(result)
    list(REMOVE_DUPLICATES result)
    set(${VARIABLE} "${result}" PARENT_SCOPE)
  endif()
endfunction()
# Iterate over the "source" list and check if there is a duplicate file name
# NOTE: This is due to compiler bug '--save-temps' and can be removed when a fix is available
#
# add_file_unique(<FILE_LIST> <FILE>)
#   FILE_LIST  name of the list variable holding the files collected so far
#   FILE       path of the file about to be added
# If an entry of the list has the same file name (directory ignored) as FILE,
# HIP_FILE is set in the caller's scope to FILE's path with "_tmp" inserted
# before the extension. Otherwise HIP_FILE is left untouched.
function(add_file_unique FILE_LIST FILE)
  get_filename_component(FILE_NAME "${FILE}" NAME)

  # Iterate over whatever is in the list so far
  foreach(curr_file IN LISTS ${FILE_LIST})
    get_filename_component(curr_file_name "${curr_file}" NAME)

    # Check if duplicate. The variables are passed by name so that empty names
    # or names containing spaces/semicolons cannot break the if() syntax.
    if(FILE_NAME STREQUAL curr_file_name)
      get_filename_component(DIR_PATH "${FILE}" DIRECTORY)
      get_filename_component(FILE_NAME_WE "${FILE}" NAME_WE)
      get_filename_component(FILE_EXT "${FILE}" EXT)

      # Construct a new file name by adding _tmp
      set(HIP_FILE "${DIR_PATH}/${FILE_NAME_WE}_tmp${FILE_EXT}" PARENT_SCOPE)

      # Every further match would compute the same value
      break()
    endif()
  endforeach()
endfunction()
include(ROCMSetupVersion)
include(ROCMCreatePackage)
include(ROCMInstallTargets)
include(ROCMPackageConfigHelpers)
include(ROCMInstallSymlinks)
include(ROCMCheckTargetIds)
include(ROCMClients)
include(ROCMHeaderWrapper)
================================================
FILE: cmake/DownloadProject.CMakeLists.cmake.in
================================================
# Distributed under the OSI-approved MIT License. See accompanying
# file LICENSE or https://github.com/Crascit/DownloadProject for details.
# CMake 4.0 and newer refuse policy versions below 3.5. The min...max form keeps
# the historical 2.8.2 floor for old CMake releases while requesting 3.5
# policies on releases that understand the range syntax.
cmake_minimum_required(VERSION 2.8.2...3.5)

# Throw-away project (no languages enabled) whose only job is to run the
# ExternalProject steps at configure time of the main project.
project(${DL_ARGS_PROJ}-download NONE)

include(ExternalProject)
# Caller-supplied arguments are forwarded as-is; the sources land in
# SOURCE_DIR and the build happens in-source. No test step is run.
ExternalProject_Add(${DL_ARGS_PROJ}-download
                    ${DL_ARGS_UNPARSED_ARGUMENTS}
                    SOURCE_DIR          "${DL_ARGS_SOURCE_DIR}"
                    BUILD_IN_SOURCE     TRUE
                    TEST_COMMAND        ""
)
================================================
FILE: cmake/DownloadProject.cmake
================================================
# Distributed under the OSI-approved MIT License. See accompanying
# file LICENSE or https://github.com/Crascit/DownloadProject for details.
#
# MODULE: DownloadProject
#
# PROVIDES:
# download_project( PROJ projectName
# [PREFIX prefixDir]
# [DOWNLOAD_DIR downloadDir]
# [SOURCE_DIR srcDir]
# [BINARY_DIR binDir]
# [QUIET]
# ...
# )
#
# Provides the ability to download and unpack a tarball, zip file, git repository,
# etc. at configure time (i.e. when the cmake command is run). How the downloaded
# and unpacked contents are used is up to the caller, but the motivating case is
# to download source code which can then be included directly in the build with
# add_subdirectory() after the call to download_project(). Source and build
# directories are set up with this in mind.
#
# The PROJ argument is required. The projectName value will be used to construct
# the following variables upon exit (obviously replace projectName with its actual
# value):
#
# projectName_SOURCE_DIR
# projectName_BINARY_DIR
#
# The SOURCE_DIR and BINARY_DIR arguments are optional and would not typically
# need to be provided. They can be specified if you want the downloaded source
# and build directories to be located in a specific place. The contents of
# projectName_SOURCE_DIR and projectName_BINARY_DIR will be populated with the
# locations used whether you provide SOURCE_DIR/BINARY_DIR or not.
#
# The DOWNLOAD_DIR argument does not normally need to be set. It controls the
# location of the temporary CMake build used to perform the download.
#
# The PREFIX argument can be provided to change the base location of the default
# values of DOWNLOAD_DIR, SOURCE_DIR and BINARY_DIR. If all of those three arguments
# are provided, then PREFIX will have no effect. The default value for PREFIX is
# CMAKE_BINARY_DIR.
#
# The QUIET option can be given if you do not want to show the output associated
# with downloading the specified project.
#
# In addition to the above, any other options are passed through unmodified to
# ExternalProject_Add() to perform the actual download, patch and update steps.
#
# Only those ExternalProject_Add() arguments which relate to downloading, patching
# and updating of the project sources are intended to be used. Also note that at
# least one set of download-related arguments are required.
#
# If using CMake 3.2 or later, the UPDATE_DISCONNECTED option can be used to
# prevent a check at the remote end for changes every time CMake is run
# after the first successful download. See the documentation of the ExternalProject
# module for more information. It is likely you will want to use this option if it
# is available to you. Note, however, that the ExternalProject implementation contains
# bugs which result in incorrect handling of the UPDATE_DISCONNECTED option when
# using the URL download method or when specifying a SOURCE_DIR with no download
# method. Fixes for these have been created, the last of which is scheduled for
# inclusion in CMake 3.8.0. Details can be found here:
#
# https://gitlab.kitware.com/cmake/cmake/commit/bdca68388bd57f8302d3c1d83d691034b7ffa70c
# https://gitlab.kitware.com/cmake/cmake/issues/16428
#
# If you experience build errors related to the update step, consider avoiding
# the use of UPDATE_DISCONNECTED.
#
# EXAMPLE USAGE:
#
# include(DownloadProject)
# download_project(PROJ googletest
# GIT_REPOSITORY https://github.com/google/googletest.git
# GIT_TAG master
# UPDATE_DISCONNECTED 1
# QUIET
# )
#
# add_subdirectory(${googletest_SOURCE_DIR} ${googletest_BINARY_DIR})
#
#========================================================================================
# Remember the directory of this module so that download_project() can locate
# the CMakeLists template that sits next to it.
set(_DownloadProjectDir "${CMAKE_CURRENT_LIST_DIR}")

include(CMakeParseArguments)

# download_project(PROJ <name> [PREFIX dir] [DOWNLOAD_DIR dir] [SOURCE_DIR dir]
#                  [BINARY_DIR dir] [QUIET] ...)
# Generates a helper CMake project from the template, then configures and builds
# it at configure time. Sets <name>_SOURCE_DIR and <name>_BINARY_DIR in the
# caller's scope. Unrecognised arguments end up in DL_ARGS_UNPARSED_ARGUMENTS,
# which the template forwards to ExternalProject_Add().
function(download_project)
  cmake_parse_arguments(DL_ARGS
    "QUIET"
    "PROJ;PREFIX;DOWNLOAD_DIR;SOURCE_DIR;BINARY_DIR"
    ""
    ${ARGN})

  # Optional keyword appended to the execute_process() calls to hide their output
  if(NOT DL_ARGS_QUIET)
    set(_dl_output_flag "")
    message(STATUS "Downloading/updating ${DL_ARGS_PROJ}")
  else()
    set(_dl_output_flag "OUTPUT_QUIET")
  endif()

  # Base directory for the helper project and the default source/binary
  # directories. It must always be an absolute path.
  if(DL_ARGS_PREFIX)
    get_filename_component(DL_ARGS_PREFIX "${DL_ARGS_PREFIX}" ABSOLUTE
                           BASE_DIR "${CMAKE_CURRENT_BINARY_DIR}")
  else()
    set(DL_ARGS_PREFIX "${CMAKE_BINARY_DIR}")
  endif()

  # Fill in every directory the caller did not provide
  set(_dl_default_DOWNLOAD_DIR "${DL_ARGS_PREFIX}/${DL_ARGS_PROJ}-download")
  set(_dl_default_SOURCE_DIR   "${DL_ARGS_PREFIX}/${DL_ARGS_PROJ}-src")
  set(_dl_default_BINARY_DIR   "${DL_ARGS_PREFIX}/${DL_ARGS_PROJ}-build")
  foreach(_dl_key DOWNLOAD_DIR SOURCE_DIR BINARY_DIR)
    if(NOT DL_ARGS_${_dl_key})
      set(DL_ARGS_${_dl_key} "${_dl_default_${_dl_key}}")
    endif()
  endforeach()

  # Ensure the caller can know where to find the source and build directories
  set(${DL_ARGS_PROJ}_SOURCE_DIR "${DL_ARGS_SOURCE_DIR}" PARENT_SCOPE)
  set(${DL_ARGS_PROJ}_BINARY_DIR "${DL_ARGS_BINARY_DIR}" PARENT_SCOPE)

  # CLion's handling of multiple configurations can copy a CMakeCache.txt with
  # hard-coded paths into the helper project, which makes its build fail.
  # Removing the cache before configuring is safe because it is regenerated,
  # and since this only happens at configure time it causes no extra builds
  # or downloads.
  file(REMOVE "${DL_ARGS_DOWNLOAD_DIR}/CMakeCache.txt")

  # Generate the helper project, then configure and build it. Steps already
  # completed in an earlier run do not cause anything to be updated.
  # CMAKE_MAKE_PROGRAM is passed through in case the main project has it set
  # to something not findable on the PATH.
  configure_file("${_DownloadProjectDir}/DownloadProject.CMakeLists.cmake.in"
                 "${DL_ARGS_DOWNLOAD_DIR}/CMakeLists.txt")

  execute_process(
    COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}"
            -D "CMAKE_MAKE_PROGRAM:FILE=${CMAKE_MAKE_PROGRAM}"
            .
    WORKING_DIRECTORY "${DL_ARGS_DOWNLOAD_DIR}"
    RESULT_VARIABLE _dl_status
    ${_dl_output_flag}
  )
  if(_dl_status)
    message(FATAL_ERROR "CMake step for ${DL_ARGS_PROJ} failed: ${_dl_status}")
  endif()

  execute_process(
    COMMAND ${CMAKE_COMMAND} --build . -j16
    WORKING_DIRECTORY "${DL_ARGS_DOWNLOAD_DIR}"
    RESULT_VARIABLE _dl_status
    ${_dl_output_flag}
  )
  if(_dl_status)
    message(FATAL_ERROR "Build step for ${DL_ARGS_PROJ} failed: ${_dl_status}")
  endif()
endfunction()
================================================
FILE: cmake/FindIBVerbs.cmake
================================================
# MIT License
#
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Find module for libibverbs.
# Optional hints: IBVERBS_INCLUDE_DIR, IBVERBS_LIB_DIR, IBVERBS_ROOT_DIR
# Results:        IBVERBS_INCLUDE_DIRS, IBVERBS_LIBRARIES, and the found flag
#                 set by find_package_handle_standard_args(IBVerbs ...)
find_path(IBVERBS_INCLUDE_DIRS
  NAMES infiniband/verbs.h
  HINTS
    ${IBVERBS_INCLUDE_DIR}
    ${IBVERBS_ROOT_DIR}
    ${IBVERBS_ROOT_DIR}/include)

find_library(IBVERBS_LIBRARIES
  NAMES ibverbs
  HINTS
    ${IBVERBS_LIB_DIR}
    ${IBVERBS_ROOT_DIR}
    ${IBVERBS_ROOT_DIR}/lib)

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(IBVerbs DEFAULT_MSG IBVERBS_INCLUDE_DIRS IBVERBS_LIBRARIES)

# Hide the two result cache entries created above. The previous call named
# IBVERBS_INCLUDE_DIR, which is only an input hint, not a result variable.
mark_as_advanced(IBVERBS_INCLUDE_DIRS IBVERBS_LIBRARIES)
================================================
FILE: cmake/Findmscclpp_nccl.cmake
================================================
# MIT License
#
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Find module for the mscclpp_nccl library installed under MSCCLPP_ROOT.
# Results: MSCCLPP_INCLUDE_DIRS, MSCCLPP_LIBRARIES
include(FindPackageHandleStandardArgs)

# Library: ${MSCCLPP_ROOT}/lib is tried ahead of the default search locations
find_library(MSCCLPP_LIBRARIES
  NAMES mscclpp_nccl
  HINTS ${MSCCLPP_ROOT}/lib
)

# Headers: identified by mscclpp/gpu.hpp
find_path(MSCCLPP_INCLUDE_DIRS
  NAMES mscclpp/gpu.hpp
  HINTS ${MSCCLPP_ROOT}/include
)

find_package_handle_standard_args(mscclpp_nccl
  DEFAULT_MSG
  MSCCLPP_INCLUDE_DIRS
  MSCCLPP_LIBRARIES
)

mark_as_advanced(MSCCLPP_INCLUDE_DIRS MSCCLPP_LIBRARIES)
================================================
FILE: cmake/Findrocshmem_static.cmake
================================================
# MIT License
#
# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Find module for a rocSHMEM installation under ROCSHMEM_INSTALL_DIR.
# Results: ROCSHMEM_INCLUDE_DIR, ROCSHMEM_LIBRARY, IBVERBS

# find_package_handle_standard_args() is defined by this module; include it
# here so this file does not rely on another Find module having loaded it first.
include(FindPackageHandleStandardArgs)

find_path(ROCSHMEM_INCLUDE_DIR
  NAMES rocshmem/rocshmem.hpp rocshmem/rocshmem.h
  HINTS ${ROCSHMEM_INSTALL_DIR}/include/)

find_library(ROCSHMEM_LIBRARY
  NAMES rocshmem
  HINTS ${ROCSHMEM_INSTALL_DIR}/lib)

## -- todo --- what to do with verbs? add to handle args call below? -- ##
find_library(IBVERBS ibverbs)

find_package_handle_standard_args(rocshmem_static DEFAULT_MSG ROCSHMEM_INCLUDE_DIR ROCSHMEM_LIBRARY)

# Hide the result cache entries, as the other Find modules in this directory do
mark_as_advanced(ROCSHMEM_INCLUDE_DIR ROCSHMEM_LIBRARY)
================================================
FILE: cmake/MSCCLPP.cmake
================================================
# MIT License
#
# Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Dependencies
# HIP dependency is handled earlier in the project cmake file
# when VerifyCompiler.cmake is included.
# GIT
# Test dependencies
# For downloading, building, and installing required dependencies
include(cmake/DownloadProject.cmake)
if(ENABLE_MSCCLPP)
# Try to find the mscclpp install
set(MSCCLPP_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/ext/mscclpp CACHE PATH "")
execute_process(
COMMAND mkdir -p ${MSCCLPP_ROOT}
)
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
find_package(mscclpp_nccl)
#if(NOT mscclpp_nccl_FOUND)
# Ensure the source code is checked out
set(MSCCLPP_SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/mscclpp CACHE PATH "")
set(JSON_SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/json CACHE PATH "")
if((NOT EXISTS ${MSCCLPP_SOURCE}/CMakeLists.txt) OR (NOT EXISTS ${JSON_SOURCE}/CMakeLists.txt))
message(STATUS "Checking out external code")
execute_process(
COMMAND git submodule update --init --recursive
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
)
endif()
execute_process(
COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/cpx.patch
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
)
execute_process(
COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/read-allred.patch
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
)
execute_process(
COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/mscclpp_ibv_access_relaxed_ordering.patch
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
)
execute_process(
COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/mem-reg.patch
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
)
execute_process(
COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/non-multiple-128-fix.patch
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
)
execute_process(
COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/bf16-tuning.patch
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
)
execute_process(
COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/reg-fix.patch
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
)
execute_process(
COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/no-cache.patch
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
)
execute_process(
COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/device-flag.patch
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
)
execute_process(
COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/remove-clip.patch
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
)
execute_process(
COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/disable-executor.patch
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
)
execute_process(
COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/disable-format-checks.patch
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
)
set(CMAKE_INHERITED_ARGS "")
set(CMAKE_ARGS_LIST "CMAKE_PREFIX_PATH;CMAKE_INSTALL_RPATH_USE_LINK_PATH;HIP_COMPILER")
foreach(arg IN LISTS CMAKE_ARGS_LIST)
if(DEFINED ${arg})
string(REPLACE ";" "%" ARG_VALUE "${${arg}}") # Replace ; with new list separator symbol % to avoid CMake errors
string(STRIP "${ARG_VALUE}" ARG_VALUE) # Eliminate whitespace, reducing to empty string if necessary
# Only add a cmake argument if it has a value
if("${ARG_VALUE}" STREQUAL "")
continue()
endif()
string(APPEND CMAKE_INHERITED_ARGS "-D${arg}=\"${ARG_VALUE}\" ")
endif()
endforeach()
if(NOT DEFINED CACHE{MSCCLPP_GPU_TARGETS})
message(STATUS "Building MSCCL++ only for supported variants: gfx942;gfx950")
set(MSCCLPP_GPU_TARGETS "gfx942;gfx950")
if(BUILD_ADDRESS_SANITIZER)
set(MSCCLPP_GPU_TARGETS "gfx942:xnack+;gfx950:xnack+")
endif()
else()
message(STATUS "Building MSCCL++ for ${MSCCLPP_GPU_TARGETS}")
endif()
string(REPLACE ";" "%" MSCCLPP_GPU_TARGETS "${MSCCLPP_GPU_TARGETS}")
download_project(PROJ mscclpp_nccl
#GIT_REPOSITORY https://github.com/microsoft/mscclpp.git
#GIT_TAG 4ee15b7ad085daaf74349d4c49c9b8480d28f0dc
INSTALL_DIR ${MSCCLPP_ROOT}
LIST_SEPARATOR %
CMAKE_ARGS "-DGPU_TARGETS=${MSCCLPP_GPU_TARGETS}" -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DMSCCLPP_BUILD_APPS_NCCL=ON -DMSCCLPP_BUILD_PYTHON_BINDINGS=OFF -DMSCCLPP_BUILD_TESTS=OFF -DMSCCLPP_CLIP_ENABLED=${ENABLE_MSCCLPP_CLIP} -DMSCCLPP_ENABLE_EXECUTOR=${ENABLE_MSCCLPP_EXECUTOR} -DMSCCLPP_ENABLE_FORMAT_CHECKS=${ENABLE_MSCCLPP_FORMAT_CHECKS} -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR> -DCMAKE_VERBOSE_MAKEFILE=1 "${CMAKE_INHERITED_ARGS}" -DFETCHCONTENT_SOURCE_DIR_JSON=${JSON_SOURCE}
LOG_DOWNLOAD FALSE
LOG_CONFIGURE FALSE
LOG_BUILD FALSE
LOG_INSTALL FALSE
UPDATE_DISCONNECTED TRUE
SOURCE_DIR ${MSCCLPP_SOURCE}
)
find_package(mscclpp_nccl REQUIRED)
execute_process(
COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/disable-format-checks.patch
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
)
execute_process(
COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/disable-executor.patch
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
)
execute_process(
COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/remove-clip.patch
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
)
execute_process(
COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/device-flag.patch
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
)
execute_process(
COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/no-cache.patch
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
)
execute_process(
COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/reg-fix.patch
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
)
execute_process(
COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/bf16-tuning.patch
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
)
execute_process(
COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/non-multiple-128-fix.patch
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
)
execute_process(
COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/mem-reg.patch
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
)
execute_process(
COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/mscclpp_ibv_access_relaxed_ordering.patch
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
)
execute_process(
COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/read-allred.patch
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
)
execute_process(
COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/cpx.patch
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
)
#endif()
# Produce a copy of the MSCCL++ NCCL static archive whose symbols are renamed
# according to mscclpp_nccl_syms.txt, and write it into the build tree.
execute_process(COMMAND objcopy
  --redefine-syms=${CMAKE_CURRENT_SOURCE_DIR}/src/misc/mscclpp/mscclpp_nccl_syms.txt
  "${MSCCLPP_ROOT}/lib/libmscclpp_nccl_static.a"
  "${PROJECT_BINARY_DIR}/libmscclpp_nccl.a"
  RESULT_VARIABLE _mscclpp_objcopy_result
)
# Surface a failed rename now; otherwise the first symptom is a missing
# libmscclpp_nccl.a at link time, far away from the real cause.
if(NOT "${_mscclpp_objcopy_result}" STREQUAL "0")
  message(WARNING
    "objcopy failed (${_mscclpp_objcopy_result}) while creating "
    "${PROJECT_BINARY_DIR}/libmscclpp_nccl.a from "
    "${MSCCLPP_ROOT}/lib/libmscclpp_nccl_static.a")
endif()

# Expose the renamed archive as an imported static library target.
add_library(mscclpp_nccl STATIC IMPORTED)
set_target_properties(mscclpp_nccl PROPERTIES IMPORTED_LOCATION ${PROJECT_BINARY_DIR}/libmscclpp_nccl.a)
endif()
================================================
FILE: cmake/ROCSHMEM.cmake
================================================
# MIT License
#
# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
include(ExternalProject)
# add_rocshmem_targets()
#
# Makes rocSHMEM available to the calling scope, either from an existing
# installation (when ROCSHMEM_INSTALL_DIR is set and find_package succeeds) or
# by building the ext-src/rocSHMEM submodule into ext/rocshmem.
#
# Variables set in the caller's scope:
#   ROCSHMEM_INCLUDE_DIR - rocSHMEM header directory
#   ROCSHMEM_LIBRARY     - rocSHMEM static library
#   IBVERBS              - path to libibverbs (FATAL_ERROR if it is not found)
#
# Targets created only when building from the submodule:
#   rocshmem_checkout_submodule, rocshmem_ext, rocshmem_static
function(add_rocshmem_targets)
  # Check for an existing installation via the user-provided prefix ROCSHMEM_INSTALL_DIR
  if(ROCSHMEM_INSTALL_DIR)
    list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
    find_package(rocshmem_static)
    if(NOT IBVERBS)
      find_library(IBVERBS ibverbs)
      if(IBVERBS)
        set(IBVERBS ${IBVERBS} PARENT_SCOPE)
      endif()
    endif()
  endif()

  if(NOT rocshmem_static_FOUND)
    # No pre-existing installation: build from the submodule into ext/rocshmem
    set(_rccl_root "${CMAKE_SOURCE_DIR}")
    set(ROCSHMEM_SOURCE "${_rccl_root}/ext-src/rocSHMEM")
    set(ROCSHMEM_INSTALL_DIR "${_rccl_root}/ext/rocshmem")

    # Make sure the submodule exists (same style as MSCCL++: custom rule + target)
    add_custom_command(
      OUTPUT "${ROCSHMEM_SOURCE}/CMakeLists.txt"
      COMMAND git submodule update --init --recursive ext-src/rocSHMEM
      WORKING_DIRECTORY "${_rccl_root}"
      COMMENT "Checking out submodule: ext-src/rocSHMEM"
      VERBATIM
    )
    add_custom_target(rocshmem_checkout_submodule
      DEPENDS "${ROCSHMEM_SOURCE}/CMakeLists.txt")

    # Where our patch files live (like MSCCL++)
    set(EXT_SOURCE "${_rccl_root}/ext-src")

    # Build and install rocSHMEM. The steps run from a 'build' directory inside
    # the source tree: first the project's scripts/build_configs/gda_bnxt
    # script, then an explicit configure, build and install.
    #
    # Each step is its own COMMAND so that ExternalProject runs them in
    # sequence and stops at the first failure, instead of relying on a literal
    # `&&` argument being interpreted by a shell. The build and install steps
    # go through `cmake --build` so they use whatever build tool the inner
    # build tree was configured with, rather than the outer project's
    # CMAKE_MAKE_PROGRAM (which may be ninja and does not accept a bare `-j`).
    ExternalProject_Add(rocshmem_ext
      SOURCE_DIR        "${ROCSHMEM_SOURCE}"
      INSTALL_DIR       "${ROCSHMEM_INSTALL_DIR}"
      UPDATE_DISCONNECTED TRUE
      LOG_DOWNLOAD      FALSE
      LOG_CONFIGURE     FALSE
      LOG_BUILD         FALSE
      LOG_INSTALL       FALSE
      BUILD_IN_SOURCE   TRUE
      DOWNLOAD_COMMAND  ""   # using the submodule checkout above
      TEST_COMMAND      ""
      DEPENDS           rocshmem_checkout_submodule
      # Rocshmem submodule commit hash -> commit b28a56bd54ccc581d05a439ffa466c3dacb3385
      CONFIGURE_COMMAND ""
      BUILD_COMMAND
        ${CMAKE_COMMAND} -E make_directory build
      COMMAND
        ${CMAKE_COMMAND} -E chdir build bash -lc "../scripts/build_configs/gda_bnxt -DUSE_EXTERNAL_MPI=OFF -DUSE_IPC=ON -DBUILD_EXAMPLES=OFF "
      COMMAND
        ${CMAKE_COMMAND} -E chdir build ${CMAKE_COMMAND}
          -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
          -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR>
          -DBUILD_EXAMPLES=OFF ..
      COMMAND
        ${CMAKE_COMMAND} --build build --parallel
      INSTALL_COMMAND
        ${CMAKE_COMMAND} --build build --target install
    )

    # Define the variables RCCL expects; the files exist once rocshmem_ext has run
    set(ROCSHMEM_INCLUDE_DIR "${ROCSHMEM_INSTALL_DIR}/include" PARENT_SCOPE)
    set(ROCSHMEM_LIBRARY "${ROCSHMEM_INSTALL_DIR}/lib/librocshmem.a" PARENT_SCOPE)

    # Provide a dummy target other code can depend on. The ordering against the
    # external project is a target-level dependency, so declare it with
    # add_dependencies().
    add_custom_target(rocshmem_static ALL)
    add_dependencies(rocshmem_static rocshmem_ext)
  else()
    # We found a prebuilt rocSHMEM; export variables upward as-is
    set(ROCSHMEM_INCLUDE_DIR "${ROCSHMEM_INCLUDE_DIR}" PARENT_SCOPE)
    set(ROCSHMEM_LIBRARY "${ROCSHMEM_LIBRARY}" PARENT_SCOPE)
  endif()

  # libibverbs is required in both cases; resolve it once and export the result
  find_library(_IBVERBS ibverbs)
  if(NOT _IBVERBS)
    message(FATAL_ERROR "libibverbs not found (install rdma-core/libibverbs-dev)")
  endif()
  set(IBVERBS ${_IBVERBS} PARENT_SCOPE)
endfunction()
================================================
FILE: cmake/rcclRAS.cmake
================================================
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
cmake_minimum_required(VERSION 3.16)

message("Building rccl RAS client executable")

# The RAS client is built from the hipified copy of src/ras/client.cc.
add_executable(rcclras "${PROJECT_BINARY_DIR}/hipify/src/ras/client.cc")

target_include_directories(rcclras PRIVATE
  ${PROJECT_BINARY_DIR}/include
  ${HIPIFY_DIR}/src
  ${HIPIFY_DIR}/src/include
)

# Libraries needed regardless of how rccl itself is built.
target_link_libraries(rcclras PRIVATE hip::host dl)

if(NOT BUILD_SHARED_LIBS)
  # Static rccl: only order the build after rccl, and link it (plus the HIP
  # runtime) through explicit linker flags instead of the CMake target.
  add_dependencies(rcclras rccl)
  target_link_libraries(rcclras PRIVATE dl rt -lrccl -L${CMAKE_BINARY_DIR} -lamdhip64 -L${ROCM_PATH}/lib)
else()
  # Shared rccl: link the rccl target directly.
  target_link_libraries(rcclras PRIVATE rccl hip::device)
endif()

rocm_install(TARGETS rcclras)
================================================
FILE: cmake/rocmIb.cmake
================================================
# MIT License
#
# Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Dependencies
# HIP dependency is handled earlier in the project cmake file
# when VerifyCompiler.cmake is included.
# GIT
# Test dependencies
# For downloading, building, and installing required dependencies
# For downloading, building, and installing required dependencies
include(cmake/DownloadProject.cmake)

message(STATUS "Generating ROCM NetIB... ")

# -------------------------
# Configurable paths
# -------------------------

# Path to RCCL source tree (local clone)
set(RCCL_SRC_DIR "${CMAKE_SOURCE_DIR}" CACHE PATH "Path to RCCL source directory")

# Path to patch file
set(ROCM_NETIB_PATCH_FILE "${CMAKE_SOURCE_DIR}/ext-src/rocm_netib.patch" CACHE FILEPATH "ROCM NETIB Patch file to apply to RCCL")
set(ROCM_NETIB_FILE "${CMAKE_SOURCE_DIR}/src/transport/net_ib_rocm.cc" CACHE FILEPATH "Generated ROCM NETIB file")

# -------------------------
# Find tools
# -------------------------
find_program(PATCH_EXECUTABLE patch)
find_program(SED_EXECUTABLE sed)

# Use the located tools; fall back to the bare command names (resolved through
# PATH at execution time) when find_program did not locate them.
set(_rocm_netib_patch_tool "${PATCH_EXECUTABLE}")
if(NOT PATCH_EXECUTABLE)
  set(_rocm_netib_patch_tool patch)
endif()
set(_rocm_netib_sed_tool "${SED_EXECUTABLE}")
if(NOT SED_EXECUTABLE)
  set(_rocm_netib_sed_tool sed)
endif()

# -------------------------
# Step 1: apply the patch, writing the patched result to ROCM_NETIB_FILE
# -------------------------
# The progress message is printed with message(): placing an `echo` COMMAND in
# the same execute_process() as `patch` would run the two as a pipeline and
# feed the text to patch's stdin instead of showing it.
message(STATUS "Applying RCCL ROCM NetIB patch... to ${CMAKE_SOURCE_DIR}")
execute_process(
  COMMAND "${_rocm_netib_patch_tool}" -p1 -i "${ROCM_NETIB_PATCH_FILE}" -o "${ROCM_NETIB_FILE}"
  WORKING_DIRECTORY ${RCCL_SRC_DIR}
  RESULT_VARIABLE _rocm_netib_patch_result
)
if(NOT "${_rocm_netib_patch_result}" STREQUAL "0")
  message(WARNING
    "Applying ${ROCM_NETIB_PATCH_FILE} failed (${_rocm_netib_patch_result}); "
    "${ROCM_NETIB_FILE} may be missing or incomplete")
endif()

# -------------------------
# Step 2: rename identifiers in the generated file
# -------------------------
# sed expressions, applied to every line in exactly this order. Order matters
# where one pattern is a prefix of another (e.g. ncclIbRegMrDmaBufInternal
# before ncclIbRegMrDmaBuf before ncclIbRegMr). Running them as ordered -e
# expressions of one sed process gives the same result as running one sed
# process per expression, because each substitution is confined to a line.
set(_rocm_netib_sed_exprs
  "s/NCCL_PARAM(Ib/NCCL_PARAM(RocmIb/g"
  "s/RCCL_PARAM(Ib/RCCL_PARAM(RocmIb/g"
  "s/ncclParamIb/ncclParamRocmIb/g"
  "s/rcclParamIb/rcclParamRocmIb/g"
  "s/ncclIbMergedDevs/rocmIbMergedDevs/g"
  "s/ncclIbDevs/rocmIbDevs/g"
  "s/ncclIbLock/rocmIbLock/g"
  "s/ibProviderName/rocmIbProviderName/g"
  "s/ncclIbAsyncThread/rocmIbAsyncThread/g"
  "s/ncclIbGdrSupport/rocmIbGdrSupport/g"
  "s/ncclIbDmaBufSupport/rocmIbDmaBufSupport/g"
  "s/ncclIbInitCommDevBase/rocmIbInitCommDevBase/g"
  "s/ncclIbDestroyBase/rocmIbDestroyBase/g"
  "s/ncclIbRtrQp/rocmIbRtrQp/g"
  "s/ncclIbRtsQp/rocmIbRtsQp/g"
  "s/ForceEnableGdrdma/RocmForceEnableGdrdma/g"
  "s/ncclIbCheckVProps/rocmIbCheckVProps/g"
  "s/ncclIbGetRequest/rocmIbGetRequest/g"
  "s/ncclIbFreeRequest/rocmIbFreeRequest/g"
  "s/ncclIbRegMrDmaBufInternal/rocmIbRegMrDmaBufInternal/g"
  "s/ncclIbGetNetCommDevBase/rocmIbGetNetCommDevBase/g"
  "s/ncclIbDeregMrInternal/rocmIbDeregMrInternal/g"
  "s/ncclIbPostFifo/rocmIbPostFifo/g"
  "s/reqTypeStr/rocmIbReqTypeStr/g"
  "s/rcclNetP2pPolicy/rcclRocmNetP2pPolicy/g"
  "s/ncclIbMakeVDeviceInternal/rocmIbMakeVDeviceInternal/g"
  "s/ncclIbMakeVDevice/rocmIbMakeVDevice/g"
  "s/ncclIbInit/rocmIbInit/g"
  "s/ncclIbDevices/rocmIbDevices/g"
  "s/ncclIbGetPhysProperties/rocmIbGetPhysProperties/g"
  "s/ncclIbGetProperties/rocmIbGetProperties/g"
  "s/ncclIbListen(/rocmIbListen(/g"
  "s/ncclIbListen,/rocmIbListen,/g"
  "s/ncclIbConnect(/rocmIbConnect(/g"
  "s/ncclIbConnect /rocmIbConnect /g"
  "s/ncclIbConnect,/rocmIbConnect,/g"
  "s/ncclIbAccept/rocmIbAccept/g"
  "s/ncclIbTest/rocmIbTest/g"
  "s/ncclIbRegMrDmaBuf/rocmIbRegMrDmaBuf/g"
  "s/ncclIbRegMr/rocmIbRegMr/g"
  "s/ncclIbDeregMr/rocmIbDeregMr/g"
  "s/ncclIbIsend/rocmIbIsend/g"
  "s/ncclIbIrecv/rocmIbIrecv/g"
  "s/ncclIbIflush/rocmIbIflush/g"
  "s/ncclIbCloseSend/rocmIbCloseSend/g"
  "s/ncclIbCloseRecv/rocmIbCloseRecv/g"
  "s/ncclIbCloseListen/rocmIbCloseListen/g"
  "s/ncclNetIb/rocmNetIb/g"
  "s/ncclIbFinalize/rocmNetIbFinalize/g"
  "s/ncclIbSetNetAttr/rocmNetIbSetNetAttr/g"
)

# Turn the expression list into `-e <expr>` argument pairs.
set(_rocm_netib_sed_args)
foreach(_rocm_netib_expr IN LISTS _rocm_netib_sed_exprs)
  list(APPEND _rocm_netib_sed_args -e "${_rocm_netib_expr}")
endforeach()

# sed is invoked directly (no intermediate shell), so expressions and paths
# containing spaces or parentheses are passed through unchanged.
execute_process(
  COMMAND "${_rocm_netib_sed_tool}" -i ${_rocm_netib_sed_args} "${ROCM_NETIB_FILE}"
  WORKING_DIRECTORY ${RCCL_SRC_DIR}
  RESULT_VARIABLE _rocm_netib_sed_result
)
if(NOT "${_rocm_netib_sed_result}" STREQUAL "0")
  message(WARNING
    "Renaming symbols in ${ROCM_NETIB_FILE} failed (${_rocm_netib_sed_result}); "
    "the generated file may still contain the original identifiers")
endif()

list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
================================================
FILE: cmake/scripts/add_faults.sh
================================================
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, dis
gitextract_sughb21j/
├── .azuredevops/
│ ├── multinode-ci-nightly.yml
│ ├── multinode-ci-pr.yml
│ ├── multinode-ci-slurm-nightly.yml
│ ├── multinode-ci-slurm-pr.yml
│ ├── rocm-ci.yml
│ ├── slurm/
│ │ ├── build.sh
│ │ ├── test_rccl-UnitTests.sh
│ │ └── test_rccl-tests.sh
│ ├── templates/
│ │ ├── build.yml
│ │ ├── test_rccl-UnitTests.yml
│ │ └── test_rccl-tests.yml
│ └── tests/
│ └── pytest/
│ └── HelloWorld.py
├── .clang-format
├── .github/
│ ├── CODEOWNERS
│ ├── PULL_REQUEST_TEMPLATE.md
│ ├── dependabot.yml
│ ├── scripts/
│ │ └── therock_configure_ci.py
│ └── workflows/
│ ├── therock-ci-linux.yml
│ ├── therock-ci.yml
│ ├── therock-test-packages-multi-node.yml
│ └── therock-test-packages-single-node.yml
├── .gitignore
├── .gitmodules
├── .readthedocs.yaml
├── CHANGELOG.md
├── CMakeLists.txt
├── CppCheckSuppressions.txt
├── LICENSE.txt
├── Makefile
├── NOTICES.txt
├── README.md
├── cmake/
│ ├── CheckSymbolExistsNoWarn.cmake
│ ├── Dependencies.cmake
│ ├── DownloadProject.CMakeLists.cmake.in
│ ├── DownloadProject.cmake
│ ├── FindIBVerbs.cmake
│ ├── Findmscclpp_nccl.cmake
│ ├── Findrocshmem_static.cmake
│ ├── MSCCLPP.cmake
│ ├── ROCSHMEM.cmake
│ ├── rcclRAS.cmake
│ ├── rocmIb.cmake
│ └── scripts/
│ ├── add_faults.sh
│ ├── add_unroll.sh
│ ├── extract_metadata.cmake
│ └── git_version.cmake
├── docker/
│ ├── Dockerfile.ubuntu
│ └── README.md
├── docs/
│ ├── .gitignore
│ ├── api-reference/
│ │ ├── api-library.rst
│ │ ├── env-variables.rst
│ │ └── library-specification.rst
│ ├── attributions.rst
│ ├── conf.py
│ ├── doxygen/
│ │ └── Doxyfile
│ ├── how-to/
│ │ ├── rccl-usage-tips.rst
│ │ ├── troubleshooting-rccl.rst
│ │ ├── using-nccl.rst
│ │ └── using-rccl-tuner-plugin-api.rst
│ ├── index.rst
│ ├── install/
│ │ ├── building-installing.rst
│ │ ├── docker-install.rst
│ │ └── installation.rst
│ ├── license.rst
│ ├── sphinx/
│ │ ├── _toc.yml.in
│ │ ├── requirements.in
│ │ └── requirements.txt
│ └── what-is-rccl.rst
├── ext-net/
│ ├── README.md
│ ├── example/
│ │ ├── CMakeLists.txt
│ │ ├── Makefile
│ │ ├── nccl/
│ │ │ ├── common.h
│ │ │ ├── err.h
│ │ │ ├── net.h
│ │ │ ├── net_device.h
│ │ │ ├── net_v10.h
│ │ │ ├── net_v11.h
│ │ │ ├── net_v2.h
│ │ │ ├── net_v3.h
│ │ │ ├── net_v4.h
│ │ │ ├── net_v5.h
│ │ │ ├── net_v6.h
│ │ │ ├── net_v7.h
│ │ │ ├── net_v8.h
│ │ │ ├── net_v9.h
│ │ │ └── types.h
│ │ └── plugin.c
│ └── google-fastsocket/
│ └── Makefile
├── ext-profiler/
│ ├── README.md
│ ├── example/
│ │ ├── CMakeLists.txt
│ │ ├── Makefile
│ │ ├── README.md
│ │ ├── event.h
│ │ ├── nccl/
│ │ │ ├── common.h
│ │ │ ├── err.h
│ │ │ ├── net_ib_v1.h
│ │ │ ├── net_socket_v1.h
│ │ │ ├── profiler.h
│ │ │ ├── profiler_net.h
│ │ │ ├── profiler_v1.h
│ │ │ ├── profiler_v2.h
│ │ │ ├── profiler_v3.h
│ │ │ ├── profiler_v4.h
│ │ │ ├── profiler_v5.h
│ │ │ └── types.h
│ │ ├── plugin.cc
│ │ ├── plugin.h
│ │ ├── print_event.cc
│ │ ├── print_event.h
│ │ └── queue.h
│ ├── google-CoMMA/
│ │ └── Makefile
│ └── inspector/
│ ├── Makefile
│ ├── README.md
│ ├── exporter/
│ │ └── example/
│ │ ├── README.md
│ │ ├── perf_summary_exporter.py
│ │ └── requirements.txt
│ ├── inspector.cc
│ ├── inspector.h
│ ├── inspector_plugin.cc
│ ├── json.cc
│ ├── json.h
│ ├── nccl/
│ │ ├── common.h
│ │ ├── profiler.h
│ │ ├── profiler_net.h
│ │ ├── profiler_v1.h
│ │ ├── profiler_v2.h
│ │ ├── profiler_v3.h
│ │ ├── profiler_v4.h
│ │ ├── profiler_v5.h
│ │ └── types.h
│ └── version.h
├── ext-src/
│ ├── bf16-tuning.patch
│ ├── check_ibv_access_relaxed_ordering.cc
│ ├── cpx.patch
│ ├── device-flag.patch
│ ├── disable-executor.patch
│ ├── disable-format-checks.patch
│ ├── mem-reg.patch
│ ├── mscclpp_ibv_access_relaxed_ordering.patch
│ ├── no-cache.patch
│ ├── non-multiple-128-fix.patch
│ ├── read-allred.patch
│ ├── reg-fix.patch
│ ├── remove-clip.patch
│ └── rocm_netib.patch
├── ext-tuner/
│ ├── README.md
│ ├── basic/
│ │ ├── Makefile
│ │ ├── README.md
│ │ ├── nccl/
│ │ │ ├── common.h
│ │ │ ├── err.h
│ │ │ └── tuner.h
│ │ └── plugin.c
│ ├── example/
│ │ ├── .gitignore
│ │ ├── CMakeLists.txt
│ │ ├── Makefile
│ │ ├── README.md
│ │ ├── nccl/
│ │ │ ├── common.h
│ │ │ ├── err.h
│ │ │ └── tuner.h
│ │ ├── nccl_tuner.conf
│ │ ├── plugin.c
│ │ ├── scripts/
│ │ │ ├── README.md
│ │ │ └── optimize_config.py
│ │ └── test/
│ │ ├── Makefile
│ │ ├── README.md
│ │ └── test_plugin.c
│ └── model_demo/
│ ├── Makefile
│ ├── README.md
│ ├── nccl/
│ │ ├── common.h
│ │ ├── err.h
│ │ └── tuner.h
│ └── plugin.c
├── install.sh
├── makefiles/
│ ├── common.mk
│ ├── formatting.mk
│ └── version.mk
├── pkg/
│ ├── Makefile
│ ├── debian/
│ │ ├── .gitignore
│ │ ├── Makefile
│ │ ├── changelog.in
│ │ ├── compat
│ │ ├── control.in
│ │ ├── gbp.conf
│ │ ├── libnccl-dev.install.in
│ │ ├── libnccl2.install.in
│ │ ├── rules
│ │ └── source/
│ │ └── format
│ ├── redhat/
│ │ ├── Makefile
│ │ └── nccl.spec.in
│ ├── srctxz/
│ │ ├── Makefile
│ │ └── create_srctxz.sh.in
│ └── txz/
│ ├── Makefile
│ └── create_txz.sh.in
├── rtest.xml
├── src/
│ ├── CMakeLists.txt
│ ├── Makefile
│ ├── allocator.cc
│ ├── bootstrap.cc
│ ├── ce_coll.cc
│ ├── channel.cc
│ ├── collectives.cc
│ ├── commDump.cc
│ ├── debug.cc
│ ├── dev_runtime.cc
│ ├── device/
│ │ ├── CMakeLists.txt
│ │ ├── Makefile
│ │ ├── all_gather.h
│ │ ├── all_reduce.h
│ │ ├── alltoall_gda.h
│ │ ├── alltoall_pivot.h
│ │ ├── broadcast.h
│ │ ├── common.cu
│ │ ├── common.h
│ │ ├── common_kernel.h
│ │ ├── generate.py
│ │ ├── msccl_kernel_impl.h
│ │ ├── network/
│ │ │ └── unpack/
│ │ │ ├── unpack.h
│ │ │ └── unpack_defs.h
│ │ ├── onerank.cu
│ │ ├── op128.h
│ │ ├── primitives.h
│ │ ├── prims_ll.h
│ │ ├── prims_ll128.h
│ │ ├── prims_simple.h
│ │ ├── rccl_metadata.h
│ │ ├── rccl_ptr.h
│ │ ├── reduce.h
│ │ ├── reduce_kernel.h
│ │ ├── reduce_scatter.h
│ │ ├── sendrecv.h
│ │ └── symmetric/
│ │ ├── all_gather.cuh
│ │ ├── all_reduce.cuh
│ │ ├── generate.py
│ │ ├── kernel.cuh
│ │ ├── primitives.cuh
│ │ └── reduce_scatter.cuh
│ ├── enhcompat.cc
│ ├── enqueue.cc
│ ├── graph/
│ │ ├── CMakeLists.txt
│ │ ├── connect.cc
│ │ ├── paths.cc
│ │ ├── rings.cc
│ │ ├── rings.h
│ │ ├── rome_models.cc
│ │ ├── rome_models.h
│ │ ├── search.cc
│ │ ├── topo.cc
│ │ ├── topo.h
│ │ ├── trees.cc
│ │ ├── tuning.cc
│ │ ├── xml.cc
│ │ └── xml.h
│ ├── group.cc
│ ├── include/
│ │ ├── BfdBacktrace.hpp
│ │ ├── alloc.h
│ │ ├── allocator.h
│ │ ├── alt_rsmi.h
│ │ ├── amdsmi_wrap.h
│ │ ├── api_trace.h
│ │ ├── archinfo.h
│ │ ├── argcheck.h
│ │ ├── bitops.h
│ │ ├── bootstrap.h
│ │ ├── ce_coll.h
│ │ ├── channel.h
│ │ ├── checks.h
│ │ ├── coll_net.h
│ │ ├── collectives.h
│ │ ├── comm.h
│ │ ├── core.h
│ │ ├── cpuset.h
│ │ ├── cudawrap.h
│ │ ├── debug.h
│ │ ├── dev_runtime.h
│ │ ├── device.h
│ │ ├── enqueue.h
│ │ ├── gdrwrap.h
│ │ ├── git_version.h
│ │ ├── graph.h
│ │ ├── group.h
│ │ ├── hip_rocm_version_info.h
│ │ ├── ibvcore.h
│ │ ├── ibvsymbols.h
│ │ ├── ibvwrap.h
│ │ ├── info.h
│ │ ├── ionic/
│ │ │ ├── ionicdvcore.h
│ │ │ ├── ionicdvsymbols.h
│ │ │ └── ionicdvwrap.h
│ │ ├── ipcsocket.h
│ │ ├── latency_profiler/
│ │ │ ├── CollTrace.h
│ │ │ ├── CollTraceEvent.h
│ │ │ ├── CollTraceFunc.h
│ │ │ ├── CollTraceUtils.h
│ │ │ ├── EventQueue.h
│ │ │ └── MIT-LICENSE.txt
│ │ ├── mlx5/
│ │ │ ├── mlx5dvcore.h
│ │ │ ├── mlx5dvsymbols.h
│ │ │ └── mlx5dvwrap.h
│ │ ├── mnnvl.h
│ │ ├── msccl/
│ │ │ ├── msccl_kernel.h
│ │ │ ├── msccl_lifecycle.h
│ │ │ ├── msccl_parser.h
│ │ │ ├── msccl_scheduler.h
│ │ │ ├── msccl_setup.h
│ │ │ ├── msccl_status.h
│ │ │ └── msccl_struct.h
│ │ ├── mscclpp/
│ │ │ └── mscclpp_nccl.h
│ │ ├── nccl_common.h
│ │ ├── nccl_device/
│ │ │ ├── README.md
│ │ │ ├── comm.h
│ │ │ ├── coop.h
│ │ │ ├── core.h
│ │ │ ├── impl/
│ │ │ │ ├── comm__funcs.h
│ │ │ │ ├── comm__types.h
│ │ │ │ ├── core__funcs.h
│ │ │ │ ├── core__types.h
│ │ │ │ ├── ll_a2a__funcs.h
│ │ │ │ ├── ll_a2a__types.h
│ │ │ │ ├── mem_barrier__funcs.h
│ │ │ │ ├── mem_barrier__types.h
│ │ │ │ ├── ptr__funcs.h
│ │ │ │ └── ptr__types.h
│ │ │ ├── ll_a2a.h
│ │ │ ├── mem_barrier.h
│ │ │ ├── ptr.h
│ │ │ └── utility.h
│ │ ├── nccl_device.h
│ │ ├── net.h
│ │ ├── net_device.h
│ │ ├── npkit/
│ │ │ ├── npkit.h
│ │ │ ├── npkit_event.h
│ │ │ └── npkit_struct.h
│ │ ├── nvmlwrap.h
│ │ ├── nvtx.h
│ │ ├── nvtx3/
│ │ │ ├── nvToolsExt.h
│ │ │ ├── nvToolsExtCounters.h
│ │ │ ├── nvToolsExtCuda.h
│ │ │ ├── nvToolsExtCudaRt.h
│ │ │ ├── nvToolsExtMem.h
│ │ │ ├── nvToolsExtMemCudaRt.h
│ │ │ ├── nvToolsExtOpenCL.h
│ │ │ ├── nvToolsExtPayload.h
│ │ │ ├── nvToolsExtPayloadHelper.h
│ │ │ ├── nvToolsExtSemanticsCounters.h
│ │ │ ├── nvToolsExtSemanticsScope.h
│ │ │ ├── nvToolsExtSync.h
│ │ │ ├── nvtx3.hpp
│ │ │ └── nvtxDetail/
│ │ │ ├── nvtxExtHelperMacros.h
│ │ │ ├── nvtxExtImpl.h
│ │ │ ├── nvtxExtImplCounters_v1.h
│ │ │ ├── nvtxExtImplMemCudaRt_v1.h
│ │ │ ├── nvtxExtImplMem_v1.h
│ │ │ ├── nvtxExtImplPayload_v1.h
│ │ │ ├── nvtxExtInit.h
│ │ │ ├── nvtxExtPayloadHelperInternal.h
│ │ │ ├── nvtxExtPayloadTypeInfo.h
│ │ │ ├── nvtxExtTypes.h
│ │ │ ├── nvtxImpl.h
│ │ │ ├── nvtxImplCore.h
│ │ │ ├── nvtxImplCudaRt_v3.h
│ │ │ ├── nvtxImplCuda_v3.h
│ │ │ ├── nvtxImplOpenCL_v3.h
│ │ │ ├── nvtxImplSync_v3.h
│ │ │ ├── nvtxInit.h
│ │ │ ├── nvtxInitDecls.h
│ │ │ ├── nvtxInitDefs.h
│ │ │ ├── nvtxLinkOnce.h
│ │ │ └── nvtxTypes.h
│ │ ├── nvtx_payload_schemas.h
│ │ ├── nvtx_stub.h
│ │ ├── p2p.h
│ │ ├── param.h
│ │ ├── plugin/
│ │ │ ├── nccl_net.h
│ │ │ ├── nccl_profiler.h
│ │ │ ├── nccl_tuner.h
│ │ │ ├── net/
│ │ │ │ ├── net_v10.h
│ │ │ │ ├── net_v11.h
│ │ │ │ ├── net_v6.h
│ │ │ │ ├── net_v7.h
│ │ │ │ ├── net_v8.h
│ │ │ │ └── net_v9.h
│ │ │ ├── plugin.h
│ │ │ ├── profiler/
│ │ │ │ ├── net_ib.h
│ │ │ │ ├── net_ib_v1.h
│ │ │ │ ├── net_socket.h
│ │ │ │ ├── net_socket_v1.h
│ │ │ │ ├── profiler_v1.h
│ │ │ │ ├── profiler_v2.h
│ │ │ │ ├── profiler_v3.h
│ │ │ │ ├── profiler_v4.h
│ │ │ │ └── profiler_v5.h
│ │ │ └── tuner/
│ │ │ ├── tuner_v2.h
│ │ │ ├── tuner_v3.h
│ │ │ ├── tuner_v4.h
│ │ │ └── tuner_v5.h
│ │ ├── profiler.h
│ │ ├── proxy.h
│ │ ├── proxy_trace/
│ │ │ └── proxy_trace.h
│ │ ├── ras.h
│ │ ├── rccl_common.h
│ │ ├── rccl_float8.h
│ │ ├── rccl_vars.h
│ │ ├── recorder.h
│ │ ├── register.h
│ │ ├── register_inline.h
│ │ ├── rocm_smi_wrap.h
│ │ ├── rocmwrap.h
│ │ ├── roctx.h
│ │ ├── scheduler.h
│ │ ├── shm.h
│ │ ├── shmutils.h
│ │ ├── signals.h
│ │ ├── socket.h
│ │ ├── strongstream.h
│ │ ├── sym_kernels.h
│ │ ├── timer.h
│ │ ├── transport.h
│ │ ├── trees.h
│ │ ├── tuner.h
│ │ └── utils.h
│ ├── init.cc
│ ├── init_nvtx.cc
│ ├── misc/
│ │ ├── CMakeLists.txt
│ │ ├── alt_rsmi.cc
│ │ ├── amdsmi_wrap.cc
│ │ ├── api_trace.c
│ │ ├── api_trace.cc
│ │ ├── archinfo.cc
│ │ ├── argcheck.cc
│ │ ├── cudawrap.cc
│ │ ├── gdrwrap.cc
│ │ ├── ibvsymbols.cc
│ │ ├── ibvwrap.cc
│ │ ├── ionicdvsymbols.cc
│ │ ├── ionicdvwrap.cc
│ │ ├── ipcsocket.cc
│ │ ├── latency_profiler/
│ │ │ ├── CollTrace.cc
│ │ │ ├── CollTraceEvent.cc
│ │ │ ├── CollTraceFunc.cc
│ │ │ ├── CollTraceUtils.cc
│ │ │ └── MIT-LICENSE.txt
│ │ ├── mlx5dvsymbols.cc
│ │ ├── mlx5dvwrap.cc
│ │ ├── msccl/
│ │ │ ├── msccl_lifecycle.cc
│ │ │ ├── msccl_parser.cc
│ │ │ ├── msccl_setup.cc
│ │ │ └── msccl_status.cc
│ │ ├── mscclpp/
│ │ │ ├── mscclpp_nccl.cc
│ │ │ └── mscclpp_nccl_syms.txt
│ │ ├── npkit.cc
│ │ ├── nvmlwrap.cc
│ │ ├── nvmlwrap_stub.cc
│ │ ├── param.cc
│ │ ├── proxy_trace/
│ │ │ └── proxy_trace.cc
│ │ ├── recorder.cc
│ │ ├── rocm_smi_wrap.cc
│ │ ├── rocmwrap.cc
│ │ ├── roctx.cc
│ │ ├── shmutils.cc
│ │ ├── signals.cc
│ │ ├── socket.cc
│ │ ├── strongstream.cc
│ │ └── utils.cc
│ ├── mnnvl.cc
│ ├── msccl.cc
│ ├── nccl.h.in
│ ├── nccl.pc.in
│ ├── nccl_device/
│ │ ├── CMakeLists.txt
│ │ ├── core.cc
│ │ ├── ll_a2a.cc
│ │ └── mem_barrier.cc
│ ├── plugin/
│ │ ├── CMakeLists.txt
│ │ ├── net/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── net_v10.cc
│ │ │ ├── net_v11.cc
│ │ │ ├── net_v6.cc
│ │ │ ├── net_v7.cc
│ │ │ ├── net_v8.cc
│ │ │ └── net_v9.cc
│ │ ├── net.cc
│ │ ├── plugin_open.cc
│ │ ├── profiler/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── profiler_v1.cc
│ │ │ ├── profiler_v2.cc
│ │ │ ├── profiler_v3.cc
│ │ │ ├── profiler_v4.cc
│ │ │ └── profiler_v5.cc
│ │ ├── profiler.cc
│ │ ├── tuner/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── tuner_v2.cc
│ │ │ ├── tuner_v3.cc
│ │ │ ├── tuner_v4.cc
│ │ │ └── tuner_v5.cc
│ │ └── tuner.cc
│ ├── proxy.cc
│ ├── ras/
│ │ ├── CMakeLists.txt
│ │ ├── client.cc
│ │ ├── client_support.cc
│ │ ├── collectives.cc
│ │ ├── peers.cc
│ │ ├── ras.cc
│ │ ├── ras_internal.h
│ │ └── rasnet.cc
│ ├── rccl_wrap.cc
│ ├── register/
│ │ ├── CMakeLists.txt
│ │ ├── coll_reg.cc
│ │ ├── register.cc
│ │ └── sendrecv_reg.cc
│ ├── scheduler/
│ │ ├── CMakeLists.txt
│ │ └── symmetric_sched.cc
│ ├── sym_kernels.cc
│ ├── transport/
│ │ ├── CMakeLists.txt
│ │ ├── coll_net.cc
│ │ ├── generic.cc
│ │ ├── net.cc
│ │ ├── net_ib.cc
│ │ ├── net_socket.cc
│ │ ├── nvls.cc
│ │ ├── p2p.cc
│ │ ├── profiler.cc
│ │ └── shm.cc
│ └── transport.cc
├── test/
│ ├── AllGatherTests.cpp
│ ├── AllReduceTests.cpp
│ ├── AllToAllTests.cpp
│ ├── AllToAllVTests.cpp
│ ├── AllocTests.cpp
│ ├── AltRsmiTests.cpp
│ ├── ArgCheckTests.cpp
│ ├── BitOpsTests.cpp
│ ├── BroadcastTests.cpp
│ ├── CMakeLists.txt
│ ├── CommTests.cpp
│ ├── EnqueueTests.cpp
│ ├── GatherTests.cpp
│ ├── GroupCallTests.cpp
│ ├── IpcsocketTests.cpp
│ ├── NetSocketTests.cpp
│ ├── NonBlockingTests.cpp
│ ├── ParamTests.cpp
│ ├── ParamTestsConfFile.txt
│ ├── ProxyTests.cpp
│ ├── README.md
│ ├── RcclWrapTests.cpp
│ ├── ReduceScatterTests.cpp
│ ├── ReduceTests.cpp
│ ├── RegisterTests.cpp
│ ├── ScatterTests.cpp
│ ├── SendRecvTests.cpp
│ ├── StandaloneTests.cpp
│ ├── TransportTests.cpp
│ ├── _RecorderTests.cpp
│ ├── common/
│ │ ├── CallCollectiveForked.cpp
│ │ ├── CallCollectiveForked.hpp
│ │ ├── CollectiveArgs.cpp
│ │ ├── CollectiveArgs.hpp
│ │ ├── DeviceBufferHelpers.hpp
│ │ ├── EnvVars.cpp
│ │ ├── EnvVars.hpp
│ │ ├── ErrCode.hpp
│ │ ├── MPIEnvironment.cpp
│ │ ├── MPIEnvironment.hpp
│ │ ├── MPIHelpers.cpp
│ │ ├── MPIHelpers.hpp
│ │ ├── MPIStandaloneTest.hpp
│ │ ├── MPITestBase.hpp
│ │ ├── MPITestCore.cpp
│ │ ├── MPITestCore.hpp
│ │ ├── MPITestRunner.md
│ │ ├── PrepDataFuncs.cpp
│ │ ├── PrepDataFuncs.hpp
│ │ ├── ProcessIsolatedTestRunner.cpp
│ │ ├── ProcessIsolatedTestRunner.hpp
│ │ ├── ProcessIsolatedTestRunner.md
│ │ ├── PtrUnion.cpp
│ │ ├── PtrUnion.hpp
│ │ ├── RcclMockFuncs.hpp
│ │ ├── ResourceGuards.hpp
│ │ ├── StandaloneUtils.cpp
│ │ ├── StandaloneUtils.hpp
│ │ ├── TestBed.cpp
│ │ ├── TestBed.hpp
│ │ ├── TestBedChild.cpp
│ │ ├── TestBedChild.hpp
│ │ ├── TestChecks.cpp
│ │ ├── TestChecks.hpp
│ │ ├── TransportUtils.hpp
│ │ ├── main.cpp
│ │ ├── main_fixtures.cpp
│ │ └── main_mpi.cpp
│ ├── ext-plugins/
│ │ ├── .gitignore
│ │ ├── README.md
│ │ ├── assets/
│ │ │ └── csv_confs/
│ │ │ ├── incorrect_values_config.conf
│ │ │ ├── multinode_config.conf
│ │ │ ├── no_matching_config.conf
│ │ │ ├── singlenode_config.conf
│ │ │ ├── unsupported_algo_proto_config.conf
│ │ │ ├── valid_config_with_wildcards.conf
│ │ │ └── valid_config_without_wildcards.conf
│ │ ├── pytest.ini
│ │ ├── requirements.txt
│ │ └── tests/
│ │ ├── conftest.py
│ │ ├── ext-profiler/
│ │ │ ├── test_allgather.py
│ │ │ ├── test_allreduce.py
│ │ │ ├── test_alltoall.py
│ │ │ ├── test_broadcast.py
│ │ │ ├── test_reduce.py
│ │ │ ├── test_reducescatter.py
│ │ │ └── test_sendrecv.py
│ │ └── ext-tuner/
│ │ ├── test_allgather.py
│ │ ├── test_allreduce.py
│ │ ├── test_broadcast.py
│ │ ├── test_reduce.py
│ │ └── test_reducescatter.py
│ ├── graph/
│ │ └── XmlTests.cpp
│ ├── latency_profiler/
│ │ └── LatencyProfilerUnitTest.cpp
│ ├── proxy_trace/
│ │ └── ProxyTraceUnitTests.cpp
│ └── transport/
│ ├── NetIbMPITests.cpp
│ ├── NetMPITests.cpp
│ ├── P2pMPITests.cpp
│ ├── ShmMPITests.cpp
│ ├── TransportMPIBase.cpp
│ └── TransportMPIBase.hpp
├── toolchain-linux.cmake
└── tools/
├── EmptyKernelTest/
│ ├── EmptyKernelTest.cpp
│ ├── Makefile
│ └── run.sh
├── GraphBench/
│ ├── GraphBench.cpp
│ └── Makefile
├── HelloRccl/
│ ├── HelloRccl.cpp
│ ├── HelloRccl.hpp
│ ├── Makefile
│ └── runTest.sh
├── JitterBench/
│ ├── Common.hpp
│ ├── Compatibility.hpp
│ ├── GetClosestNumaNode.hpp
│ ├── JitterBench.cpp
│ ├── Makefile
│ ├── Timeline.hpp
│ └── runSweep.sh
├── RcclReplayer/
│ ├── Makefile
│ ├── README.md
│ ├── rcclReplayer.cpp
│ ├── rcclReplayer.hpp
│ └── replay_log_converter.py
├── TopoVisual/
│ ├── README.md
│ ├── extract_topo.awk
│ └── topo_visual.sh
├── TransferBench/
│ └── README.md
├── ib-test/
│ ├── Makefile
│ ├── ib_test.cpp
│ ├── include/
│ │ └── nccl.h
│ └── utils.cpp
├── msccl-algorithms/
│ ├── allgather_16n_direct_0_3m_ll128.xml
│ ├── allgather_16n_direct_0_3m_ll128_op.xml
│ ├── allgather_32n_direct_0_6m_ll128.xml
│ ├── allgather_32n_direct_0_6m_ll128_op.xml
│ ├── allreduce-allpairs-8n-ll-32tb-op.xml
│ ├── allreduce-allpairs-8n-ll-32tb.xml
│ ├── allreduce-allpairs-8n-ll-64tb-op.xml
│ ├── allreduce-allpairs-8n-ll-64tb.xml
│ ├── allreduce-allpairs-8n-simple-op.xml
│ ├── allreduce-allpairs-8n-simple.xml
│ ├── alltoall-8n-0-9kb.xml
│ ├── alltoall-8n-190kb-512kb.xml
│ ├── alltoall-8n-512kb-7mb.xml
│ ├── alltoall-8n-7mb-43mb.xml
│ └── alltoall-8n-9kb-190kb.xml
├── msccl-unit-test-algorithms/
│ ├── all-reduce-ring-ll.xml
│ ├── all-reduce-ring-ll128.xml
│ └── all-reduce-ring-simple.xml
├── p2p-latency-test/
│ ├── Makefile
│ ├── README.md
│ ├── build_and_run.sh
│ ├── ll_latency_test.cpp
│ ├── ll_latency_test.cu
│ └── p2p_latency_test.cpp
├── rccl-prim-test/
│ ├── Makefile
│ ├── copy_kernel.h
│ └── rccl_prim_test.cpp
├── scripts/
│ ├── exclude_static_list.txt
│ ├── npkit_trace_analysis.py
│ ├── npkit_trace_generator.py
│ ├── pytorch-all-reduce/
│ │ ├── README.md
│ │ ├── all_reduce.py
│ │ └── trace_runs.sh
│ ├── pytorch-log-parser.py
│ ├── rcclDiagnostics.py
│ ├── rccl_bw_test.py
│ ├── rocprof-log-parser.py
│ ├── test_runner/
│ │ ├── README.md
│ │ ├── configs/
│ │ │ ├── mi300x_mellanox_ib.json
│ │ │ ├── rccl_perf_tests.json
│ │ │ └── test_config_sample.json
│ │ ├── lib/
│ │ │ ├── __init__.py
│ │ │ ├── test_config.py
│ │ │ ├── test_executor.py
│ │ │ └── test_parser.py
│ │ └── test_runner.py
│ ├── topo_val.sh
│ └── ucx_ompi_rccl_rccltests_TB_script.sh
├── time-trace/
│ ├── rccl-TimeTrace.sh
│ └── time_trace_generator.py
└── topo_expl/
├── Makefile
├── README.md
├── include/
│ ├── device_table.h
│ ├── model.h
│ ├── nccl.h
│ └── utils.h
├── model.cpp
├── models/
│ ├── topo_16p1h.xml
│ ├── topo_16p1h_vm.xml
│ ├── topo_16p_gio-1s-1rp-cascade.xml
│ ├── topo_16p_gio-3s-1rp-split-flat.xml
│ ├── topo_3p_pcie.xml
│ ├── topo_3p_pcie_1.xml
│ ├── topo_4p1h.xml
│ ├── topo_4p1h_1.xml
│ ├── topo_4p2h.xml
│ ├── topo_4p2h_1.xml
│ ├── topo_4p2h_2nic.xml
│ ├── topo_4p3l.xml
│ ├── topo_4p3l_2h.xml
│ ├── topo_4p3l_ia.xml
│ ├── topo_4p3l_n2.xml
│ ├── topo_4p3l_n2_1.xml
│ ├── topo_4p3l_n4.xml
│ ├── topo_4p4h.xml
│ ├── topo_4p_942.xml
│ ├── topo_8p1h.xml
│ ├── topo_8p1h_1.xml
│ ├── topo_8p1h_2.xml
│ ├── topo_8p1h_3.xml
│ ├── topo_8p1h_4.xml
│ ├── topo_8p1h_5.xml
│ ├── topo_8p1h_n1.xml
│ ├── topo_8p6l.xml
│ ├── topo_8p6l_1nic.xml
│ ├── topo_8p6l_2nic.xml
│ ├── topo_8p6l_3nic.xml
│ ├── topo_8p6l_4nic.xml
│ ├── topo_8p6l_5nic.xml
│ ├── topo_8p6l_6nic.xml
│ ├── topo_8p_4nics.xml
│ ├── topo_8p_90a.xml
│ ├── topo_8p_90a_1.xml
│ ├── topo_8p_942.xml
│ ├── topo_8p_942vm.xml
│ ├── topo_8p_950.xml
│ ├── topo_8p_pcie.xml
│ ├── topo_8p_pcie_1.xml
│ ├── topo_8p_pcie_2nic.xml
│ ├── topo_8p_rome.xml
│ ├── topo_8p_rome_4n_1.xml
│ ├── topo_8p_rome_4n_2.xml
│ ├── topo_8p_rome_4nics.xml
│ ├── topo_8p_rome_n2.xml
│ ├── topo_8p_rome_n2_1.xml
│ ├── topo_8p_rome_n2_2.xml
│ ├── topo_8p_rome_n4.xml
│ ├── topo_8p_rome_n4_1.xml
│ ├── topo_8p_rome_pcie.xml
│ ├── topo_8p_rome_vm1.xml
│ ├── topo_8p_ts1.xml
│ ├── topo_8p_ts1_1.xml
│ ├── topo_8p_ts1_n4.xml
│ ├── topo_8p_ts1_n4_1.xml
│ ├── topo_8p_ts1_n4_2.xml
│ ├── topo_collnet_n1.xml
│ └── topo_collnet_n4.xml
├── topo_expl.cpp
└── utils.cpp
Showing preview only (508K chars total). Download the full file or copy to clipboard to get everything.
SYMBOL INDEX (5939 symbols across 443 files)
FILE: .azuredevops/tests/pytest/HelloWorld.py
function test_HelloWorld (line 3) | def test_HelloWorld():
FILE: .github/scripts/therock_configure_ci.py
function gha_set_output (line 12) | def gha_set_output(vars: Mapping[str, str | Path]):
function get_modified_paths (line 31) | def get_modified_paths(base_ref: str) -> Optional[Iterable[str]]:
function is_path_workflow_file_related_to_ci (line 54) | def is_path_workflow_file_related_to_ci(path: str) -> bool:
function check_for_workflow_file_related_to_ci (line 60) | def check_for_workflow_file_related_to_ci(paths: Optional[Iterable[str]]...
function is_path_skippable (line 80) | def is_path_skippable(path: str) -> bool:
function check_for_non_skippable_path (line 84) | def check_for_non_skippable_path(paths: Optional[Iterable[str]]) -> bool:
function should_ci_run_given_modified_paths (line 90) | def should_ci_run_given_modified_paths(paths: Optional[Iterable[str]]) -...
function main (line 121) | def main(args):
FILE: ext-net/example/nccl/common.h
type ncclDebugLogLevel (line 12) | typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_ERROR=1, NCCL_LOG_VERSION=2, NCC...
type ncclDebugLogSubSys (line 13) | typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET...
type ncclResult_t (line 19) | typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type,...
FILE: ext-net/example/nccl/err.h
type ncclResult_t (line 9) | typedef enum { ncclSuccess = 0,
FILE: ext-net/example/nccl/net.h
type ncclNet_v11_t (line 38) | typedef ncclNet_v11_t ncclNet_t;
type ncclNetProperties_v11_t (line 39) | typedef ncclNetProperties_v11_t ncclNetProperties_t;
type ncclNetVDeviceProps_v11_t (line 40) | typedef ncclNetVDeviceProps_v11_t ncclNetVDeviceProps_t;
type ncclNetCommConfig_v11_t (line 41) | typedef ncclNetCommConfig_v11_t ncclNetCommConfig_t;
FILE: ext-net/example/nccl/net_device.h
type ncclNetDeviceType (line 17) | typedef enum {NCCL_NET_DEVICE_HOST=0, NCCL_NET_DEVICE_UNPACK=1} ncclNetD...
type ncclNetDeviceHandle_v7_t (line 19) | typedef struct {
type ncclNetDeviceHandle_v7_t (line 27) | typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t;
type ncclNetDeviceHandle_v8_t (line 28) | typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t;
type ncclNetDeviceHandle_v9_t (line 29) | typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_v10_t;
type ncclNetDeviceHandle_v10_t (line 30) | typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_v11_t;
type ncclNetDeviceHandle_v11_t (line 31) | typedef ncclNetDeviceHandle_v11_t ncclNetDeviceHandle_t;
FILE: ext-net/example/nccl/net_v10.h
type ncclNetVDeviceProps_v10_t (line 8) | typedef struct {
type ncclNetCommConfig_v10_t (line 15) | typedef struct {
type ncclNetProperties_v10_t (line 21) | typedef struct {
type ncclNet_v10_t (line 41) | typedef struct {
FILE: ext-net/example/nccl/net_v11.h
type ncclNetVDeviceProps_v11_t (line 8) | typedef struct {
type ncclNetCommConfig_v11_t (line 15) | typedef struct {
type ncclNetProperties_v11_t (line 21) | typedef struct {
type ncclNetCommAttr_v11_t (line 42) | typedef struct {
type ncclNetAttr_v11_t (line 49) | typedef struct {
type ncclNet_v11_t (line 57) | typedef struct {
FILE: ext-net/example/nccl/net_v2.h
type ncclNet_v2_t (line 8) | typedef struct {
FILE: ext-net/example/nccl/net_v3.h
type ncclNetProperties_v4_t (line 10) | typedef ncclNetProperties_v4_t ncclNetProperties_v3_t;
type ncclNet_v3_t (line 11) | typedef struct {
FILE: ext-net/example/nccl/net_v4.h
type ncclNetProperties_v4_t (line 10) | typedef struct {
type ncclNet_v4_t (line 22) | typedef struct {
FILE: ext-net/example/nccl/net_v5.h
type ncclNetProperties_v6_t (line 8) | typedef ncclNetProperties_v6_t ncclNetProperties_v5_t;
type ncclNet_v5_t (line 9) | typedef struct {
FILE: ext-net/example/nccl/net_v6.h
type ncclNetProperties_v6_t (line 8) | typedef struct {
type ncclNet_v6_t (line 21) | typedef struct {
FILE: ext-net/example/nccl/net_v7.h
type ncclNetProperties_v7_t (line 8) | typedef struct {
type ncclNet_v7_t (line 23) | typedef struct {
FILE: ext-net/example/nccl/net_v8.h
type ncclNetProperties_v8_t (line 8) | typedef struct {
type ncclNet_v8_t (line 24) | typedef struct {
FILE: ext-net/example/nccl/net_v9.h
type ncclNetVDeviceProps_v9_t (line 8) | typedef struct {
type ncclNetProperties_v9_t (line 13) | typedef struct {
type ncclNet_v9_t (line 33) | typedef struct {
FILE: ext-net/example/nccl/types.h
type ncclDataType_t (line 9) | typedef enum { ncclInt8 = 0, ncclChar = 0,
FILE: ext-net/example/plugin.c
function __hidden (line 14) | __hidden ncclResult_t pluginInit(void** ctx, uint64_t commId, ncclNetCom...
function __hidden (line 15) | __hidden ncclResult_t pluginDevices(int* ndev) { *ndev = 0; return ncclS...
function __hidden (line 16) | __hidden ncclResult_t pluginPciPath(int dev, char** path) { return ncclI...
function __hidden (line 17) | __hidden ncclResult_t pluginPtrSupport(int dev, int* supportedTypes) { r...
function __hidden (line 18) | __hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_t* ...
function __hidden (line 54) | __hidden ncclResult_t pluginListen(void* ctx, int dev, void* handle, voi...
function __hidden (line 55) | __hidden ncclResult_t pluginConnect(void* ctx, int dev, void* handle, vo...
function __hidden (line 56) | __hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm, nc...
function __hidden (line 57) | __hidden ncclResult_t pluginRegMr(void* collComm, void* data, size_t siz...
function __hidden (line 58) | __hidden ncclResult_t pluginRegMrDmaBuf(void* collComm, void* data, size...
function __hidden (line 59) | __hidden ncclResult_t pluginDeregMr(void* collComm, void* mhandle) { ret...
function __hidden (line 60) | __hidden ncclResult_t pluginIsend(void* sendComm, void* data, size_t siz...
function __hidden (line 61) | __hidden ncclResult_t pluginIrecv(void* recvComm, int n, void** data, si...
function __hidden (line 62) | __hidden ncclResult_t pluginIflush(void* recvComm, int n, void** data, i...
function __hidden (line 63) | __hidden ncclResult_t pluginTest(void* request, int* done, int* size) { ...
function __hidden (line 64) | __hidden ncclResult_t pluginCloseSend(void* sendComm) { return ncclInter...
function __hidden (line 65) | __hidden ncclResult_t pluginCloseRecv(void* recvComm) { return ncclInter...
function __hidden (line 66) | __hidden ncclResult_t pluginCloseListen(void* listenComm) { return ncclI...
function __hidden (line 67) | __hidden ncclResult_t pluginIrecvConsumed(void* recvComm, int n, void* r...
function __hidden (line 68) | __hidden ncclResult_t pluginGetDeviceMr(void* comm, void* mhandle, void*...
function __hidden (line 69) | __hidden ncclResult_t pluginMakeVDevice(int* d, ncclNetVDeviceProps_t* p...
function __hidden (line 70) | __hidden ncclResult_t pluginFinalize(void* ctx) { return ncclSuccess; }
function __hidden (line 98) | __hidden ncclResult_t pluginInit_v10(ncclDebugLogger_t logFunction, nccl...
function __hidden (line 99) | __hidden ncclResult_t pluginGetProperties_v10(int dev, ncclNetProperties...
function __hidden (line 135) | __hidden ncclResult_t pluginListen_v10(int d, void* handle, void** liste...
function __hidden (line 136) | __hidden ncclResult_t pluginConnect_v10(int dev, ncclNetCommConfig_v10_t...
function __hidden (line 137) | __hidden ncclResult_t pluginMakeVDevice_v10(int* d, ncclNetVDeviceProps_...
function __hidden (line 163) | __hidden ncclResult_t pluginInit_v9(ncclDebugLogger_t logFunction) {
function __hidden (line 167) | __hidden ncclResult_t pluginGetProperties_v9(int dev, ncclNetProperties_...
function __hidden (line 171) | __hidden ncclResult_t pluginConnect_v9(int dev, void* handle, void** sen...
function __hidden (line 175) | __hidden ncclResult_t pluginIsend_v9(void* sendComm, void* data, size_t ...
function __hidden (line 179) | __hidden ncclResult_t pluginIrecv_v9(void* recvComm, int n, void** data,...
function __hidden (line 183) | __hidden ncclResult_t pluginMakeVDevice_v9(int* d, ncclNetVDeviceProps_v...
function __hidden (line 208) | __hidden ncclResult_t pluginGetProperties_v8(int dev, ncclNetProperties_...
function __hidden (line 227) | __hidden ncclResult_t pluginIsend_v8(void* sendComm, void* data, int siz...
function __hidden (line 231) | __hidden ncclResult_t pluginIrecv_v8(void* recvComm, int n, void** data,...
function __hidden (line 259) | __hidden ncclResult_t pluginGetProperties_v7(int dev, ncclNetProperties_...
function __hidden (line 277) | __hidden ncclResult_t pluginRegMr_v7(void* collComm, void* data, int siz...
function __hidden (line 303) | __hidden ncclResult_t pluginGetProperties_v6(int dev, ncclNetProperties_...
function __hidden (line 319) | __hidden ncclResult_t pluginConnect_v6(int dev, void* handle, void** sen...
function __hidden (line 320) | __hidden ncclResult_t pluginAccept_v6(void* listenComm, void** recvComm)...
function ncclResult_t (line 363) | static ncclResult_t pluginGetProperties_v4(int dev, ncclNetProperties_v4...
function ncclResult_t (line 376) | static ncclResult_t pluginIsend_v4(void *sendComm, void* data, int size,...
function ncclResult_t (line 379) | static ncclResult_t pluginIrecv_v4(void* recvComm, void* data, int size,...
function ncclResult_t (line 383) | static ncclResult_t pluginIflush_v4(void* recvComm, void* data, int size...
function ncclResult_t (line 386) | static ncclResult_t pluginConnect_v4(int dev, void* handle, void** sendC...
function ncclResult_t (line 394) | static ncclResult_t pluginAccept_v4(void* listenComm, void** recvComm) {
function ncclResult_t (line 422) | static ncclResult_t pluginFlush(void* recvComm, void* data, int size, vo...
function ncclResult_t (line 431) | static ncclResult_t pluginInit_v3(ncclDebugLogger_t logFunction) {
function ncclResult_t (line 436) | static ncclResult_t pluginListen_v3(int dev, void* handle, void** listen...
function ncclResult_t (line 442) | static ncclResult_t pluginConnect_v3(int dev, void* handle, void** sendC...
FILE: ext-profiler/example/event.h
type proxyOp (line 24) | struct proxyOp
type proxyStep (line 25) | struct proxyStep
type netPlugin (line 27) | struct netPlugin {
type kernelCh (line 51) | struct kernelCh {
type proxyStep (line 69) | struct proxyStep {
type proxyOp (line 82) | struct proxyOp {
type group (line 100) | struct group
type context (line 101) | struct context
type proxyCtrl (line 103) | struct proxyCtrl {
type taskEventBase (line 113) | struct taskEventBase {
type collective (line 124) | struct collective {
type p2p (line 141) | struct p2p {
type group (line 153) | struct group {
type collApi (line 165) | struct collApi {
type p2pApi (line 184) | struct p2pApi {
type kernelLaunch (line 202) | struct kernelLaunch {
type groupApi (line 212) | struct groupApi {
type context (line 230) | struct context {
function taskEventQueueEmpty (line 278) | int taskEventQueueEmpty(T *obj) {
function taskEventQueueEnqueue (line 283) | void taskEventQueueEnqueue(T* obj, struct taskEventBase* event) {
type taskEventBase (line 291) | struct taskEventBase
type taskEventBase (line 296) | struct taskEventBase
type taskEventBase (line 297) | struct taskEventBase
function resetTaskEvents (line 304) | void resetTaskEvents(T *obj, struct context* ctx) {
FILE: ext-profiler/example/nccl/common.h
type ncclDebugLogLevel (line 10) | typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_ERROR=1, NCCL_LOG_VERSION=2, NCC...
type ncclDebugLogSubSys (line 11) | typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET...
FILE: ext-profiler/example/nccl/err.h
type ncclResult_t (line 11) | typedef enum { ncclSuccess = 0,
FILE: ext-profiler/example/nccl/net_ib_v1.h
type ncclProfilerNetIbDescr_v1_t (line 21) | typedef struct {
FILE: ext-profiler/example/nccl/net_socket_v1.h
type ncclProfilerNetSockDescr_v1_t (line 21) | typedef struct {
FILE: ext-profiler/example/nccl/profiler.h
type ncclProfilerEventState_t (line 30) | typedef enum {
type ncclProfilerEventState_t (line 68) | typedef ncclProfilerEventState_t ncclProfilerEventState_v1_t;
type ncclProfilerEventState_t (line 69) | typedef ncclProfilerEventState_t ncclProfilerEventState_v2_t;
type ncclProfilerEventState_t (line 70) | typedef ncclProfilerEventState_t ncclProfilerEventState_v3_t;
type ncclProfilerEventState_t (line 71) | typedef ncclProfilerEventState_t ncclProfilerEventState_v4_t;
type ncclProfilerEventState_t (line 72) | typedef ncclProfilerEventState_t ncclProfilerEventState_v5_t;
type ncclProfiler_v5_t (line 81) | typedef ncclProfiler_v5_t ncclProfiler_t;
type ncclProfilerEventDescr_v5_t (line 82) | typedef ncclProfilerEventDescr_v5_t ncclProfilerEventDescr_t;
type ncclProfilerEventStateArgs_v5_t (line 83) | typedef ncclProfilerEventStateArgs_v5_t ncclProfilerEventStateArgs_t;
FILE: ext-profiler/example/nccl/profiler_net.h
type ncclProfilerNetType (line 14) | typedef enum {
FILE: ext-profiler/example/nccl/profiler_v1.h
type ncclProfilerEventDescr_v1_t (line 12) | typedef struct {
type ncclProfilerEventStateArgs_v1_t (line 62) | typedef union {
type ncclProfiler_v1_t (line 73) | typedef struct {
FILE: ext-profiler/example/nccl/profiler_v2.h
type ncclProfilerEventDescr_v2_t (line 12) | typedef struct {
type ncclProfilerEventStateArgs_v2_t (line 59) | typedef union {
type ncclProfiler_v2_t (line 70) | typedef struct {
FILE: ext-profiler/example/nccl/profiler_v3.h
type ncclProfilerEventDescr_v3_t (line 12) | typedef struct {
type ncclProfilerEventStateArgs_v3_t (line 67) | typedef union {
type ncclProfiler_v3_t (line 78) | typedef struct {
FILE: ext-profiler/example/nccl/profiler_v4.h
type ncclProfilerEventDescr_v4_t (line 10) | typedef struct {
type ncclProfilerEventStateArgs_v4_t (line 63) | typedef union {
type ncclProfiler_v4_t (line 81) | typedef struct {
FILE: ext-profiler/example/nccl/profiler_v5.h
type ncclProfilerEventDescr_v5_t (line 11) | typedef struct {
type ncclProfilerEventStateArgs_v5_t (line 92) | typedef union {
type ncclProfiler_v5_t (line 110) | typedef struct {
FILE: ext-profiler/example/nccl/types.h
type ncclDataType_t (line 9) | typedef enum { ncclInt8 = 0, ncclChar = 0,
FILE: ext-profiler/example/plugin.cc
type proxyOp (line 47) | struct proxyOp
function __hidden (line 52) | __hidden double gettime(void) {
function __hidden (line 62) | __hidden ncclResult_t exampleProfilerInit(void** context, uint64_t commI...
function __hidden (line 177) | __hidden ncclResult_t exampleProfilerFinalize(void* context) {
function __hidden (line 232) | __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eH...
function updateEvent (line 611) | void updateEvent(void* handle) {
function __hidden (line 703) | __hidden ncclResult_t exampleProfilerStopEvent(void* eHandle) {
function __hidden (line 745) | __hidden ncclResult_t exampleProfilerRecordEventState(void* eHandle, ncc...
function exampleProfilerStart (line 817) | __attribute__((visibility("default"))) int exampleProfilerStart(int eAct...
function exampleProfilerStop (line 825) | __attribute__((visibility("default"))) int exampleProfilerStop(void) {
FILE: ext-profiler/example/print_event.cc
function __hidden (line 20) | __hidden void printGroupApiEventHeader(FILE* fh, struct groupApi* event) {
function __hidden (line 25) | __hidden void printGroupApiEventTrailer(FILE* fh, struct groupApi* event) {
function __hidden (line 31) | __hidden void printP2pApiEventHeader(FILE* fh, struct p2pApi* event) {
function __hidden (line 36) | __hidden void printP2pApiEventTrailer(FILE* fh, struct p2pApi* event) {
function __hidden (line 42) | __hidden void printCollApiEventHeader(FILE* fh, struct collApi* event) {
function __hidden (line 47) | __hidden void printCollApiEventTrailer(FILE* fh, struct collApi* event) {
function __hidden (line 53) | __hidden void printKernelLaunchEventHeader(FILE* fh, struct kernelLaunch...
function __hidden (line 57) | __hidden void printKernelLaunchEventTrailer(FILE* fh, struct kernelLaunc...
function __hidden (line 62) | __hidden void printGroupEventHeader(FILE* fh, struct group* event) {
function __hidden (line 67) | __hidden void printGroupEventTrailer(FILE* fh, struct group* event) {
function __hidden (line 73) | __hidden void printCollEventHeader(FILE* fh, struct collective* event) {
function __hidden (line 78) | __hidden void printCollEventTrailer(FILE* fh, struct collective* event) {
function __hidden (line 84) | __hidden void printP2pEventHeader(FILE* fh, struct p2p* event) {
function __hidden (line 89) | __hidden void printP2pEventTrailer(FILE* fh, struct p2p* event) {
function __hidden (line 95) | __hidden void printProxyOpEventHeader(FILE* fh, struct proxyOp* event) {
function __hidden (line 113) | __hidden void printProxyOpEventTrailer(FILE* fh, struct proxyOp* event) {
function __hidden (line 119) | __hidden void printProxyStepEventHeader(FILE* fh, struct proxyStep* even...
function __hidden (line 137) | __hidden void printProxyStepEventTrailer(FILE* fh, struct proxyStep* eve...
function __hidden (line 156) | __hidden void printKernelChEventHeader(FILE* fh, struct kernelCh* event) {
function __hidden (line 162) | __hidden void printKernelChEventTrailer(FILE* fh, struct kernelCh* event) {
function __hidden (line 169) | __hidden void printProxyCtrlEvent(FILE* fh, struct proxyCtrl* event) {
function __hidden (line 192) | __hidden void printNetPluginEvent(FILE* fh, struct netPlugin* event) {
function debugEvent (line 215) | void debugEvent(void* eHandle, const char* tag) {
function printEvent (line 286) | void printEvent(FILE* fh, void* handle) {
FILE: ext-profiler/inspector/exporter/example/perf_summary_exporter.py
function setup_logging (line 21) | def setup_logging(output_dir):
function smart_open (line 31) | def smart_open(filename, mode="r"):
function get_log_files_and_output_dir (line 41) | def get_log_files_and_output_dir():
function bytes_to_human_readable (line 93) | def bytes_to_human_readable(size_bytes):
function timestamp_to_datetime (line 118) | def timestamp_to_datetime(timestamp_us):
function microseconds_to_human_readable (line 122) | def microseconds_to_human_readable(microseconds):
function get_comm_type (line 131) | def get_comm_type(row) -> str:
function parse_file (line 141) | def parse_file(filepath: Path, output_dir):
function create_per_node_parquet_files (line 201) | def create_per_node_parquet_files(files, output_dir):
function generate_scatter_plot (line 216) | def generate_scatter_plot(df, comm_type, coll_type, output_file):
function generate_combined_scatter_plot (line 239) | def generate_combined_scatter_plot(df, comm_type, coll_type, output_file...
function generate_histogram (line 275) | def generate_histogram(df, comm_type, coll_type, output_file, message_si...
function generate_boxplot (line 300) | def generate_boxplot(df, comm_type, coll_type, output_file, message_size):
function summarize_data_per_comm_coll_type (line 351) | def summarize_data_per_comm_coll_type(output_root, comm_type, coll_type,...
function generate_visualizations (line 442) | def generate_visualizations(df, output_root, comm_type, coll_type):
function generate_summary (line 506) | def generate_summary(output_root, comm_type, coll_type, output_dir_name):
function generate_summary_wrapper (line 520) | def generate_summary_wrapper(args):
FILE: ext-profiler/inspector/inspector.cc
function inspectorGetTime (line 94) | uint64_t inspectorGetTime() {
function ncclDataType_t (line 124) | ncclDataType_t inspectorStringToDatatype(const char* str) {
function ncclFunc_t (line 159) | ncclFunc_t ncclStringToFunc(const char* str) {
type inspectorDumpThread (line 185) | struct inspectorDumpThread
method inspectorDumpThread (line 752) | inspectorDumpThread(const char* outputRoot, uint64_t sampleIntervalUsecs)
method startThread (line 773) | void startThread() {
method stopThread (line 785) | void stopThread() {
method inspectorResult_t (line 797) | inspectorResult_t inspectorStateDump(const char* output_root) {
type timespec (line 833) | struct timespec
function inspectorResult_t (line 190) | inspectorResult_t inspectorLockInit(pthread_rwlock_t* lockRef) {
function inspectorResult_t (line 198) | inspectorResult_t inspectorLockDestroy(pthread_rwlock_t* lockRef) {
function inspectorResult_t (line 206) | inspectorResult_t inspectorLockRd(pthread_rwlock_t* lockRef) {
function inspectorResult_t (line 214) | inspectorResult_t inspectorLockWr(pthread_rwlock_t* lockRef) {
function inspectorResult_t (line 222) | inspectorResult_t inspectorUnlockRWLock(pthread_rwlock_t* lockRef) {
type inspectorCommInfoList (line 255) | struct inspectorCommInfoList {
type inspectorCommInfo (line 256) | struct inspectorCommInfo
type inspectorState (line 261) | struct inspectorState {
type inspectorCommInfoList (line 262) | struct inspectorCommInfoList
type inspectorCommInfoList (line 263) | struct inspectorCommInfoList
function inspectorResult_t (line 269) | static inspectorResult_t inspectorCommInfoListInit(struct inspectorCommI...
function inspectorResult_t (line 279) | static inspectorResult_t inspectorGlobalStateInit() {
function inspectorResult_t (line 335) | static inspectorResult_t inspectorCommInfoHeader(jsonFileOutput* jfo,
function inspectorResult_t (line 364) | static inspectorResult_t inspectorCommInfoMetaHeader(jsonFileOutput* jfo) {
function inspectorResult_t (line 401) | static inline inspectorResult_t inspectorCompletedCollVerbose(jsonFileOu...
function inspectorResult_t (line 472) | static inline inspectorResult_t inspectorCompletedColl(jsonFileOutput* jfo,
function inspectorResult_t (line 521) | static inspectorResult_t inspectorCommInfoDump(jsonFileOutput* jfo,
function inspectorResult_t (line 584) | static inspectorResult_t inspectorCommInfoListDump(jsonFileOutput* jfo,
function inspectorResult_t (line 627) | static inspectorResult_t inspectorCommInfoListFinalize(struct inspectorC...
function ensureDir (line 663) | static bool ensureDir(char* workdir) {
function genDumpDir (line 718) | static void genDumpDir(char** workdir) {
type inspectorDumpThread (line 744) | struct inspectorDumpThread {
method inspectorDumpThread (line 752) | inspectorDumpThread(const char* outputRoot, uint64_t sampleIntervalUsecs)
method startThread (line 773) | void startThread() {
method stopThread (line 785) | void stopThread() {
method inspectorResult_t (line 797) | inspectorResult_t inspectorStateDump(const char* output_root) {
type timespec (line 833) | struct timespec
function showInspectorVersion (line 876) | static void showInspectorVersion() {
function showInspectorEnvVars (line 898) | static void showInspectorEnvVars() {
function inspectorResult_t (line 943) | inspectorResult_t inspectorGlobalInit(int rank) {
function inspectorResult_t (line 1091) | inspectorResult_t inspectorCommGetHashStr(uint64_t commHash,
function comm_eq (line 1117) | static bool comm_eq(uint64_t lCommHash, uint64_t rCommHash,
function inspectorResult_t (line 1149) | static inspectorResult_t inspectorFillCommInfo(struct inspectorCommInfo*...
function inspectorResult_t (line 1191) | inspectorResult_t inspectorAddComm(struct inspectorCommInfo **commInfo,
function inspectorResult_t (line 1264) | inspectorResult_t inspectorDelComm(struct inspectorCommInfo *commInfo) {
function inspectorComputeCollBw (line 1347) | void inspectorComputeCollBw(struct inspectorCommInfo *commInfo,
function calculateKernelGpuExecTimeUsecs (line 1406) | static uint64_t calculateKernelGpuExecTimeUsecs(struct inspectorKernelCh...
function calculateMaxKernelExecTimeUsecs (line 1441) | static uint64_t calculateMaxKernelExecTimeUsecs(struct inspectorCollInfo...
function inspectorUpdateCollPerf (line 1494) | void inspectorUpdateCollPerf(struct inspectorCompletedCollInfo *complete...
function inspectorResult_t (line 1523) | inspectorResult_t inspectorGlobalFinalize() {
FILE: ext-profiler/inspector/inspector.h
type ncclFunc_t (line 22) | typedef enum {
type inspectorResult_t (line 34) | typedef enum {
type inspectorTimingSource_t (line 53) | typedef enum {
type inspectorEventTraceInfo (line 59) | struct inspectorEventTraceInfo {
type inspectorEventTrkColl_t (line 64) | typedef enum {
type inspectorEventTrkKernel_t (line 70) | typedef enum {
type inspectorEventTrkKernelInfo (line 77) | struct inspectorEventTrkKernelInfo {
type inspectorEventTrkCollInfo (line 81) | struct inspectorEventTrkCollInfo {
type inspectorCompletedCollInfo (line 88) | struct inspectorCompletedCollInfo {
type inspectorCommInfo (line 104) | struct inspectorCommInfo {
type inspectorKernelChInfo (line 119) | struct inspectorKernelChInfo {
type inspectorCollInfo (line 130) | struct inspectorCollInfo {
function ncclTypeSize (line 154) | inline int ncclTypeSize(ncclDataType_t type) {
type inspectorCommInfo (line 187) | struct inspectorCommInfo
type inspectorCommInfo (line 190) | struct inspectorCommInfo
type inspectorCompletedCollInfo (line 192) | struct inspectorCompletedCollInfo
type inspectorCollInfo (line 193) | struct inspectorCollInfo
type inspectorCommInfo (line 196) | struct inspectorCommInfo
type inspectorCompletedCollInfo (line 197) | struct inspectorCompletedCollInfo
FILE: ext-profiler/inspector/inspector_plugin.cc
function inspectorRecordEventTrace (line 51) | static uint64_t inspectorRecordEventTrace(struct inspectorEventTraceInfo...
function __hidden (line 86) | __hidden ncclResult_t inspectorPluginInit(void** context, uint64_t commH...
function __hidden (line 141) | __hidden ncclResult_t inspectorPluginFinalize(void* context) {
function inspectorResult_t (line 151) | inspectorResult_t inspectorPluginCollInfoRef(struct inspectorCollInfo *c...
function inspectorResult_t (line 156) | inspectorResult_t inspectorPluginCollInfoRefSafe(struct inspectorCollInf...
function inspectorResult_t (line 163) | inspectorResult_t inspectorPluginCollInfoDeRef(struct inspectorCollInfo ...
function inspectorResult_t (line 174) | inspectorResult_t inspectorPluginCollInfoDeRefSafe(struct inspectorCollI...
function inspectorPluginCollInfoInit (line 202) | static void inspectorPluginCollInfoInit(struct inspectorCollInfo **collI...
function inspectorPluginKernelChInfoInit (line 258) | static void inspectorPluginKernelChInfoInit(struct inspectorKernelChInfo...
function __hidden (line 313) | __hidden ncclResult_t inspectorPluginStartEvent(void* context,
function __hidden (line 357) | __hidden ncclResult_t inspectorPluginStopEvent(void *eHandle) {
function __hidden (line 453) | __hidden ncclResult_t inspectorPluginRecordEventState(void* eHandle,
FILE: ext-profiler/inspector/json.cc
type jsonFileOutput (line 35) | struct jsonFileOutput {
function jsonResult_t (line 43) | jsonResult_t jsonInitFileOutput(jsonFileOutput** jfo, const char* outfil...
function jsonResult_t (line 66) | jsonResult_t jsonNewline(jsonFileOutput* jfo) {
function jsonResult_t (line 71) | jsonResult_t jsonFlushOutput(jsonFileOutput* jfo) {
function jsonResult_t (line 76) | jsonResult_t jsonLockOutput(jsonFileOutput* jfo) {
function jsonResult_t (line 83) | jsonResult_t jsonUnlockOutput(jsonFileOutput* jfo) {
function jsonResult_t (line 90) | jsonResult_t jsonFinalizeFileOutput(jsonFileOutput* jfo) {
function utf8copy (line 112) | static int utf8copy(unsigned char* out, int out_lim, const unsigned char...
function jsonResult_t (line 151) | static jsonResult_t sanitizeJson(unsigned char out[], int lim, const uns...
function max (line 202) | static size_t max(size_t a, size_t b) {
function jsonResult_t (line 211) | static jsonResult_t jsonPushState(jsonFileOutput* jfo, jsonState_t state) {
function jsonState_t (line 227) | static jsonState_t jsonCurrState(const jsonFileOutput* jfo) {
function jsonResult_t (line 235) | static jsonResult_t jsonReplaceState(jsonFileOutput* jfo, jsonState_t st...
function jsonState_t (line 247) | static jsonState_t jsonPopState(jsonFileOutput* jfo) {
function jsonResult_t (line 257) | jsonResult_t jsonKey(jsonFileOutput* jfo, const char* name) {
function jsonResult_t (line 281) | static jsonResult_t jsonValHelper(jsonFileOutput* jfo) {
function jsonResult_t (line 301) | jsonResult_t jsonStartObject(jsonFileOutput* jfo) {
function jsonResult_t (line 311) | jsonResult_t jsonFinishObject(jsonFileOutput* jfo) {
function jsonResult_t (line 324) | jsonResult_t jsonStartList(jsonFileOutput* jfo) {
function jsonResult_t (line 334) | jsonResult_t jsonFinishList(jsonFileOutput* jfo) {
function jsonResult_t (line 347) | jsonResult_t jsonNull(jsonFileOutput* jfo) {
function jsonResult_t (line 357) | jsonResult_t jsonStr(jsonFileOutput* jfo, const char* str) {
function jsonResult_t (line 376) | jsonResult_t jsonBool(jsonFileOutput* jfo, bool val) {
function jsonResult_t (line 381) | jsonResult_t jsonInt(jsonFileOutput* jfo, const int val) {
function jsonResult_t (line 391) | jsonResult_t jsonUint32(jsonFileOutput* jfo, const uint32_t val) {
function jsonResult_t (line 402) | jsonResult_t jsonUint64(jsonFileOutput* jfo, const uint64_t val) {
function jsonResult_t (line 412) | jsonResult_t jsonSize_t(jsonFileOutput* jfo, const size_t val) {
function jsonResult_t (line 422) | jsonResult_t jsonDouble(jsonFileOutput* jfo, const double val) {
function main (line 451) | int main() {
FILE: ext-profiler/inspector/json.h
type jsonState_t (line 7) | typedef enum {
type jsonResult_t (line 16) | typedef enum {
type jsonFileOutput (line 30) | typedef struct jsonFileOutput jsonFileOutput;
FILE: ext-profiler/inspector/nccl/common.h
type ncclDataType_t (line 14) | typedef enum { ncclInt8 = 0, ncclChar = 0,
type ncclDebugLogLevel (line 29) | typedef enum {
type ncclResult_t (line 38) | typedef enum { ncclSuccess = 0,
type ncclDebugLogSubSys (line 49) | typedef enum {
FILE: ext-profiler/inspector/nccl/profiler.h
type ncclProfilerEventState_t (line 30) | typedef enum {
type ncclProfilerEventState_t (line 68) | typedef ncclProfilerEventState_t ncclProfilerEventState_v1_t;
type ncclProfilerEventState_t (line 69) | typedef ncclProfilerEventState_t ncclProfilerEventState_v2_t;
type ncclProfilerEventState_t (line 70) | typedef ncclProfilerEventState_t ncclProfilerEventState_v3_t;
type ncclProfilerEventState_t (line 71) | typedef ncclProfilerEventState_t ncclProfilerEventState_v4_t;
type ncclProfilerEventState_t (line 72) | typedef ncclProfilerEventState_t ncclProfilerEventState_v5_t;
type ncclProfiler_v5_t (line 81) | typedef ncclProfiler_v5_t ncclProfiler_t;
type ncclProfilerEventDescr_v5_t (line 82) | typedef ncclProfilerEventDescr_v5_t ncclProfilerEventDescr_t;
type ncclProfilerEventStateArgs_v5_t (line 83) | typedef ncclProfilerEventStateArgs_v5_t ncclProfilerEventStateArgs_t;
FILE: ext-profiler/inspector/nccl/profiler_net.h
type ncclProfilerNetType (line 14) | typedef enum {
FILE: ext-profiler/inspector/nccl/profiler_v1.h
type ncclProfilerEventDescr_v1_t (line 15) | typedef struct {
type ncclProfilerEventStateArgs_v1_t (line 65) | typedef union {
type ncclProfiler_v1_t (line 76) | typedef struct {
FILE: ext-profiler/inspector/nccl/profiler_v2.h
type ncclProfilerEventDescr_v2_t (line 14) | typedef struct {
type ncclProfilerEventStateArgs_v2_t (line 61) | typedef union {
type ncclProfiler_v2_t (line 72) | typedef struct {
FILE: ext-profiler/inspector/nccl/profiler_v3.h
type ncclProfilerEventDescr_v3_t (line 14) | typedef struct {
type ncclProfilerEventStateArgs_v3_t (line 69) | typedef union {
type ncclProfiler_v3_t (line 80) | typedef struct {
FILE: ext-profiler/inspector/nccl/profiler_v4.h
type ncclProfilerEventDescr_v4_t (line 14) | typedef struct {
type ncclProfilerEventStateArgs_v4_t (line 67) | typedef union {
type ncclProfiler_v4_t (line 85) | typedef struct {
FILE: ext-profiler/inspector/nccl/profiler_v5.h
type ncclProfilerEventDescr_v5_t (line 10) | typedef struct {
type ncclProfilerEventStateArgs_v5_t (line 91) | typedef union {
type ncclProfiler_v5_t (line 109) | typedef struct {
FILE: ext-profiler/inspector/nccl/types.h
type ncclDataType_t (line 9) | typedef enum { ncclInt8 = 0, ncclChar = 0,
FILE: ext-src/check_ibv_access_relaxed_ordering.cc
function main (line 4) | int main(void) {
FILE: ext-tuner/basic/nccl/common.h
type ncclDebugLogLevel (line 10) | typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_ERROR=1, NCCL_LOG_VERSION=2, NCC...
type ncclDebugLogSubSys (line 11) | typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET...
FILE: ext-tuner/basic/nccl/err.h
type ncclResult_t (line 9) | typedef enum { ncclSuccess = 0,
FILE: ext-tuner/basic/nccl/tuner.h
type ncclFunc_t (line 18) | typedef enum {
type ncclTuner_v4_t (line 49) | typedef struct {
type ncclTuner_v4_t (line 93) | typedef ncclTuner_v4_t ncclTuner_t;
FILE: ext-tuner/basic/plugin.c
function __hidden (line 11) | __hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebug...
function __hidden (line 13) | __hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collTy...
function __hidden (line 25) | __hidden ncclResult_t pluginDestroy(void* context) { return ncclSuccess; }
FILE: ext-tuner/example/nccl/common.h
type ncclDebugLogLevel (line 10) | typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_ERROR=1, NCCL_LOG_VERSION=2, NCC...
type ncclDebugLogSubSys (line 11) | typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET...
FILE: ext-tuner/example/nccl/err.h
type ncclResult_t (line 9) | typedef enum { ncclSuccess = 0,
FILE: ext-tuner/example/nccl/tuner.h
type ncclFunc_t (line 18) | typedef enum {
type ncclNvlDomainInfo_v5_t (line 64) | typedef struct {
type ncclTunerConstants_v5_t (line 70) | typedef struct {
type ncclTuner_v5_t (line 83) | typedef struct {
type ncclTuner_v5_t (line 132) | typedef ncclTuner_v5_t ncclTuner_t;
type ncclNvlDomainInfo_v5_t (line 133) | typedef ncclNvlDomainInfo_v5_t ncclNvlDomainInfo_t;
type ncclTunerConstants_v5_t (line 134) | typedef ncclTunerConstants_v5_t ncclTunerConstants_t;
FILE: ext-tuner/example/plugin.c
type TuningConfig (line 34) | typedef struct {
type TunerContext (line 47) | typedef struct {
function ncclFunc_t (line 58) | static ncclFunc_t parseCollType(const char* str) {
function parseAlgorithm (line 80) | static int parseAlgorithm(const char* str) {
function parseProtocol (line 106) | static int parseProtocol(const char* str) {
function countConfigLines (line 124) | static int countConfigLines(const char* filename) {
function ncclResult_t (line 151) | static ncclResult_t loadConfig(TunerContext* ctx, const char* filename) {
function __hidden (line 293) | __hidden ncclResult_t pluginInit(void** context, uint64_t commId, size_t...
function __hidden (line 352) | __hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collTy...
function __hidden (line 463) | __hidden ncclResult_t pluginFinalize(void* context) {
FILE: ext-tuner/example/scripts/optimize_config.py
class PerformanceData (line 41) | class PerformanceData:
method __init__ (line 42) | def __init__(self, row: Dict[str, str]):
method get_config_key (line 57) | def get_config_key(self) -> Tuple:
method get_size_range_key (line 61) | def get_size_range_key(self, topology_size_ranges: Dict[Tuple[int, int...
class ConfigOptimizer (line 80) | class ConfigOptimizer:
method __init__ (line 81) | def __init__(self, optimization_metric: str = 'latency_us'):
method set_size_ranges (line 93) | def set_size_ranges(self, ranges: List[Tuple[int, int]]):
method auto_determine_size_ranges (line 98) | def auto_determine_size_ranges(self, data: List[PerformanceData]) -> D...
method load_data (line 161) | def load_data(self, csv_file: str) -> List[PerformanceData]:
method is_better (line 190) | def is_better(self, new_data: PerformanceData, current_best: Performan...
method optimize_configurations (line 200) | def optimize_configurations(self, data: List[PerformanceData]) -> List...
method combine_sequential_ranges (line 256) | def combine_sequential_ranges(self, configs: List[Dict]) -> List[Dict]:
method append_to_config_file (line 322) | def append_to_config_file(self, configs: List[str], config_file: str, ...
function main (line 372) | def main():
FILE: ext-tuner/example/test/test_plugin.c
function mock_logger (line 41) | void mock_logger(ncclDebugLogLevel level, unsigned long flags,
function create_test_config (line 88) | void create_test_config(const char* filename, const char* content) {
function test_plugin_init (line 97) | int test_plugin_init() {
function test_config_parsing_valid (line 111) | int test_config_parsing_valid() {
function test_config_parsing_invalid (line 137) | int test_config_parsing_invalid() {
function test_collective_matching (line 159) | int test_collective_matching() {
function test_size_matching (line 219) | int test_size_matching() {
function test_topology_matching (line 290) | int test_topology_matching() {
function test_default_channels (line 344) | int test_default_channels() {
function test_regbuff_matching (line 379) | int test_regbuff_matching() {
function test_pipeops_matching (line 447) | int test_pipeops_matching() {
function test_no_match_fallback (line 514) | int test_no_match_fallback() {
function test_large_config (line 553) | int test_large_config() {
function test_very_large_config_stress (line 663) | int test_very_large_config_stress() {
function test_empty_config (line 716) | int test_empty_config() {
function test_nvl_domain_info (line 762) | int test_nvl_domain_info() {
function test_tuner_constants (line 787) | int test_tuner_constants() {
type TestCase (line 886) | typedef struct {
function show_help (line 913) | void show_help(const char* program_name) {
function TestFunction (line 927) | TestFunction find_test(const char* name) {
function main (line 937) | int main(int argc, char* argv[]) {
FILE: ext-tuner/model_demo/nccl/common.h
type ncclDebugLogLevel (line 10) | typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_ERROR=1, NCCL_LOG_VERSION=2, NCC...
type ncclDebugLogSubSys (line 11) | typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET...
FILE: ext-tuner/model_demo/nccl/err.h
type ncclResult_t (line 9) | typedef enum { ncclSuccess = 0,
FILE: ext-tuner/model_demo/nccl/tuner.h
type ncclFunc_t (line 18) | typedef enum {
type ncclTuner_v4_t (line 49) | typedef struct {
type ncclTuner_v4_t (line 93) | typedef ncclTuner_v4_t ncclTuner_t;
FILE: ext-tuner/model_demo/plugin.c
function log2i (line 15) | static long log2i(long n) {
type tuningModel (line 27) | struct tuningModel {
type tuningModel (line 34) | struct tuningModel
function ncclResult_t (line 67) | ncclResult_t ncclTopoGetAlgoTime_Tuner(ncclFunc_t collType, int algorith...
function __hidden (line 89) | __hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebug...
function __hidden (line 198) | __hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collTy...
function __hidden (line 225) | __hidden ncclResult_t pluginDestroy(void* context) { return ncclSuccess; }
FILE: src/allocator.cc
function ncclResult_t (line 13) | ncclResult_t ncclMemAlloc_impl(void **ptr, size_t size) {
function ncclResult_t (line 102) | ncclResult_t ncclMemFree_impl(void *ptr) {
function ncclSpaceConstruct (line 143) | void ncclSpaceConstruct(struct ncclSpace* a) {
function ncclSpaceDestruct (line 147) | void ncclSpaceDestruct(struct ncclSpace* a) {
function insertSegment (line 151) | static void insertSegment(struct ncclSpace* a, int index, int64_t lo, in...
function ncclResult_t (line 197) | ncclResult_t ncclSpaceAlloc(
function ncclResult_t (line 224) | ncclResult_t ncclSpaceFree(struct ncclSpace* a, int64_t offset, int64_t ...
type ncclShadowPage (line 256) | struct ncclShadowPage { // A contiguous block of (at most) 64 objects
type ncclShadowPage (line 257) | struct ncclShadowPage
type ncclShadowObject (line 262) | struct ncclShadowObject {
type ncclShadowObject (line 263) | struct ncclShadowObject
type ncclShadowPage (line 266) | struct ncclShadowPage
function ncclShadowPoolConstruct (line 269) | void ncclShadowPoolConstruct(struct ncclShadowPool* pool) {
function ncclResult_t (line 276) | ncclResult_t ncclShadowPoolDestruct(struct ncclShadowPool* pool) {
function hashBucket (line 317) | static int hashBucket(int hbits, void* devObj) {
function hashInsert (line 324) | static void hashInsert(struct ncclShadowPool* pool, struct ncclShadowObj...
function ncclResult_t (line 330) | ncclResult_t ncclShadowPoolAlloc(
function ncclResult_t (line 420) | ncclResult_t ncclShadowPoolFree(struct ncclShadowPool* pool, void* devOb...
function ncclResult_t (line 450) | ncclResult_t ncclShadowPoolToHost(struct ncclShadowPool* pool, void* dev...
FILE: src/bootstrap.cc
function firstRankFromRoot (line 49) | static int firstRankFromRoot(int root, int n_ranks, int nRoots) {
function rootIdFromRank (line 54) | static int rootIdFromRank(int rank, int nRanks, int nRoots) {
function nRankFromRoot (line 64) | static int nRankFromRoot(int root, int nRanks, int nRoots) {
function localIdFromRoot (line 72) | static int localIdFromRoot(int rank, int root, int nRanks, int nRoots) {
function isFirstFromRoot (line 77) | static int isFirstFromRoot(int rank, int root, int nRanks, int nRoots) {
type bootstrapRootArgs (line 81) | struct bootstrapRootArgs {
type ncclSocket (line 82) | struct ncclSocket
function ncclResult_t (line 94) | ncclResult_t bootstrapNetInit() {
type bootstrapInterface_t (line 130) | enum bootstrapInterface_t { findSubnetIf = -1, dontCareIf = -2 }
function ncclResult_t (line 133) | static ncclResult_t checkAbort(volatile uint32_t* flag, int* cntr) {
function ncclResult_t (line 144) | static ncclResult_t netReg(ncclNet_t* net, void* comm, void* data, int s...
function ncclResult_t (line 148) | static ncclResult_t netDereg(ncclNet_t* net, void* comm, void** handle) {
function ncclResult_t (line 153) | static ncclResult_t netIsend(ncclNet_t* net, void* sendComm, void* data,...
function ncclResult_t (line 167) | static ncclResult_t netIrecv(ncclNet_t* net, void* recvComm, void* data,...
function ncclResult_t (line 182) | static ncclResult_t netSendRecv(ncclNet_t* net, void* sendComm, void* se...
function ncclResult_t (line 200) | static ncclResult_t socketSend(struct ncclSocket* sock, void* data, int ...
function ncclResult_t (line 206) | static ncclResult_t socketRecv(struct ncclSocket* sock, void* data, int ...
function ncclResult_t (line 218) | static ncclResult_t socketSendRecv(struct ncclSocket* sendSock, void* se...
type extInfo (line 235) | struct extInfo {
function ncclResult_t (line 248) | static ncclResult_t setFilesLimit() {
function ncclResult_t (line 256) | static ncclResult_t rootSend(union ncclSocketAddress* addr, uint64_t mag...
type bootstrapRootArgs (line 270) | struct bootstrapRootArgs
type ncclSocket (line 82) | struct ncclSocket
type bootstrapRootArgs (line 270) | struct bootstrapRootArgs
type ncclSocket (line 82) | struct ncclSocket
type ncclSocket (line 271) | struct ncclSocket
type extInfo (line 277) | struct extInfo
type ncclSocket (line 293) | struct ncclSocket
function ncclResult_t (line 375) | ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, boo...
function ncclResult_t (line 400) | ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle) {
type unexConn (line 420) | struct unexConn {
type ncclSocket (line 423) | struct ncclSocket
type unexConn (line 424) | struct unexConn
type bootstrapRing_t (line 427) | struct bootstrapRing_t {
type ncclSocket (line 434) | struct ncclSocket
type ncclSocket (line 435) | struct ncclSocket
type bootstrapListen_t (line 439) | struct bootstrapListen_t {
type ncclSocket (line 440) | struct ncclSocket
type ncclSocket (line 447) | struct ncclSocket
type bootstrapState (line 451) | struct bootstrapState {
type bootstrapRing_t (line 452) | struct bootstrapRing_t
type bootstrapListen_t (line 453) | struct bootstrapListen_t
type unexConn (line 458) | struct unexConn
function ncclResult_t (line 469) | static ncclResult_t createListenSocket(struct ncclComm* comm, uint64_t m...
function ncclResult_t (line 476) | static ncclResult_t getUDS(uint64_t* peerUDS) {
function ncclResult_t (line 483) | static ncclResult_t netGetDevice(int rank, struct ncclComm* comm, int* d...
function ncclResult_t (line 534) | static ncclResult_t netRingConnect(void* ctx, ncclNet_t* net, struct boo...
function ncclResult_t (line 548) | static ncclResult_t socketRingConnect(ncclSocketAddress* addr, struct nc...
function ncclResult_t (line 555) | static ncclResult_t ringAllInfo(struct ncclComm* comm, struct bootstrapS...
function ncclResult_t (line 600) | static ncclResult_t sendToRoot(struct ncclBootstrapHandle* handle, struc...
function ncclResult_t (line 618) | ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm*...
function ncclResult_t (line 780) | ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struc...
type socketAckInfo (line 861) | struct socketAckInfo {
function ncclResult_t (line 865) | static ncclResult_t socketConnect(void* commState, int peer, int tag, st...
function ncclResult_t (line 878) | ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* dat...
function ncclResult_t (line 892) | static ncclResult_t unexpectedEnqueue(struct bootstrapState* state, int ...
function ncclResult_t (line 910) | static ncclResult_t unexpectedDequeue(struct bootstrapState* state, int ...
function unexpectedFree (line 932) | static void unexpectedFree(struct bootstrapState* state) {
function ncclResult_t (line 945) | static ncclResult_t socketAccept(void* commState, int peer, int tag, str...
function ncclResult_t (line 969) | ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* dat...
function ncclResult_t (line 982) | static ncclResult_t netRingAllGather(ncclNet_t* net, void* sendComm, voi...
function ncclResult_t (line 1015) | static ncclResult_t socketRingAllGather(struct ncclSocket* sendSock, str...
function ncclResult_t (line 1040) | ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
function ncclResult_t (line 1062) | static ncclResult_t bootstrapP2PBarrier(void* commState, int* ranks, int...
function ncclResult_t (line 1080) | ncclResult_t bootstrapIntraNodeBarrier(void* commState, int* ranks, int ...
function ncclResult_t (line 1089) | ncclResult_t bootstrapBarrier(void* commState, int rank, int nranks, int...
function ncclResult_t (line 1098) | ncclResult_t bootstrapIntraNodeAllGather(void* commState, int* ranks, in...
function ncclResult_t (line 1119) | static ncclResult_t bootstrapP2PBroadcast(void* commState, int* ranks, i...
function ncclResult_t (line 1131) | ncclResult_t bootstrapIntraNodeBroadcast(void* commState, int* ranks, in...
function ncclResult_t (line 1139) | ncclResult_t bootstrapBroadcast(void* commState, int rank, int nranks, i...
function ncclResult_t (line 1148) | ncclResult_t bootstrapClose(void* commState) {
function ncclResult_t (line 1178) | ncclResult_t bootstrapAbort(void* commState) {
FILE: src/ce_coll.cc
function ncclResult_t (line 23) | ncclResult_t ncclCeInit(struct ncclComm* comm) {
function ncclResult_t (line 56) | ncclResult_t ncclCeFinalize(struct ncclComm* comm) {
function ncclCeImplemented (line 82) | bool ncclCeImplemented(ncclFunc_t coll, int/*ncclDevRedOp_t*/ red, ncclD...
function ncclResult_t (line 101) | ncclResult_t ncclPrepMCSync(struct ncclComm* comm, bool isComplete, hipS...
function ncclResult_t (line 146) | ncclResult_t ncclPrepUCSync(struct ncclComm* comm, bool isComplete,
function ncclResult_t (line 191) | ncclResult_t ncclMemOpSync(struct ncclComm* comm, cudaStream_t stream) {
function ncclResult_t (line 237) | ncclResult_t ncclCeInitBatchOpsParams(struct ncclCeBatchOpsParams* param...
function ncclCeFreeBatchOpsParams (line 264) | void ncclCeFreeBatchOpsParams(struct ncclCeBatchOpsParams* params) {
function ncclResult_t (line 274) | ncclResult_t ncclCeLaunchBatchOps(struct ncclComm* comm, struct ncclCeBa...
function ncclResult_t (line 373) | ncclResult_t ncclCeAllGather(struct ncclComm* comm, struct ncclCeCollArg...
function ncclResult_t (line 424) | ncclResult_t ncclCeAlltoAll(struct ncclComm* comm, struct ncclCeCollArgs...
function ncclResult_t (line 479) | ncclResult_t ncclCeScatter(struct ncclComm* comm, struct ncclCeCollArgs*...
function ncclResult_t (line 539) | ncclResult_t ncclCeGather(struct ncclComm* comm, struct ncclCeCollArgs* ...
function ncclResult_t (line 589) | ncclResult_t ncclLaunchCeColl(struct ncclComm* comm, struct ncclKernelPl...
FILE: src/channel.cc
function ncclResult_t (line 12) | ncclResult_t initChannel(struct ncclComm* comm, int channelId) {
function ncclResult_t (line 65) | ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struc...
function ncclResult_t (line 109) | ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, st...
function ncclResult_t (line 147) | ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks, int co...
FILE: src/collectives.cc
function ncclResult_t (line 91) | ncclResult_t ncclAllGather_impl(const void* sendbuff, void* recvbuff, si...
function ncclResult_t (line 155) | ncclResult_t ncclAlltoAll_impl(const void* sendbuff, void* recvbuff, siz...
function ncclResult_t (line 201) | ncclResult_t ncclAlltoAllv_impl(const void *sendbuff, const size_t sendc...
function ncclResult_t (line 246) | ncclResult_t ncclAllReduce_impl(const void* sendbuff, void* recvbuff, si...
function ncclResult_t (line 276) | ncclResult_t ncclAllReduceWithBias_impl(const void* sendbuff, void* recv...
function ncclResult_t (line 298) | ncclResult_t ncclBroadcast_impl(const void* sendbuff, void* recvbuff, si...
function ncclResult_t (line 323) | ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype...
function ncclResult_t (line 331) | ncclResult_t ncclGather_impl(const void* sendbuff, void* recvbuff, size_...
function ncclResult_t (line 355) | ncclResult_t ncclReduce_impl(const void* sendbuff, void* recvbuff, size_...
function ncclResult_t (line 380) | ncclResult_t ncclReduceScatter_impl(const void* sendbuff, void* recvbuff...
function ncclResult_t (line 405) | ncclResult_t ncclScatter_impl(const void* sendbuff, void* recvbuff, size...
function ncclResult_t (line 429) | ncclResult_t ncclSend_impl(const void* sendbuff, size_t count, ncclDataT...
function ncclResult_t (line 454) | ncclResult_t ncclRecv_impl(void* recvbuff, size_t count, ncclDataType_t ...
FILE: src/commDump.cc
function ncclResult_t (line 9) | __attribute__ ((visibility("default")))
FILE: src/debug.cc
function ncclDebugInit (line 40) | static void ncclDebugInit() {
function ncclDebugLog (line 275) | void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const ch...
function ncclResetDebugInit (line 399) | void ncclResetDebugInit() {
function ncclSetThreadName (line 410) | void ncclSetThreadName(pthread_t thread, const char *fmt, ...) {
FILE: src/dev_runtime.cc
type ncclDevrMemory (line 17) | struct ncclDevrMemory {
type ncclDevrMemory (line 19) | struct ncclDevrMemory
type ncclDevrWindowSorted (line 25) | struct ncclDevrWindowSorted {
type ncclDevrWindow (line 28) | struct ncclDevrWindow
type ncclDevrTeam (line 31) | struct ncclDevrTeam {
type ncclDevrTeam (line 32) | struct ncclDevrTeam
type ncclTeam (line 33) | struct ncclTeam
function ncclResult_t (line 54) | ncclResult_t ncclDevrInitOnce(struct ncclComm* comm) {
type ncclComm (line 96) | struct ncclComm
function ncclResult_t (line 98) | ncclResult_t ncclDevrFinalize(struct ncclComm* comm) {
function ncclResult_t (line 135) | static ncclResult_t symMemoryMapLsaTeam(
function ncclResult_t (line 195) | static ncclResult_t symBindTeamMemory(
function ncclResult_t (line 207) | static ncclResult_t symUnbindTeamMemory(
function ncclResult_t (line 219) | static ncclResult_t symTeamObtain(
function symTeamDestroyAll (line 323) | static void symTeamDestroyAll(struct ncclComm* comm) {
function ncclResult_t (line 344) | static ncclResult_t symMemoryObtain(
function symMemoryDropRef (line 398) | static void symMemoryDropRef(
function ncclResult_t (line 423) | static ncclResult_t symWindowTableInitOnce(struct ncclComm* comm, cudaSt...
function ncclResult_t (line 434) | static ncclResult_t symWindowCreate(
function ncclResult_t (line 506) | static ncclResult_t symWindowDestroy(struct ncclComm* comm, struct ncclW...
function ncclResult_t (line 547) | ncclResult_t ncclDevrWindowRegisterInGroup(
function ncclResult_t (line 620) | static ncclResult_t deepCopyDevCommRequirements(
function freeDevCommRequirements (line 659) | void freeDevCommRequirements(
function ncclResult_t (line 679) | ncclResult_t ncclDevrCommCreateInternal(
function ncclResult_t (line 794) | ncclResult_t ncclCommWindowRegister_impl(
function ncclResult_t (line 830) | ncclResult_t ncclCommWindowDeregister_impl(struct ncclComm* comm, struct...
function ncclResult_t (line 855) | ncclResult_t ncclDevrFindWindow(
function ncclResult_t (line 870) | ncclResult_t ncclDevCommCreate(
function ncclResult_t (line 910) | ncclResult_t ncclDevCommDestroy(
function ncclResult_t (line 922) | ncclResult_t ncclDevrGetLsaRankPtr(struct ncclComm* comm, struct ncclDev...
function ncclResult_t (line 945) | ncclResult_t ncclDevrGetLsaTeamPtrMC(struct ncclComm* comm, struct ncclD...
function listFindSortedLub (line 967) | static int listFindSortedLub(Key Obj::*key, Obj* sorted, int count, Key ...
function listInsert (line 980) | static void listInsert(Obj** list, int* capacity, int* count, int index,...
function listRemove (line 994) | static void listRemove(Obj* list, int* count, int index) {
FILE: src/device/all_gather.h
function runRing (line 15) | void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
function run (line 217) | void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
function run (line 224) | void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
function run (line 231) | void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
type ncclDevWorkColl (line 300) | struct ncclDevWorkColl
function __forceinline__ (line 305) | __forceinline__ void operator()(
function run (line 368) | void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) {
type ncclDevWorkColl (line 523) | struct ncclDevWorkColl
function __forceinline__ (line 528) | __forceinline__ void operator()(
function run (line 594) | void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) {
FILE: src/device/all_reduce.h
function runRing (line 19) | void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
function run (line 616) | void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) {
function run (line 752) | void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) {
function run (line 885) | void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) {
function run (line 993) | void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
function run (line 1119) | void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
function run (line 1126) | void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
function run (line 1133) | void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
function run (line 1140) | void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
FILE: src/device/alltoall_pivot.h
function runRing (line 14) | void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
FILE: src/device/broadcast.h
function runRing (line 14) | void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
function run (line 128) | void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
function run (line 135) | void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
FILE: src/device/common.h
type ncclShmemGroup (line 133) | struct ncclShmemGroup {
type ncclShmemData (line 149) | struct ncclShmemData {
function __device__ (line 197) | __device__ inline void insert_random_delay_per_warp() {
function __device__ (line 218) | __device__ inline void* ncclScratchForWarp(int warp) {
function __device__ (line 222) | __device__ inline void barrier_sync(int name) {
function __device__ (line 229) | __device__ inline void barrier_sync(int name, int nThreads) {
function __device__ (line 236) | __device__ inline void barrier_sync_aligned(int name) {
function __device__ (line 239) | __device__ inline void barrier_sync_aligned(int name, int nThreads) {
function __device__ (line 243) | __device__ inline bool barrier_red_or(bool vote, int name) {
function __device__ (line 252) | __device__ inline bool barrier_red_or(bool vote, int name, int nThreads) {
function __device__ (line 275) | inline __device__ void copyToShmem16(int tid, void* dst, void const* src...
function loadWorkBatchToShmem (line 287) | void loadWorkBatchToShmem(
function globaltimer (line 411) | unsigned long long int globaltimer() {
function __device__ (line 423) | __device__ void run(int tid, int tn, struct ncclDevWorkColl* work) {
function run (line 440) | void run() {
function profilerEnabled (line 482) | bool profilerEnabled(int workItemIdx) {
function profiler (line 488) | void profiler(int action) {
function __forceinline__ (line 511) | __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* args) {
FILE: src/device/common_kernel.h
function __device__ (line 22) | inline __device__ int min(int a, ssize_t b) { return (a < b) ? a : b; }
function __device__ (line 24) | inline __device__ int loadInt(int* ptr) {
function __forceinline__ (line 34) | __forceinline__ static void reduceCopyPacks(
function __forceinline__ (line 211) | __forceinline__ void loadSources(
function reduceAndStore (line 244) | void reduceAndStore(
function __forceinline__ (line 296) | __forceinline__ static void reduceCopyPacksPipelined(
function __forceinline__ (line 424) | __forceinline__ void reduceCopyPacksWithBias(
FILE: src/device/generate.py
function paste (line 74) | def paste(sep, *args):
class Fn (line 171) | class Fn:
method __iter__ (line 181) | def __iter__(self):
function calc_unroll_and_pipeline_for_local_arch (line 184) | def calc_unroll_and_pipeline_for_local_arch():
function func_validate (line 230) | def func_validate(coll, algo, proto, redop, ty, acc, pipeline, unroll):
function func_filter (line 247) | def func_filter(function_params, current_idx, item_list=None):
function parse_input (line 293) | def parse_input(func_pattern):
function equivalent_primary (line 310) | def equivalent_primary(coll, algo, proto, redop, ty, acc, pipeline, unro...
function enumerate_func_rows (line 326) | def enumerate_func_rows():
function custom_sort_key (line 339) | def custom_sort_key(fn: Fn):
function get_arch_guard (line 351) | def get_arch_guard(fn):
function impl_filename (line 526) | def impl_filename(coll, algo, proto, redop, ty, acc, pipeline, unroll):
function partition_by_name (line 531) | def partition_by_name(fns):
FILE: src/device/msccl_kernel_impl.h
function __device__ (line 29) | inline __device__ static void barrier(int nthreads) {
function __device__ (line 43) | inline __device__ static void copyToShmem8(int tid, void* dst, void cons...
function threadBlockCopy (line 53) | static void threadBlockCopy(
function __forceinline__ (line 68) | __forceinline__ static void mscclReduce(int c, int numReductions, int cu...
function __forceinline__ (line 95) | __forceinline__ void mscclRunInterpreter(
FILE: src/device/network/unpack/unpack.h
function __device__ (line 19) | inline __device__ void load64gpu(const uint64_t* ptr, uint64_t &v) {
function __device__ (line 34) | inline __device__ void ncclNetDeviceUnpackSetup(void* ohandle, const int...
function __device__ (line 43) | inline __device__ void ncclNetDeviceIncrementHead(const int group, const...
function __device__ (line 48) | inline __device__ void ncclNetDeviceSaveHead(void* ohandle, const int gr...
function bulkLoad (line 55) | void bulkLoad(const int t, const uint32_t len, char* cpy_src, char* cpy_...
function __device__ (line 153) | inline __device__ int ppw(const int nbytes, int nw) {
function __device__ (line 194) | inline __device__ void ncclNetDeviceUnpackInner(
FILE: src/device/network/unpack/unpack_defs.h
type netUnpackMeta (line 34) | struct netUnpackMeta {
type unpackNetDeviceHandle (line 39) | struct unpackNetDeviceHandle {
type unpackShmem (line 51) | struct unpackShmem {
type unpackGroupShmem (line 55) | struct unpackGroupShmem {
FILE: src/device/op128.h
function __device__ (line 14) | inline __device__ void load128(const uint64_t* ptr, uint64_t &v0, uint64...
function __device__ (line 19) | inline __device__ void store128(uint64_t* ptr, uint64_t v0, uint64_t v1) {
function __device__ (line 24) | inline __device__ uint64_t* shmemCvtPtr(volatile uint64_t* shmemGenericP...
function __device__ (line 33) | inline __device__ void storeShmem128(uint64_t* shmemAsmPtr, uint64_t v0,...
function cvta_to_shared (line 72) | uint32_t cvta_to_shared(T* ptr) {
function cvta_to_global (line 76) | uintptr_t cvta_to_global(T* ptr) {
function __device__ (line 181) | inline __device__ BytePack<Size>(const BytePack<Size>& other) {
type BytePackOf (line 198) | struct BytePackOf
function __forceinline__ (line 214) | __forceinline__ T fromPack(typename BytePackOf<T>::Pack pack) {
function store16global (line 301) | void store16global(uintptr_t addr, BytePack<16> value){
function store16global (line 306) | void store16global(uintptr_t addr, BytePack<16> value){
function ld_volatile_global (line 352) | uint64_t ld_volatile_global(uint64_t *ptr) {
function ld_relaxed_sys_global (line 357) | uint64_t ld_relaxed_sys_global(uint64_t *ptr) {
function ld_acquire_sys_global (line 373) | uint64_t ld_acquire_sys_global(uint64_t *ptr) {
function st_volatile_global (line 379) | void st_volatile_global(uint64_t *ptr, uint64_t val) {
function st_relaxed_sys_global (line 382) | void st_relaxed_sys_global(uint64_t *ptr, uint64_t val) {
function st_release_sys_global (line 385) | void st_release_sys_global(uint64_t *ptr, uint64_t val) {
function fence_acq_rel_sys (line 389) | void fence_acq_rel_sys() {
function fence_acq_rel_gpu (line 392) | void fence_acq_rel_gpu() {
function multimem_st_global (line 431) | void multimem_st_global(uintptr_t addr, BytePack<Size> val) {
function __forceinline__ (line 438) | __forceinline__ Pack loadPack(T* ptr, int ix, int end) {
function storePack (line 473) | void storePack(T* ptr, int ix, int end, Pack val) {
function __forceinline__ (line 490) | __forceinline__ void copyGlobalShared_WarpUnrolled(
function __forceinline__ (line 543) | __forceinline__ void copyGlobalShared_WarpUnrolled(
FILE: src/device/primitives.h
function __device__ (line 68) | __device__ static int calcBytePerStep() {
function __device__ (line 72) | __device__ static int calcBytePerGrain() {
type ProtoLL (line 79) | struct ProtoLL {
function calcBytePerGrain (line 87) | static int calcBytePerGrain() {
type ProtoLL128 (line 94) | struct ProtoLL128 {
function calcBytePerGrain (line 102) | static int calcBytePerGrain() {
function __device__ (line 120) | __device__ FanAsymmetric(int nrecv, int nsend): nr(nrecv), ns(nsend) {
function __device__ (line 132) | __device__ FanSymmetric(int nrecv, int nsend): n(nrecv) {
function __device__ (line 146) | __device__ void directSend(intptr_t inpIx, intptr_t outIx, int eltN) {
function __device__ (line 149) | __device__ void directSendFromOutput(intptr_t outIx, int eltN) {
function __device__ (line 152) | __device__ void directRecv(intptr_t outIx, int eltN) {
FILE: src/device/prims_ll.h
type ncclConnInfo (line 32) | struct ncclConnInfo
type ncclConnInfo (line 36) | struct ncclConnInfo
type ncclConnFifo (line 37) | struct ncclConnFifo
function __device__ (line 63) | inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS...
function __device__ (line 64) | inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS...
function ncclLLFifoLine (line 65) | ncclLLFifoLine* recvPtr(int i) { return recvBuff[i]+recvOffset(i); }
function ncclLLFifoLine (line 66) | ncclLLFifoLine* sendPtr(int i) { return sendBuff[i]+sendOffset(i); }
function __device__ (line 67) | inline __device__ uint32_t recvFlag(int i) { return NCCL_LL_FLAG(recvSte...
function __device__ (line 68) | inline __device__ uint32_t sendFlag(int i) { return NCCL_LL_FLAG(sendSte...
function __device__ (line 73) | inline __device__ void barrier() {
function __device__ (line 104) | inline __device__ void waitSend(int nbytes) {
function __device__ (line 133) | inline __device__ void incRecv(int i) {
function __device__ (line 136) | inline __device__ void postRecv() {
function __device__ (line 141) | inline __device__ void incSend(int i, int offset) {
function __device__ (line 150) | __device__ uint64_t readLL(int offset, int i) {
function __device__ (line 223) | __device__ uint64_t readLLFinish(int offset, ncclLLFifoLine(&line)[MaxRe...
function __device__ (line 264) | __device__ void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_...
function store (line 331) | void store(U *dst, U val) {
function else (line 364) | struct DataLoader {
function __device__ (line 392) | __device__ uint64_t loadFinish() {
function __device__ (line 403) | __device__ void storeData(T *dst, uint64_t val, int eltN) {
function loadRecvConn (line 621) | void loadRecvConn(struct ncclConnInfo* conn, int i) {
function loadRecvSync (line 626) | void loadRecvSync() {
function loadSendConn (line 633) | void loadSendConn(struct ncclConnInfo* conn, int i) {
function loadSendSync (line 638) | void loadSendSync() {
function __device__ (line 687) | __device__ ~Primitives() {
function __device__ (line 703) | __device__ void moveDataPtrs(intptr_t delta) {
function __device__ (line 708) | __device__ void send(intptr_t inpIx, int eltN) {
function __device__ (line 723) | __device__ void sendFromOutput(intptr_t outIx, int eltN) {
function __device__ (line 753) | __device__ void recvReduceSend(intptr_t inpIx, int eltN) {
function __device__ (line 828) | __device__ void recvSend(int eltN) {
function __device__ (line 833) | __device__ void sendWithBarrier(intptr_t inpIx, int eltN) {
function __device__ (line 838) | __device__ void localCopy(T* srcs, T* dsts, int eltN) {
FILE: src/device/prims_ll128.h
type ncclConnInfo (line 41) | struct ncclConnInfo
type ncclConnInfo (line 45) | struct ncclConnInfo
type ncclConnFifo (line 46) | struct ncclConnFifo
function __device__ (line 58) | inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS...
function __device__ (line 59) | inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS...
function __device__ (line 60) | inline __device__ uint64_t* recvPtr(int i) { return recvBuff[i]+recvOffs...
function __device__ (line 61) | inline __device__ uint64_t* sendPtr(int i) { return sendBuff[i]+sendOffs...
function __device__ (line 62) | inline __device__ uint64_t recvFlag(int i) { return recvStep[i]+1; }
function __device__ (line 63) | inline __device__ uint64_t sendFlag(int i) { return sendStep[i]+1; }
function __device__ (line 84) | inline __device__ void barrier() {
function __device__ (line 111) | inline __device__ void waitSend(int nbytes) {
function __device__ (line 138) | inline __device__ void postRecv() {
function __device__ (line 141) | inline __device__ void postSend() {
function loadRecvConn (line 539) | void loadRecvConn(struct ncclConnInfo* conn, int i) {
function loadRecvSync (line 544) | void loadRecvSync() {
function loadSendConn (line 551) | void loadSendConn(struct ncclConnInfo* conn, int i) {
function loadSendSync (line 556) | void loadSendSync() {
function __device__ (line 607) | __device__ ~Primitives() {
function __device__ (line 623) | __device__ void moveDataPtrs(intptr_t delta) {
function __device__ (line 628) | __device__ void send(intptr_t inpIx, int eltN) {
function __device__ (line 644) | __device__ void sendFromOutput(intptr_t outIx, int eltN) {
function __device__ (line 674) | __device__ void recvReduceSend(intptr_t inpIx, int eltN) {
function __device__ (line 749) | __device__ void recvSend(int eltN) {
function __device__ (line 754) | __device__ void sendWithBarrier(intptr_t inpIx, int eltN) {
function __device__ (line 757) | __device__ void localCopy(T* srcs, T* dsts, int eltN) {
FILE: src/device/prims_simple.h
type primsMode (line 18) | enum primsMode {
type ncclConnInfo (line 56) | struct ncclConnInfo
type ncclConnFifo (line 57) | struct ncclConnFifo
function __device__ (line 83) | inline __device__ void barrier() {
function __device__ (line 93) | inline __device__ void subBarrier() {
function __device__ (line 99) | inline __device__ void patBarrier() {
function __device__ (line 107) | inline __device__ void barrierAny() {
function __device__ (line 111) | inline __device__ void subBarrierAny() {
function __device__ (line 115) | inline __device__ uint64_t loadStepValue(uint64_t* ptr) {
function __device__ (line 216) | inline __device__ void postPeer(bool dataStored) {
function __device__ (line 514) | static inline __device__ void recvPeerNotify(int peer, int connIndex, in...
function __forceinline__ (line 616) | __forceinline__ void
function loadRecvConn (line 676) | void loadRecvConn(ncclDevChannelPeer *peer, int connIndex, uint32_t dire...
function loadSendConn (line 728) | void loadSendConn(ncclDevChannelPeer *peer, int connIndex, uint32_t dire...
type ncclDevWorkCollReg (line 865) | struct ncclDevWorkCollReg
type ncclPatPeer (line 878) | struct ncclPatPeer
type ncclPatPeer (line 878) | struct ncclPatPeer
type ncclConnInfo (line 879) | struct ncclConnInfo
type ncclPatPeer (line 888) | struct ncclPatPeer
function __forceinline__ (line 910) | __forceinline__ __device__ ~Primitives() {
function __device__ (line 943) | __device__ void setDataPtrs(void const *inputBuf, void *outputBuf, uint6...
function __device__ (line 1057) | __device__ void moveDataPtrs(intptr_t delta) {
function send (line 1072) | void send(intptr_t inpIx, int eltN) {
function sendFromOutput (line 1075) | void sendFromOutput(intptr_t outIx, int eltN) {
function directSend (line 1078) | void directSend(intptr_t inpIx, intptr_t outIx, int eltN) {
function directSendFromOutput (line 1081) | void directSendFromOutput(intptr_t outIx, int eltN) {
function directRecvCopy (line 1091) | void directRecvCopy(intptr_t inpIx, intptr_t outIx, int eltN) {
function scatter (line 1155) | void
function directScatter (line 1159) | void
function directGather (line 1168) | void
function patReduce (line 1173) | void patReduce(struct ncclPatStep* ps, struct ncclPatShmem* shmem) {
function patCopy (line 1267) | void patCopy(struct ncclPatStep* ps, struct ncclPatShmem* shmem) {
function sendWithBarrier (line 1362) | void sendWithBarrier(intptr_t inpIx, int eltN) {
function localCopy (line 1365) | void localCopy(T* srcs, T* dsts, int eltN) {
FILE: src/device/rccl_metadata.h
function isMsccl (line 29) | constexpr bool isMsccl(int metadata){
FILE: src/device/reduce.h
function runRing (line 15) | void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
function run (line 80) | void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
function run (line 87) | void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
FILE: src/device/reduce_kernel.h
function true_type (line 22) | struct IsFloatingPoint<half>: std::true_type {}
type IsFloatingPoint (line 25) | struct IsFloatingPoint
type IsFloatingPoint (line 29) | struct IsFloatingPoint
function loadArg (line 71) | static uint64_t loadArg(void *ptr) { return 0; }
function loadArg (line 77) | static uint64_t loadArg(void *ptr) {
type Apply_Cast (line 92) | struct Apply_Cast
type Apply_Cast_MaybeEmpty (line 126) | struct Apply_Cast_MaybeEmpty
function __device__ (line 136) | __device__ constexpr static BytePack<0> reduce(Fn fn, BytePack<0> a, Byt...
function __device__ (line 144) | __device__ constexpr static BytePack<0> preOp(Fn fn, BytePack<0> a) { re...
function __device__ (line 152) | __device__ constexpr static BytePack<0> postOp(Fn fn, BytePack<0> a) { r...
function __device__ (line 159) | __device__ constexpr static BytePack<0> load(Fn fn, uintptr_t addr) { re...
type Apply_Cast (line 206) | struct Apply_Cast {
type Apply_Cast (line 223) | struct Apply_Cast
type Apply_Cast (line 229) | struct Apply_Cast
type Apply_Cast (line 236) | struct Apply_Cast
type Apply_Cast (line 242) | struct Apply_Cast
type Apply_Cast (line 250) | struct Apply_Cast
type Apply_Cast (line 256) | struct Apply_Cast
function BytePack (line 292) | static BytePack<0> reduce(Fn fn, BytePack<0> a, BytePack<0> b) {
function __forceinline__ (line 304) | __forceinline__ static BytePack<Size> reduce(Fn fn, BytePack<Size> a, By...
type Apply_Reduce (line 339) | struct Apply_Reduce
type Apply_Reduce (line 351) | struct Apply_Reduce
function __forceinline__ (line 462) | __forceinline__ static BytePack<Size> preOp(Fn fn, BytePack<Size> a) {
function T (line 809) | static T divide(T dividend, T divisor) {
function uint64_t (line 815) | struct Divider<uint64_t> {
function T (line 850) | T divide(T x) {
function BytePack (line 872) | static BytePack<EltPerPack*sizeof(T)> reduce(FuncSumPostDiv<T> fn, ByteP...
function BytePack (line 1027) | static BytePack<BytePerPack> load(Fn fn, uintptr_t addr) {
FILE: src/device/reduce_scatter.h
function runRing (line 15) | void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
function run (line 167) | void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
function run (line 174) | void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
function run (line 181) | void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
type ncclDevWorkColl (line 249) | struct ncclDevWorkColl
function __forceinline__ (line 254) | __forceinline__ void operator()(
function run (line 315) | void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) {
type ncclDevWorkColl (line 456) | struct ncclDevWorkColl
function __forceinline__ (line 461) | __forceinline__ void operator()(
function run (line 525) | void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
FILE: src/device/sendrecv.h
function __device__ (line 138) | __device__ void run() {
FILE: src/device/symmetric/generate.py
function paste (line 25) | def paste(sep, *args):
function emitln (line 29) | def emitln(f, lines):
function indent (line 34) | def indent(s):
class Rec (line 37) | class Rec(object):
method __init__ (line 38) | def __init__(me, **kw):
method __eq__ (line 40) | def __eq__(x, y):
method __hash__ (line 46) | def __hash__(me):
function enumerate_kernels (line 93) | def enumerate_kernels():
function required_cuda (line 103) | def required_cuda(k):
function kernel_fdep (line 123) | def kernel_fdep(k):
function kernel_fname (line 126) | def kernel_fname(k):
function kernel_gencode (line 135) | def kernel_gencode(k):
function kernel_cname (line 141) | def kernel_cname(k):
function kernel_conds (line 147) | def kernel_conds(k):
function instantiate (line 158) | def instantiate(k):
function prototype (line 178) | def prototype(k):
function partition (line 183) | def partition(vals, keyfn):
FILE: src/enhcompat.cc
type cudaError_t (line 9) | enum cudaError_t { cudaErrorStubLibrary = 34 }
function cudaError_t (line 14) | cudaError_t cudaStreamGetCaptureInfo_v2(...) { return cudaErrorS...
function cudaError_t (line 17) | cudaError_t cudaUserObjectCreate(...) { return cudaErrorS...
function cudaError_t (line 20) | cudaError_t cudaGraphRetainUserObject(...) { return cudaErrorS...
function cudaError_t (line 23) | cudaError_t cudaStreamUpdateCaptureDependencies(...) { return cudaErrorS...
function cudaError_t (line 26) | cudaError_t cudaGetDriverEntryPoint(...) { return cudaErrorS...
FILE: src/enqueue.cc
type ncclKernelMatch (line 42) | struct ncclKernelMatch {
function rcclProtoGrainSize (line 67) | static int rcclProtoGrainSize(int proto, ncclComm *comm){
function rcclShmemScratchWarpSize (line 77) | constexpr int rcclShmemScratchWarpSize(int cudaArch = NCCL_CUDA_ARCH, in...
function rcclShmemDynamicSize (line 88) | constexpr int rcclShmemDynamicSize(int cudaArch = NCCL_CUDA_ARCH, int Wa...
function ncclFuncTrafficPerByte (line 154) | static inline int ncclFuncTrafficPerByte(ncclFunc_t func, int nRanks) {
function ncclResult_t (line 167) | static ncclResult_t addProxyOpIfNeeded(struct ncclComm* comm, struct ncc...
function addWorkBatchToPlan (line 182) | static void addWorkBatchToPlan(
function finishPlan (line 249) | static void finishPlan(struct ncclComm* comm, struct ncclKernelPlan* pla...
type ncclComm (line 337) | struct ncclComm
type ncclTaskColl (line 337) | struct ncclTaskColl
type ncclComm (line 339) | struct ncclComm
type ncclTaskColl (line 339) | struct ncclTaskColl
type ncclComm (line 343) | struct ncclComm
type ncclTaskColl (line 343) | struct ncclTaskColl
type ncclProxyOp (line 344) | struct ncclProxyOp
type ncclKernelPlanBudget (line 347) | struct ncclKernelPlanBudget {
function testBudget (line 352) | static bool testBudget(
function gfx9CheapFenceOff (line 364) | bool gfx9CheapFenceOff(const ncclDevWorkColl& devWork, bool disabledByPr...
function ncclResult_t (line 369) | ncclResult_t ncclTasksRegAndEnqueue(struct ncclComm* comm) {
function ncclResult_t (line 451) | ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConne...
function ncclResult_t (line 637) | static ncclResult_t addProfilerProxyOpIfNeeded(struct ncclComm* comm, st...
function ncclResult_t (line 647) | static ncclResult_t scheduleCollTasksToPlan(
function ncclResult_t (line 969) | static ncclResult_t addP2pToPlan(
function calcP2pChannelCount (line 1243) | static int calcP2pChannelCount(size_t totalSize, int minChannels, int ma...
function ncclResult_t (line 1253) | static ncclResult_t scheduleP2pTasksToPlan(
function ncclResult_t (line 1340) | static ncclResult_t waitWorkFifoAvailable(struct ncclComm* comm, uint32_...
type uploadWork_cleanup_t (line 1366) | struct uploadWork_cleanup_t {
type ncclCommEventCallback (line 1367) | struct ncclCommEventCallback
function ncclResult_t (line 1370) | ncclResult_t uploadWork_cleanup_fn(
function ncclResult_t (line 1381) | static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelP...
function ncclResult_t (line 1499) | static ncclResult_t uploadProxyOps(struct ncclComm* comm, struct ncclKer...
function ncclResult_t (line 1541) | static ncclResult_t hostStreamPlanTask(struct ncclComm* comm, struct ncc...
function hostStreamPlanCallback (line 1557) | static void HIPRT_CB hostStreamPlanCallback(void *plan_) {
function ncclResult_t (line 1567) | static ncclResult_t reclaimPlan(struct ncclComm* comm, struct ncclCommCa...
function persistentDestructor (line 1620) | static void persistentDestructor(void* plans_) {
type ncclImplicitOrder (line 1633) | enum ncclImplicitOrder {
function ncclResult_t (line 1640) | static ncclResult_t getImplicitOrder(enum ncclImplicitOrder *mode, bool ...
function ncclResult_t (line 1655) | ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) {
function ncclResult_t (line 1795) | ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* co...
function ncclResult_t (line 1808) | ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPl...
function ncclResult_t (line 1935) | ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ...
type KernelFinishCallback (line 1951) | struct KernelFinishCallback {
type ncclCommEventCallback (line 1952) | struct ncclCommEventCallback
function ncclResult_t (line 1955) | ncclResult_t KernelFinishCallback_fn(
function ncclResult_t (line 1966) | ncclResult_t ncclLaunchFinish(struct ncclComm* comm) {
function ncclResult_t (line 2031) | static inline ncclResult_t getCollNetSupport(
function initCollCostTable (line 2052) | static void initCollCostTable(float** collCostTable) {
function ncclResult_t (line 2062) | static ncclResult_t updateCollCostTable(
function ncclResult_t (line 2105) | static ncclResult_t topoGetAlgoInfo(
function rccl_static (line 2277) | rccl_static ncclResult_t getAlgoInfo(
function ncclResult_t (line 2347) | static ncclResult_t calcCollChunking(
function ncclResult_t (line 2634) | static ncclResult_t hostToDevRedOp(
function ncclResult_t (line 2730) | static ncclResult_t ncclPlannerSetCapturingGraph(struct ncclComm* comm, ...
function ncclResult_t (line 2760) | static ncclResult_t p2pTaskAppend(
function ncclResult_t (line 2858) | static ncclResult_t collTaskAppend(
function ncclResult_t (line 2904) | static ncclResult_t ceCollTaskAppend(
function ncclResult_t (line 2955) | static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* i...
function ncclResult_t (line 3027) | ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
function ncclResult_t (line 3070) | ncclResult_t ncclRedOpCreatePreMulSum_impl(ncclRedOp_t *op, void *scalar...
function ncclResult_t (line 3115) | ncclResult_t ncclRedOpDestroy_impl(ncclRedOp_t op, ncclComm_t comm) {
FILE: src/graph/connect.cc
function ncclResult_t (line 22) | ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph*...
function isRankHere (line 97) | bool isRankHere(const char* s, int start, int end, int rank) {
function ncclResult_t (line 121) | ncclResult_t ncclTreeBasePostset(struct ncclComm* comm,
function ncclResult_t (line 217) | static ncclResult_t connectRings(struct ncclComm* comm, int* ringRecv, i...
function ncclResult_t (line 264) | static ncclResult_t getIndexes(int* ranks, int* indexes, int nNodes) {
function ncclResult_t (line 269) | static ncclResult_t setTreeUp(struct ncclTree* tree, int* indexes, int u) {
function ncclResult_t (line 275) | static ncclResult_t setTreeDown(struct ncclTree* tree, int* indexes, int...
function ncclResult_t (line 287) | static ncclResult_t connectTrees(struct ncclComm* comm, int* treeToParen...
function ncclResult_t (line 374) | static ncclResult_t connectCollNet(struct ncclComm* comm, struct ncclTop...
function ncclResult_t (line 439) | static ncclResult_t connectNvls(struct ncclComm* comm, int* nvlsHeads, i...
function ncclMinNchannels (line 526) | int ncclMinNchannels() {
function ncclMaxNchannels (line 540) | int ncclMaxNchannels() {
function copyChannels (line 553) | static int copyChannels(struct ncclComm* comm, int start, int end, int* ...
function exchangeValues (line 564) | void exchangeValues(int* v0, int* v1) {
function getTreeNodeParity (line 570) | int getTreeNodeParity(int treeDir, int nNodes, int node)
function ncclResult_t (line 587) | ncclResult_t connectRailOptimizedTrees(struct ncclComm* comm, int* treeT...
function ncclResult_t (line 713) | ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int...
FILE: src/graph/paths.cc
type ncclTopoNodeList (line 20) | struct ncclTopoNodeList {
type ncclTopoNode (line 21) | struct ncclTopoNode
function ncclResult_t (line 25) | static ncclResult_t getPath(struct ncclTopoSystem* system, struct ncclTo...
function ncclResult_t (line 38) | static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, stru...
function printNodePaths (line 120) | static void printNodePaths(struct ncclTopoSystem* system, struct ncclTop...
function ncclResult_t (line 153) | ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system) {
function ncclResult_t (line 163) | ncclResult_t ncclGetLocalCpu(struct ncclTopoSystem* system, int gpu, int...
function mergePathType (line 183) | static int mergePathType(int type0, int type1){
function ncclResult_t (line 190) | static ncclResult_t addInterStep(struct ncclTopoSystem* system, int tx, ...
function ncclTopoRemovePaths (line 209) | static void ncclTopoRemovePaths(struct ncclTopoSystem* system) {
function ncclResult_t (line 222) | ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char...
function ncclResult_t (line 266) | ncclResult_t ncclGetUserP2pLevel(int* level) {
function ncclResult_t (line 274) | ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSyst...
function ncclResult_t (line 388) | ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct nc...
function ncclResult_t (line 415) | ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int rank, i...
function ncclResult_t (line 505) | ncclResult_t ncclTopoIsGdrAvail(struct ncclTopoSystem* system, int rank,...
function ncclResult_t (line 529) | ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int64_t netId, int...
function ncclResult_t (line 558) | ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int rank1, ...
function ncclResult_t (line 589) | ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, ...
function ncclPxnDisable (line 618) | int ncclPxnDisable(struct ncclComm* comm) {
function ncclResult_t (line 632) | ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermedia...
function rcclPathOverride (line 662) | static bool rcclPathOverride(struct ncclTopoSystem* system, uint64_t dis...
type ncclTopoSystem (line 688) | struct ncclTopoSystem
type ncclComm (line 688) | struct ncclComm
type ncclPeerInfo (line 730) | struct ncclPeerInfo
type ncclPeerInfo (line 733) | struct ncclPeerInfo
type ncclTopoNode (line 748) | struct ncclTopoNode
type ncclTopoNode (line 750) | struct ncclTopoNode
type ncclTopoNode (line 783) | struct ncclTopoNode
type ncclTopoNode (line 787) | struct ncclTopoNode
type ncclTopoNode (line 793) | struct ncclTopoNode
type ncclTopoGdrMode (line 812) | enum ncclTopoGdrMode
type ncclTopoNode (line 827) | struct ncclTopoNode
function ncclResult_t (line 835) | ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct nc...
function ncclTopoFree (line 938) | void ncclTopoFree(struct ncclTopoSystem* system) {
function ncclResult_t (line 943) | static ncclResult_t ncclTopoGetNchannels(struct ncclComm* comm, int g /*...
function ncclResult_t (line 986) | ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
function ncclResult_t (line 1032) | ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank,...
function ncclResult_t (line 1049) | ncclResult_t ncclTopoGetGpuMinPath(struct ncclTopoSystem* system, int ty...
function ncclResult_t (line 1063) | ncclResult_t ncclTopoGetGpuMaxPath(struct ncclTopoSystem* system, int ty...
function ncclResult_t (line 1077) | ncclResult_t ncclTopoPathAllNVLink(struct ncclTopoSystem* system, int* a...
function ncclResult_t (line 1086) | ncclResult_t ncclTopoSplitNvLink(struct ncclTopoSystem* system, int* spl...
FILE: src/graph/rings.cc
function dumpLine (line 9) | void dumpLine(int* values, int nranks, const char* prefix) {
function ncclResult_t (line 28) | ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks...
FILE: src/graph/rome_models.cc
type rcclRomeModel (line 35) | struct rcclRomeModel {
type rcclRomeModel (line 55) | struct rcclRomeModel
type rcclRomeModel (line 68) | struct rcclRomeModel
type rcclRomeModel (line 81) | struct rcclRomeModel
type rcclRomeModel (line 94) | struct rcclRomeModel
type rcclRomeModel (line 107) | struct rcclRomeModel
type rcclRomeModel (line 120) | struct rcclRomeModel
type rcclRomeModel (line 133) | struct rcclRomeModel
type rcclRomeModel (line 146) | struct rcclRomeModel
type rcclRomeModel (line 159) | struct rcclRomeModel
type rcclRomeModel (line 172) | struct rcclRomeModel
type rcclRomeModel (line 185) | struct rcclRomeModel
type rcclRomeModel (line 198) | struct rcclRomeModel
type rcclRomeModel (line 211) | struct rcclRomeModel
type rcclRomeModel (line 224) | struct rcclRomeModel
type rcclRomeModel (line 237) | struct rcclRomeModel
type rcclRomeModel (line 250) | struct rcclRomeModel
type rcclRomeModel (line 263) | struct rcclRomeModel
type rcclRomeModel (line 276) | struct rcclRomeModel
type rcclRomeModel (line 289) | struct rcclRomeModel
type rcclRomeModel (line 302) | struct rcclRomeModel
type rcclRomeModel (line 315) | struct rcclRomeModel
type rcclRomeModel (line 328) | struct rcclRomeModel
type rcclRomeModel (line 341) | struct rcclRomeModel
type rcclRomeModel (line 355) | struct rcclRomeModel
type rcclRomeModel (line 368) | struct rcclRomeModel
type rcclRomeModel (line 382) | struct rcclRomeModel
type rcclRomeModel (line 395) | struct rcclRomeModel
type rcclRomeModel (line 408) | struct rcclRomeModel
type rcclRomeModel (line 421) | struct rcclRomeModel
type rcclRomeModel (line 434) | struct rcclRomeModel
type rcclRomeModel (line 447) | struct rcclRomeModel
type rcclRomeModel (line 460) | struct rcclRomeModel
type rcclRomeModel (line 473) | struct rcclRomeModel
type rcclRomeModel (line 486) | struct rcclRomeModel
type rcclRomeModel (line 499) | struct rcclRomeModel
type rcclRomeModel (line 512) | struct rcclRomeModel
type rcclRomeModel (line 525) | struct rcclRomeModel
type rcclRomeModel (line 538) | struct rcclRomeModel
type rcclRomeModel (line 551) | struct rcclRomeModel
type rcclRomeModel (line 564) | struct rcclRomeModel
type rcclRomeModel (line 577) | struct rcclRomeModel
type rcclRomeModel (line 892) | struct rcclRomeModel
type rcclRomeModel (line 905) | struct rcclRomeModel
type rcclRomeModel (line 918) | struct rcclRomeModel
type rcclRomeModel (line 1218) | struct rcclRomeModel
function ncclResult_t (line 1276) | ncclResult_t parseGraph(const char* str, struct ncclTopoSystem* system, ...
function ncclResult_t (line 1411) | ncclResult_t parseGraphLight(const char* str, struct ncclTopoSystem* sys...
function parseOptions (line 1492) | static void parseOptions(struct ncclTopoSystem* system, const char *opti...
function checkOption (line 1540) | static bool checkOption(const char *options, const char *name) {
function ncclResult_t (line 1569) | ncclResult_t parseChordalRing(struct ncclTopoSystem* system, struct nccl...
function ncclResult_t (line 1644) | static ncclResult_t parseRomeSystem(struct ncclTopoSystem* system, struc...
function permuteGpuIds (line 1814) | static bool permuteGpuIds(int *g, int n, int last, struct rcclRomeModel*...
function permuteNetIds (line 1858) | static bool permuteNetIds(int *n, int *g, int s, int last, struct rcclRo...
function checkAlltoallWidth (line 1894) | int checkAlltoallWidth(struct rcclRomeModel *romeTopo) {
function ncclResult_t (line 1916) | ncclResult_t parseA2a8P(struct ncclTopoSystem* system, struct ncclTopoGr...
function ncclResult_t (line 2118) | ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTop...
function ncclResult_t (line 2299) | ncclResult_t parse1H16P(struct ncclTopoSystem* system, struct ncclTopoGr...
function ncclResult_t (line 2442) | ncclResult_t find_gpu_hives(int *g_hives, int *ng_hives, struct rcclRome...
function ncclResult_t (line 2478) | ncclResult_t parse4H4P(struct ncclTopoSystem* system, struct ncclTopoGra...
type rcclRomeModel (line 2560) | struct rcclRomeModel
function ncclResult_t (line 2595) | ncclResult_t parseGIOTopos(struct ncclTopoSystem* system, struct ncclTop...
FILE: src/graph/rome_models.h
type ncclTopoSystem (line 26) | struct ncclTopoSystem
type ncclTopoGraph (line 26) | struct ncclTopoGraph
type ncclTopoSystem (line 27) | struct ncclTopoSystem
type ncclTopoGraph (line 27) | struct ncclTopoGraph
type ncclTopoSystem (line 28) | struct ncclTopoSystem
type ncclTopoGraph (line 28) | struct ncclTopoGraph
type ncclTopoSystem (line 29) | struct ncclTopoSystem
type ncclTopoGraph (line 29) | struct ncclTopoGraph
type ncclTopoSystem (line 30) | struct ncclTopoSystem
type ncclTopoGraph (line 30) | struct ncclTopoGraph
type ncclTopoSystem (line 31) | struct ncclTopoSystem
type ncclTopoGraph (line 31) | struct ncclTopoGraph
type ncclTopoSystem (line 32) | struct ncclTopoSystem
type ncclTopoGraph (line 32) | struct ncclTopoGraph
type ncclTopoSystem (line 33) | struct ncclTopoSystem
type ncclTopoGraph (line 33) | struct ncclTopoGraph
FILE: src/graph/search.cc
function getMaxBw (line 23) | static float getMaxBw(struct ncclTopoSystem* system, struct ncclTopoNode...
function getTotalBw (line 33) | static float getTotalBw(struct ncclTopoSystem* system, struct ncclTopoNo...
function ncclResult_t (line 42) | ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system) {
function ncclResult_t (line 59) | ncclResult_t ncclTopoComputeCommCPU(struct ncclComm* comm) {
function ncclResult_t (line 68) | static ncclResult_t findRevLink(struct ncclTopoNode* node1, struct ncclT...
function ncclResult_t (line 83) | static ncclResult_t followPath(struct ncclTopoLinkList* path, struct ncc...
function ncclResult_t (line 124) | static ncclResult_t ncclTopoFollowPath(struct ncclTopoSystem* system, st...
function gpuPciBw (line 169) | static int gpuPciBw(struct ncclTopoNode* gpu) {
type ncclGpuScore (line 185) | struct ncclGpuScore {
function cmpScore (line 195) | static int cmpScore(const void * g1, const void * g2) {
function cmpIntraScores (line 207) | static int cmpIntraScores(struct ncclGpuScore* scores, int count) {
function ncclResult_t (line 216) | static ncclResult_t getGpuIndex(struct ncclTopoSystem* system, int rank,...
function ncclResult_t (line 227) | static ncclResult_t getNetIndex(struct ncclTopoSystem* system, int64_t i...
function ncclResult_t (line 238) | static ncclResult_t getNetPaths(struct ncclTopoSystem* system, struct nc...
function ncclResult_t (line 246) | ncclResult_t ncclTopoSearchNextGpuSort(struct ncclTopoSystem* system, st...
type ncclTopoSystem (line 316) | struct ncclTopoSystem
type ncclTopoGraph (line 316) | struct ncclTopoGraph
type ncclTopoGraph (line 316) | struct ncclTopoGraph
function ncclResult_t (line 327) | ncclResult_t ncclTopoReplayGetGpu(struct ncclTopoSystem* system, struct ...
type ncclTopoSystem (line 339) | struct ncclTopoSystem
type ncclTopoGraph (line 339) | struct ncclTopoGraph
type ncclTopoGraph (line 339) | struct ncclTopoGraph
type ncclTopoNode (line 339) | struct ncclTopoNode
function ncclResult_t (line 341) | ncclResult_t ncclTopoSearchTryGpu(struct ncclTopoSystem* system, struct ...
function ncclTopoCountXGMI (line 354) | static int ncclTopoCountXGMI(struct ncclTopoSystem* system, struct ncclT...
function ncclResult_t (line 383) | ncclResult_t ncclTopoSearchTryCollnetDirect(struct ncclTopoSystem* syste...
function ncclResult_t (line 419) | ncclResult_t ncclTopoSearchTryNvls(struct ncclTopoSystem* system, struct...
function ncclResult_t (line 452) | ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct...
function ncclResult_t (line 478) | static ncclResult_t ncclTopoPrefNetsGpuFirst(struct ncclTopoSystem* syst...
function ncclResult_t (line 513) | static ncclResult_t ncclTopoPrefNetsChannelFirst(struct ncclTopoSystem* ...
function ncclResult_t (line 545) | ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeI...
function ncclResult_t (line 582) | ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ...
function ncclResult_t (line 682) | ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ...
function ncclResult_t (line 797) | ncclResult_t ncclTopoSearchParams(struct ncclTopoSystem* system, int pat...
function ncclResult_t (line 811) | ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncc...
type kvDict (line 845) | struct kvDict
function ncclResult_t (line 858) | ncclResult_t ncclTopoGetChannelFromXml(struct ncclXmlNode *xmlChannel, i...
function ncclResult_t (line 886) | ncclResult_t ncclTopoGetGraphFromXmlSub(struct ncclXmlNode *xmlGraph, st...
function ncclResult_t (line 915) | ncclResult_t ncclTopoGetGraphFromXml(struct ncclXmlNode *xmlGraphs, stru...
function ncclResult_t (line 923) | ncclResult_t ncclTopoGetXmlFromChannel(struct ncclTopoGraph* graph, int ...
function ncclResult_t (line 956) | ncclResult_t ncclTopoGetXmlFromGraph(struct ncclTopoGraph* graph, struct...
function ncclResult_t (line 977) | ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph*...
function ncclResult_t (line 988) | ncclResult_t ncclTopoDupChannels(struct ncclTopoGraph* graph, int ccMin,...
function ncclResult_t (line 1027) | ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGrap...
function ncclResult_t (line 1284) | ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct nc...
function ncclResult_t (line 1323) | ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngrap...
function ncclResult_t (line 1342) | ncclResult_t getNvlsNetDev(struct ncclComm* comm, struct ncclTopoGraph* ...
function ncclResult_t (line 1370) | ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct n...
function ncclResult_t (line 1442) | ncclResult_t ncclTopoGetIntraNetDev(struct ncclTopoSystem* system, int r...
function ncclResult_t (line 1464) | ncclResult_t ncclTopoGetLinkType(struct ncclTopoSystem* system, int cuda...
FILE: src/graph/topo.cc
function ncclResult_t (line 39) | ncclResult_t pciPathToInt64(char* path, int offset, int minOffset, int64...
function ncclResult_t (line 54) | static ncclResult_t findLocalCpu(struct ncclTopoNode* node, struct ncclT...
function ncclResult_t (line 76) | static ncclResult_t ncclTopoGetInterCpuBw(struct ncclTopoNode* cpu, floa...
type ncclNvLinkDeviceType (line 102) | enum ncclNvLinkDeviceType {
function ncclResult_t (line 109) | ncclResult_t ncclTopoGetNode(struct ncclTopoSystem* system, struct ncclT...
function ncclResult_t (line 119) | ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct nc...
function ncclResult_t (line 146) | ncclResult_t ncclTopoRemoveNode(struct ncclTopoSystem* system, int type,...
function ncclResult_t (line 169) | ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct nccl...
function getBcmGen (line 200) | int getBcmGen(uint64_t id, int level) {
function ncclResult_t (line 205) | ncclResult_t ncclTopoFlattenBcmSwitches(struct ncclTopoSystem* system) {
function ncclResult_t (line 270) | ncclResult_t ncclTopoConnectCpus(struct ncclTopoSystem* system) {
function ncclResult_t (line 285) | static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct n...
function ncclResult_t (line 321) | ncclResult_t ncclTopoPrint(struct ncclTopoSystem* s) {
function ncclResult_t (line 330) | static ncclResult_t ncclTopoSort(struct ncclTopoNode* node, struct ncclT...
function ncclResult_t (line 357) | ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system) {
function ncclResult_t (line 362) | ncclResult_t ncclTopoAddNet(struct ncclXmlNode* xmlNet, struct ncclTopoS...
function ncclResult_t (line 392) | ncclResult_t ncclTopoAddNic(struct ncclXmlNode* xmlNic, struct ncclTopoS...
function ncclResult_t (line 405) | ncclResult_t ncclTopoAddGpu(struct ncclXmlNode* xmlGpu, struct ncclTopoS...
type kvDict (line 433) | struct kvDict
type kvDict (line 434) | struct kvDict
function ncclResult_t (line 438) | ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoS...
type kvDict (line 508) | struct kvDict
type kvDict (line 509) | struct kvDict
function ncclResult_t (line 511) | ncclResult_t ncclGetSystemId(struct ncclTopoSystem* system, struct ncclX...
function ncclResult_t (line 523) | ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoS...
function ncclResult_t (line 584) | ncclResult_t ncclTopoAddXGMI(struct ncclXmlNode* node, struct ncclTopoSy...
function ncclResult_t (line 635) | ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTop...
function ncclResult_t (line 691) | ncclResult_t ncclTopoAddPciLinks(struct ncclXmlNode* node, struct ncclTo...
function ncclResult_t (line 723) | ncclResult_t ncclTopoAddC2c(struct ncclXmlNode* node, struct ncclTopoSys...
function ncclResult_t (line 757) | ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTo...
function ncclResult_t (line 793) | static ncclResult_t xmlInitAttrInt(struct ncclXmlNode* node, const char*...
function ncclResult_t (line 804) | static ncclResult_t xmlInitAttrUint64(struct ncclXmlNode* node, const ch...
function ncclResult_t (line 815) | static ncclResult_t xmlInitAttrFloat(struct ncclXmlNode* node, const cha...
function ncclResult_t (line 827) | ncclResult_t ncclTopoRefreshBcmP2pLinks(void) {
function ncclTopoCheckPix (line 841) | int ncclTopoCheckPix(ncclXmlNode* common, ncclXmlNode** nodes, int nNode...
type xmlNodeStack (line 870) | struct xmlNodeStack {
method ncclXmlNode (line 874) | ncclXmlNode* top() {
method ncclXmlNode (line 882) | ncclXmlNode* pop() {
method push (line 890) | void push(ncclXmlNode* node) {
method empty (line 896) | bool empty() {
function ncclResult_t (line 902) | ncclResult_t ncclFindFirstPciParent(ncclXmlNode** parent) {
function ncclResult_t (line 914) | ncclResult_t ncclTopoGetPath(ncclXmlNode** nodes, int nNodes, int* path,...
function ncclResult_t (line 1017) | ncclResult_t ncclTopoMakeUniqueBusId(struct ncclXml* xml, char* busId, s...
function ncclResult_t (line 1042) | ncclResult_t ncclTopoMakePciParent(struct ncclXml* xml, struct ncclXmlNo...
function ncclResult_t (line 1075) | ncclResult_t ncclTopoMakeVnic(struct ncclXml* xml, struct ncclTopoNetInf...
function ncclResult_t (line 1107) | ncclResult_t ncclTopoForceMerge(struct ncclXml* xml, struct ncclTopoNetI...
function ncclResult_t (line 1168) | ncclResult_t ncclTopoAutoMerge(struct ncclXml* xml, struct ncclTopoNetIn...
type kvDict (line 1238) | struct kvDict
function ncclResult_t (line 1251) | ncclResult_t ncclTopoFindLinkWidthRec(ncclXmlNode* node, ncclXmlNode** p...
function ncclResult_t (line 1299) | ncclResult_t ncclTopoFindLinkWidth(ncclXmlNode* parent, ncclXmlNode** ph...
function ncclResult_t (line 1314) | ncclResult_t ncclTopoWidenLinks(ncclXmlNode** physNetNodes, int ndevs, n...
function ncclResult_t (line 1336) | ncclResult_t ncclTopoGetVNicParent(struct ncclXml* xml, ncclResult_t (*g...
function ncclResult_t (line 1373) | ncclResult_t ncclTopoMakeVNics(struct ncclXml* xml, struct ncclTopoNetIn...
function ncclResult_t (line 1401) | static ncclResult_t ncclTopoPopulateNics(ncclXml* xml, int startIndex, i...
function ncclResult_t (line 1447) | ncclResult_t ncclTopoProcessNet(ncclXml* xml, const char* dumpXmlFile, s...
function ncclResult_t (line 1473) | ncclResult_t ncclTopoGetFusionEnv(int* mergeLevel, const char** forceMer...
function ncclResult_t (line 1486) | ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSys...
function ncclResult_t (line 1622) | ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, i...
function ncclResult_t (line 1650) | ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu...
type netDevsPolicy (line 1673) | enum netDevsPolicy {
type netDevsPolicy (line 1680) | enum netDevsPolicy
function getNetDevsPolicyOnce (line 1683) | static void getNetDevsPolicyOnce() {
function ncclResult_t (line 1705) | ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank...
function ncclResult_t (line 1748) | ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int64_t ...
function ncclResult_t (line 1779) | ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, i...
function ncclResult_t (line 1788) | ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int r...
function ncclResult_t (line 1833) | ncclResult_t ncclTopoGetGpuCount(struct ncclTopoSystem* system, int* cou...
function ncclResult_t (line 1838) | ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* cou...
function ncclResult_t (line 1843) | ncclResult_t ncclTopoGetNvsCount(struct ncclTopoSystem* system, int* cou...
function ncclResult_t (line 1848) | ncclResult_t ncclTopoGetCompCap(struct ncclTopoSystem* system, int* ccMi...
FILE: src/graph/topo.h
type ncclTopoNode (line 110) | struct ncclTopoNode
type ncclTopoLink (line 111) | struct ncclTopoLink {
type ncclTopoLinkList (line 120) | struct ncclTopoLinkList {
type ncclTopoNode (line 145) | struct ncclTopoNode {
type ncclTopoNodeSet (line 189) | struct ncclTopoNodeSet {
type ncclTopoSystem (line 194) | struct ncclTopoSystem {
type ncclTopoSystem (line 221) | struct ncclTopoSystem
type ncclTopoNode (line 221) | struct ncclTopoNode
type ncclTopoSystem (line 222) | struct ncclTopoSystem
type ncclTopoNode (line 222) | struct ncclTopoNode
type ncclTopoSystem (line 223) | struct ncclTopoSystem
type ncclTopoNode (line 224) | struct ncclTopoNode
type ncclTopoNode (line 224) | struct ncclTopoNode
type ncclTopoSystem (line 225) | struct ncclTopoSystem
type ncclTopoSystem (line 226) | struct ncclTopoSystem
type ncclTopoSystem (line 227) | struct ncclTopoSystem
type ncclTopoSystem (line 228) | struct ncclTopoSystem
type ncclTopoSystem (line 229) | struct ncclTopoSystem
type ncclTopoSystem (line 230) | struct ncclTopoSystem
type ncclTopoNetInfo (line 232) | struct ncclTopoNetInfo {
type ncclTopoNetInfo (line 250) | struct ncclTopoNetInfo
type ncclXml (line 255) | struct ncclXml
type ncclTopoSystem (line 255) | struct ncclTopoSystem
type ncclXmlNode (line 256) | struct ncclXmlNode
type ncclTopoSystem (line 256) | struct ncclTopoSystem
type ncclTopoGraph (line 256) | struct ncclTopoGraph
type ncclTopoGraph (line 257) | struct ncclTopoGraph
type ncclTopoSystem (line 257) | struct ncclTopoSystem
type ncclXml (line 257) | struct ncclXml
type ncclTopoSystem (line 259) | struct ncclTopoSystem
function ncclResult_t (line 261) | static ncclResult_t ncclTopoIdToIndex(struct ncclTopoSystem* system, int...
function ncclResult_t (line 272) | static ncclResult_t ncclTopoRankToIndex(struct ncclTopoSystem* system, i...
function ncclResult_t (line 284) | static ncclResult_t ncclTopoDevToRank(struct ncclTopoSystem* system, int...
type kvDict (line 296) | struct kvDict
function ncclResult_t (line 298) | static ncclResult_t ncclTopoIdToNetDev(struct ncclTopoSystem* system, in...
function ncclTopoXGMISpeed (line 311) | static float ncclTopoXGMISpeed(const char* gcn) {
function ncclTopoNVLinkBw (line 323) | static float ncclTopoNVLinkBw(int cudaCompCap) {
function isPow2 (line 335) | static bool isPow2(int val) {
function mirrorBits (line 338) | static int mirrorBits(int val, int pow2) {
FILE: src/graph/trees.cc
function ncclResult_t (line 31) | ncclResult_t ncclGetBtree(int nranks, int rank, int* u, int* d0, int* d1...
function ncclResult_t (line 88) | ncclResult_t ncclGetDtree(int nranks, int rank, int* s0, int* d0_0, int*...
FILE: src/graph/tuning.cc
function getNthreads (line 18) | static int getNthreads(const char* name, int env, int min, int max, int ...
function ncclResult_t (line 56) | ncclResult_t parseList(const char* str, const char* prefixElems[], int n...
type tuningModel (line 151) | struct tuningModel {
type tuningModel (line 166) | struct tuningModel
type tuningModel (line 199) | struct tuningModel
type tuningModel (line 232) | struct tuningModel
type tuningModel (line 265) | struct tuningModel
type tuningModel (line 298) | struct tuningModel
type tuningModel (line 331) | struct tuningModel
type tuningModel (line 377) | struct tuningModel
type tuningModel (line 427) | struct tuningModel
type tuningModel (line 477) | struct tuningModel
function ncclPatEnable (line 557) | static int ncclPatEnable(struct ncclComm* comm) {
function getNetOverhead (line 570) | static float getNetOverhead(struct ncclComm* comm) {
function ncclResult_t (line 579) | ncclResult_t ncclTopoInitTunerConstants(struct ncclComm* comm) {
function ncclResult_t (line 586) | ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, in...
function ncclResult_t (line 986) | ncclResult_t ncclTopoGetAlgoTime(struct ncclComm* comm, int coll, int al...
function rcclGetTuningIndexForArch (line 1019) | int rcclGetTuningIndexForArch(const char* gfxarch) {
FILE: src/graph/xml.cc
function ncclResult_t (line 34) | ncclResult_t xmlGetChar(FILE* file, char* c) {
function ncclResult_t (line 42) | ncclResult_t xmlGetValue(FILE* file, char* value, char* last) {
function ncclResult_t (line 83) | ncclResult_t xmlGetToken(FILE* file, char* name, char* value, char* last) {
function ncclResult_t (line 112) | ncclResult_t xmlSkipComment(FILE* file, char* start, char next) {
function ncclResult_t (line 133) | ncclResult_t xmlGetNode(FILE* file, struct ncclXmlNode* node) {
type ncclXml (line 188) | struct ncclXml
type ncclXmlNode (line 188) | struct ncclXmlNode
type xmlHandler (line 190) | struct xmlHandler {
function ncclResult_t (line 195) | ncclResult_t xmlLoadSub(FILE* file, struct ncclXml* xml, struct ncclXmlN...
function ncclResult_t (line 251) | ncclResult_t ncclTopoConvertXml(struct ncclXml* xml, uintptr_t base, int...
function ncclResult_t (line 267) | ncclResult_t ncclTopoDumpXmlRec(int indent, FILE* file, struct ncclXmlNo...
function ncclResult_t (line 287) | ncclResult_t ncclTopoDumpXmlToFile(const char* xmlTopoFile, struct ncclX...
function ncclResult_t (line 298) | static ncclResult_t xmlTopoFuseXmlRecursive(struct ncclXml* dst, struct ...
function ncclResult_t (line 312) | ncclResult_t ncclTopoFuseXml(struct ncclXml* dst, struct ncclXml* src) {
function ncclResult_t (line 334) | ncclResult_t ncclTopoXmlLoadNvlink(FILE* file, struct ncclXml* xml, stru...
function ncclResult_t (line 339) | ncclResult_t ncclTopoXmlLoadPciLink(FILE* file, struct ncclXml* xml, str...
function ncclResult_t (line 344) | ncclResult_t ncclTopoXmlLoadC2c(FILE* file, struct ncclXml* xml, struct ...
function ncclResult_t (line 348) | ncclResult_t ncclTopoXmlLoadGpu(FILE* file, struct ncclXml* xml, struct ...
function ncclResult_t (line 358) | ncclResult_t ncclTopoXmlLoadNet(FILE* file, struct ncclXml* xml, struct ...
function ncclResult_t (line 363) | ncclResult_t ncclTopoXmlLoadNic(FILE* file, struct ncclXml* xml, struct ...
function ncclResult_t (line 369) | ncclResult_t ncclTopoXmlLoadPci(FILE* file, struct ncclXml* xml, struct ...
function ncclResult_t (line 375) | ncclResult_t ncclTopoXmlLoadCpu(FILE* file, struct ncclXml* xml, struct ...
function ncclResult_t (line 381) | ncclResult_t ncclTopoXmlLoadSystem(FILE* file, struct ncclXml* xml, stru...
function ncclResult_t (line 401) | ncclResult_t ncclTopoGetXmlFromFile(const char* xmlTopoFile, struct nccl...
function memcpylower (line 424) | static void memcpylower(char* dst, const char* src, const size_t size) {
function ncclResult_t (line 427) | static ncclResult_t getPciPath(const char* busId, char** path) {
function ncclResult_t (line 442) | static ncclResult_t getBcmLinks(const char* busId, int* nlinks, char** p...
function ncclResult_t (line 463) | ncclResult_t ncclTopoGetStrFromSys(const char* path, const char* fileNam...
function ncclResult_t (line 484) | ncclResult_t ncclTopoSetAttrFromSys(struct ncclXmlNode* pciNode, const c...
function ncclResult_t (line 492) | ncclResult_t ncclTopoGetXmlFromCpu(struct ncclXmlNode* cpuNode, struct n...
function ncclResult_t (line 567) | ncclResult_t ncclTopoGetPciNode(struct ncclXml* xml, const char* busId, ...
function isHex (line 579) | int isHex(char c) { return ((c >= '0' && c <= '9') || (c >= 'a' && c <= ...
function checkBDFFormat (line 580) | int checkBDFFormat(char* bdf) {
function ncclResult_t (line 589) | ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct n...
function ncclResult_t (line 762) | ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, uint32_t...
function ncclResult_t (line 999) | ncclResult_t ncclTopoFillGpu(struct ncclXml* xml, const char* busId, str...
function ncclResult_t (line 1030) | ncclResult_t ncclTopoGetSubsystem(const char* sysPath, char* subSys) {
function ncclResult_t (line 1045) | ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, c...
function ncclResult_t (line 1092) | ncclResult_t ncclTopoTrimXmlRec(struct ncclXmlNode* node, int* keep) {
function ncclResult_t (line 1124) | ncclResult_t ncclTopoTrimXml(struct ncclXml* xml) {
function ncclResult_t (line 1134) | ncclResult_t ncclTopoXmlGraphLoadGpu(FILE* file, struct ncclXml* xml, st...
function ncclResult_t (line 1139) | ncclResult_t ncclTopoXmlGraphLoadNet(FILE* file, struct ncclXml* xml, st...
function ncclResult_t (line 1144) | ncclResult_t ncclTopoXmlGraphLoadChannel(FILE* file, struct ncclXml* xml...
function ncclResult_t (line 1150) | ncclResult_t ncclTopoXmlGraphLoadGraph(FILE* file, struct ncclXml* xml, ...
function ncclResult_t (line 1156) | ncclResult_t ncclTopoXmlGraphLoadGraphs(FILE* file, struct ncclXml* xmlG...
function ncclResult_t (line 1173) | ncclResult_t ncclTopoGetXmlGraphFromFile(const char* xmlGraphFile, struc...
FILE: src/graph/xml.h
type ncclXmlNode (line 28) | struct ncclXmlNode {
type ncclXml (line 41) | struct ncclXml {
type ncclXml (line 48) | struct ncclXml
type ncclXml (line 49) | struct ncclXml
type ncclXml (line 51) | struct ncclXml
type ncclXml (line 54) | struct ncclXml
type ncclXmlNode (line 54) | struct ncclXmlNode
type ncclXml (line 55) | struct ncclXml
type ncclXmlNode (line 55) | struct ncclXmlNode
type ncclXmlNode (line 55) | struct ncclXmlNode
type ncclXml (line 58) | struct ncclXml
type ncclXml (line 61) | struct ncclXml
type ncclXml (line 61) | struct ncclXml
type ncclXml (line 63) | struct ncclXml
function xmlMemSize (line 72) | static size_t xmlMemSize(int maxNodes) {
function ncclResult_t (line 75) | static ncclResult_t xmlAlloc(struct ncclXml** xml, int maxNodes) {
function ncclResult_t (line 83) | static ncclResult_t xmlGetAttrIndex(struct ncclXmlNode* node, const char...
function ncclResult_t (line 95) | static ncclResult_t xmlGetAttr(struct ncclXmlNode* node, const char* att...
function ncclResult_t (line 102) | static ncclResult_t xmlGetAttrStr(struct ncclXmlNode* node, const char* ...
function ncclResult_t (line 110) | static ncclResult_t xmlGetAttrInt(struct ncclXmlNode* node, const char* ...
function ncclResult_t (line 117) | static ncclResult_t xmlGetAttrIntDefault(struct ncclXmlNode* node, const...
function ncclResult_t (line 124) | static ncclResult_t xmlGetAttrUint64(struct ncclXmlNode* node, const cha...
function ncclResult_t (line 131) | static ncclResult_t xmlGetAttrUint64Default(struct ncclXmlNode* node, co...
function ncclResult_t (line 138) | static ncclResult_t xmlGetAttrLong(struct ncclXmlNode* node, const char*...
function ncclResult_t (line 145) | static ncclResult_t xmlGetAttrFloat(struct ncclXmlNode* node, const char...
function ncclResult_t (line 152) | static ncclResult_t xmlGetAttrFloatDefault(struct ncclXmlNode* node, con...
function ncclResult_t (line 159) | static ncclResult_t xmlFindTag(struct ncclXml* xml, const char* tagName,...
function ncclResult_t (line 171) | static ncclResult_t xmlFindNextTag(struct ncclXml* xml, const char* tagN...
function ncclResult_t (line 183) | static ncclResult_t xmlFindTagKv(struct ncclXml* xml, const char* tagNam...
function ncclResult_t (line 199) | static ncclResult_t xmlFindNode(struct ncclXmlNode* parentNode, struct n...
function ncclResult_t (line 222) | static ncclResult_t xmlSetAttr(struct ncclXmlNode* node, const char* att...
function ncclResult_t (line 235) | static ncclResult_t xmlPrintNodeRecursive(struct ncclXmlNode* node, cons...
function ncclResult_t (line 253) | static ncclResult_t xmlSetAttrIfUnset(struct ncclXmlNode* node, const ch...
function ncclResult_t (line 265) | static ncclResult_t xmlSetAttrInt(struct ncclXmlNode* node, const char* ...
function ncclResult_t (line 277) | static ncclResult_t xmlSetAttrFloat(struct ncclXmlNode* node, const char...
function ncclResult_t (line 289) | static ncclResult_t xmlSetAttrLong(struct ncclXmlNode* node, const char*...
function ncclResult_t (line 301) | static ncclResult_t xmlUnsetAttr(struct ncclXmlNode* node, const char* a...
function ncclResult_t (line 313) | static ncclResult_t xmlGetSub(struct ncclXmlNode* node, const char* subN...
function ncclResult_t (line 324) | static ncclResult_t xmlGetSubKv(struct ncclXmlNode* node, const char* su...
function ncclResult_t (line 339) | static ncclResult_t xmlGetSubKvInt(struct ncclXmlNode* node, const char*...
function ncclResult_t (line 346) | static ncclResult_t xmlAddNode(struct ncclXml* xml, struct ncclXmlNode* ...
function ncclResult_t (line 368) | static ncclResult_t xmlRemoveNode(struct ncclXmlNode* node) {
function ncclResult_t (line 381) | static ncclResult_t xmlAddTree(struct ncclXml* dst, struct ncclXmlNode* ...
type kvDict (line 406) | struct kvDict {
function ncclResult_t (line 411) | static ncclResult_t kvConvertToInt(const char* str, int* value, struct k...
function ncclResult_t (line 424) | static ncclResult_t kvConvertToStr(int value, const char** str, struct k...
FILE: src/group.cc
type ncclComm (line 29) | struct ncclComm
type ncclComm (line 30) | struct ncclComm
type ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> (line 31) | struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next>
type ncclAsyncJob (line 31) | struct ncclAsyncJob
function ncclResult_t (line 35) | ncclResult_t ncclAsyncLaunch(
type ncclAsyncJob (line 80) | struct ncclAsyncJob
type ncclAsyncJob (line 80) | struct ncclAsyncJob
function ncclResult_t (line 89) | ncclResult_t ncclAsyncJobComplete(struct ncclAsyncJob* job) {
function ncclResult_t (line 101) | ncclResult_t ncclGroupStart_impl() {
function ncclResult_t (line 114) | ncclResult_t ncclGroupStartInternal() {
function ncclResult_t (line 123) | ncclResult_t ncclGroupEnd_impl() {
function ncclResult_t (line 137) | ncclResult_t ncclGroupSimulateEnd(ncclSimInfo_t* simInfo) {
type ncclPreconnectJob (line 150) | struct ncclPreconnectJob {
type ncclAsyncJob (line 151) | struct ncclAsyncJob
type ncclComm (line 152) | struct ncclComm
type ncclPrepareTasksAndCollPreconnectJob (line 156) | struct ncclPrepareTasksAndCollPreconnectJob {
type ncclAsyncJob (line 157) | struct ncclAsyncJob
type ncclComm (line 158) | struct ncclComm
function ncclResult_t (line 162) | ncclResult_t ncclP2PPreconnectFunc(struct ncclAsyncJob* job_) {
function ncclResult_t (line 172) | static ncclResult_t ncclCollPreconnect(struct ncclComm* comm, bool* algo...
function ncclResult_t (line 217) | ncclResult_t ncclPrepareTasksAndCollPreconnectFunc(struct ncclAsyncJob* ...
function ncclResult_t (line 230) | ncclResult_t ncclCollPreconnectFunc(struct ncclAsyncJob* job_) {
type ncclGroupSymmetricJob (line 246) | struct ncclGroupSymmetricJob {
type ncclAsyncJob (line 247) | struct ncclAsyncJob
type ncclComm (line 248) | struct ncclComm
function ncclResult_t (line 251) | ncclResult_t ncclCommGroupRegisterSymmetric(struct ncclAsyncJob* job_) {
function ncclResult_t (line 287) | static ncclResult_t doLaunches(struct ncclComm* head) {
function groupLocalResetJobState (line 359) | static inline void groupLocalResetJobState() {
function groupCleanup (line 368) | static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct nccl...
function ncclResult_t (line 429) | static ncclResult_t asyncJobLaunch(struct ncclIntruQueue<struct ncclAsyn...
function ncclResult_t (line 496) | static ncclResult_t ncclPrepareTasksAndCollPreconnect(struct ncclComm* c...
function ncclResult_t (line 539) | static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t...
function ncclResult_t (line 672) | static ncclResult_t groupLaunchNonBlocking(struct ncclAsyncJob *job_) {
function ncclResult_t (line 676) | ncclResult_t ncclGroupEndInternal(ncclSimInfo_t* simInfo) {
function ncclResult_t (line 801) | ncclResult_t ncclGroupJobComplete(struct ncclGroupJob* groupJob) {
function ncclResult_t (line 814) | ncclResult_t ncclGroupJobAbort(struct ncclGroupJob* groupJob) {
FILE: src/include/BfdBacktrace.hpp
type dl_address_search (line 27) | struct dl_address_search {
type backtrace_file (line 33) | struct backtrace_file {
type dl_address_search (line 34) | struct dl_address_search
type backtrace_line (line 39) | struct backtrace_line {
type backtrace (line 48) | struct backtrace {
type backtrace_line (line 49) | struct backtrace_line
type backtrace (line 53) | struct backtrace
type backtrace_line (line 49) | struct backtrace_line
type backtrace_search (line 55) | struct backtrace_search {
type backtrace_file (line 57) | struct backtrace_file
type backtrace_line (line 60) | struct backtrace_line
function dl_match_address (line 79) | static int dl_match_address(struct dl_phdr_info *info, size_t size, void...
function dl_lookup_address (line 99) | static int dl_lookup_address(struct dl_address_search *dl)
function load_file (line 114) | static int load_file(struct backtrace_file *file)
function unload_file (line 154) | static void unload_file(struct backtrace_file *file)
type backtrace_search (line 162) | struct backtrace_search
type backtrace_file (line 57) | struct backtrace_file
type backtrace_line (line 60) | struct backtrace_line
function get_line_info (line 225) | static int get_line_info(struct backtrace_file *file, int backoff,
FILE: src/include/alloc.h
function ncclSizeOfT (line 31) | size_t ncclSizeOfT() { return sizeof(T); }
type ncclSideStream (line 35) | struct ncclSideStream {
function ncclResult_t (line 44) | static inline ncclResult_t ncclCreateSideStream(int cudaDev) {
function ncclResult_t (line 65) | static inline ncclResult_t ncclDestroySideStream(int cudaDev) {
function ncclResult_t (line 89) | static inline ncclResult_t getSideStream(cudaStream_t *stream) {
function ncclResult_t (line 109) | static inline ncclResult_t ncclCuMemHostAlloc(void** ptr, CUmemGenericAl...
function ncclResult_t (line 153) | static inline ncclResult_t ncclCuMemHostFree(void* ptr) {
function ncclResult_t (line 170) | static inline ncclResult_t ncclCuMemHostAlloc(void** ptr, void* handlep,...
function ncclResult_t (line 175) | static inline ncclResult_t ncclCuMemHostFree(void* ptr) {
function ncclResult_t (line 212) | static inline ncclResult_t ncclCudaHostFree(void* ptr) {
type allocationTracker (line 256) | struct __attribute__ ((aligned(64))) allocationTracker {
type allocationTracker (line 265) | struct allocationTracker
type allocationTracker (line 267) | struct allocationTracker
function ncclResult_t (line 274) | static inline ncclResult_t ncclCuMemAllocAddr(void **ptr, CUmemGenericAl...
function ncclResult_t (line 302) | static inline ncclResult_t ncclCuMemFreeAddr(void *ptr) {
function ncclResult_t (line 321) | static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAlloca...
function ncclResult_t (line 364) | static inline ncclResult_t ncclCuMemFree(void *ptr) {
function ncclResult_t (line 392) | static inline ncclResult_t ncclCuMemAlloc(void **ptr, void *handlep, int...
function ncclResult_t (line 396) | static inline ncclResult_t ncclCuMemFree(void *ptr) {
function ncclResult_t (line 401) | static inline ncclResult_t ncclCuMemAllocAddr(void **ptr, CUmemGenericAl...
function ncclResult_t (line 406) | static inline ncclResult_t ncclCuMemFreeAddr(void *ptr) {
function ncclResult_t (line 570) | inline ncclResult_t ncclIbMallocDebug(void** ptr, size_t size, const cha...
FILE: src/include/allocator.h
type ncclSpace (line 15) | struct ncclSpace {
type ncclSpace (line 21) | struct ncclSpace
type ncclSpace (line 22) | struct ncclSpace
type ncclSpace (line 23) | struct ncclSpace
type ncclSpace (line 24) | struct ncclSpace
type ncclShadowObject (line 31) | struct ncclShadowObject
type ncclShadowPage (line 32) | struct ncclShadowPage
type ncclShadowPool (line 33) | struct ncclShadowPool {
type ncclShadowPool (line 40) | struct ncclShadowPool
type ncclShadowPool (line 41) | struct ncclShadowPool
type ncclShadowPool (line 42) | struct ncclShadowPool
type ncclShadowPool (line 43) | struct ncclShadowPool
type ncclShadowPool (line 44) | struct ncclShadowPool
function ncclResult_t (line 47) | inline ncclResult_t ncclShadowPoolAlloc(struct ncclShadowPool* pool, T**...
function ncclResult_t (line 57) | inline ncclResult_t ncclShadowPoolToHost(struct ncclShadowPool* pool, T*...
FILE: src/include/alt_rsmi.h
type ARSMI_IO_LINK_TYPE (line 37) | typedef enum _ARSMI_IO_LINK_TYPE {
type ARSMI_linkInfo (line 45) | struct ARSMI_linkInfo {
type ARSMI_linkInfo (line 54) | typedef struct ARSMI_linkInfo ARSMI_linkInfo;
FILE: src/include/api_trace.h
type ncclResult_t (line 57) | typedef ncclResult_t (*ncclAllGather_fn_t)(const void* sendbuff, void* r...
type ncclResult_t (line 60) | typedef ncclResult_t (*ncclAllReduce_fn_t)(const void* sendbuff, void* r...
type ncclResult_t (line 64) | typedef ncclResult_t (*ncclAllReduceWithBias_fn_t)(const void* sendbuff,...
type ncclResult_t (line 68) | typedef ncclResult_t (*ncclAlltoAll_fn_t)(const void* sendbuff, void* re...
type ncclResult_t (line 71) | typedef ncclResult_t (*ncclAlltoAllv_fn_t)(
type ncclResult_t (line 75) | typedef ncclResult_t (*ncclBroadcast_fn_t)(const void* sendbuff, void* r...
type ncclResult_t (line 79) | typedef ncclResult_t (*ncclGather_fn_t)(const void* sendbuff, void* recv...
type ncclResult_t (line 82) | typedef ncclResult_t (*ncclReduce_fn_t)(const void* sendbuff, void* recv...
type ncclResult_t (line 86) | typedef ncclResult_t (*ncclReduceScatter_fn_t)(const void* sendbuff, voi...
type ncclResult_t (line 90) | typedef ncclResult_t (*ncclScatter_fn_t)(const void* sendbuff, void* rec...
type ncclResult_t (line 93) | typedef ncclResult_t (*ncclSend_fn_t)(const void* sendbuff, size_t count,
type ncclResult_t (line 96) | typedef ncclResult_t (*ncclRecv_fn_t)(void* recvbuff, size_t count,
type ncclResult_t (line 99) | typedef ncclResult_t (*ncclRedOpCreatePreMulSum_fn_t)(ncclRedOp_t* op, v...
type ncclResult_t (line 103) | typedef ncclResult_t (*ncclRedOpDestroy_fn_t)(ncclRedOp_t op, ncclComm_t...
type ncclResult_t (line 104) | typedef ncclResult_t (*ncclGroupStart_fn_t)();
type ncclResult_t (line 105) | typedef ncclResult_t (*ncclGroupEnd_fn_t)();
type ncclResult_t (line 106) | typedef ncclResult_t (*ncclGetVersion_fn_t)(int* version);
type ncclResult_t (line 107) | typedef ncclResult_t (*ncclGetUniqueId_fn_t)(ncclUniqueId* out);
type ncclResult_t (line 109) | typedef ncclResult_t (*ncclCommInitRank_fn_t)(ncclComm_t* newcomm, int n...
type ncclResult_t (line 112) | typedef ncclResult_t (*ncclCommInitAll_fn_t)(ncclComm_t* comms, int ndev,
type ncclResult_t (line 115) | typedef ncclResult_t (*ncclCommInitRankConfig_fn_t)(ncclComm_t* comm, in...
type ncclResult_t (line 119) | typedef ncclResult_t (*ncclCommFinalize_fn_t)(ncclComm_t comm);
type ncclResult_t (line 121) | typedef ncclResult_t (*ncclCommDestroy_fn_t)(ncclComm_t comm);
type ncclResult_t (line 123) | typedef ncclResult_t (*ncclCommAbort_fn_t)(ncclComm_t comm);
type ncclResult_t (line 125) | typedef ncclResult_t (*ncclCommShrink_fn_t)(ncclComm_t comm, int* exclud...
type ncclResult_t (line 129) | typedef ncclResult_t (*ncclCommSplit_fn_t)(ncclComm_t comm, int color, i...
type ncclResult_t (line 136) | typedef ncclResult_t (*ncclCommGetAsyncError_fn_t)(ncclComm_t comm,
type ncclResult_t (line 139) | typedef ncclResult_t (*ncclCommCount_fn_t)(const ncclComm_t comm, int* c...
type ncclResult_t (line 141) | typedef ncclResult_t (*ncclCommCuDevice_fn_t)(const ncclComm_t comm, int...
type ncclResult_t (line 143) | typedef ncclResult_t (*ncclCommUserRank_fn_t)(const ncclComm_t comm, int...
type ncclResult_t (line 145) | typedef ncclResult_t (*ncclMemAlloc_fn_t)(void** ptr, size_t size);
type ncclResult_t (line 147) | typedef ncclResult_t (*ncclMemFree_fn_t)(void* ptr);
type ncclResult_t (line 149) | typedef ncclResult_t (*mscclLoadAlgo_fn_t)(const char* mscclAlgoF...
type ncclResult_t (line 152) | typedef ncclResult_t (*mscclRunAlgo_fn_t)(
type ncclResult_t (line 158) | typedef ncclResult_t (*mscclUnloadAlgo_fn_t)(mscclAlgoHandle_t mscclAlgo...
type ncclResult_t (line 160) | typedef ncclResult_t (*ncclCommRegister_fn_t)(const ncclComm_t comm, voi...
type ncclResult_t (line 163) | typedef ncclResult_t (*ncclCommDeregister_fn_t)(const ncclComm_t comm, v...
type ncclResult_t (line 165) | typedef ncclResult_t (*ncclCommWindowRegister_fn_t)(ncclComm_t comm, voi...
type ncclResult_t (line 167) | typedef ncclResult_t (*ncclCommWindowDeregister_fn_t)(ncclComm_t comm, n...
type rcclApiFuncTable (line 169) | typedef struct rcclApiFuncTable
FILE: src/include/argcheck.h
type ncclComm (line 14) | struct ncclComm
type ncclInfo (line 15) | struct ncclInfo
type ncclComm (line 16) | struct ncclComm
FILE: src/include/bitops.h
function Int (line 23) | Int minval(Int a) { return a; }
function Int (line 25) | Int minval(Int a, Int b, More ...more) {
function Int (line 34) | Int maxval(Int a) { return a; }
function Int (line 36) | Int maxval(Int a, Int b, More ...more) {
function Z (line 60) | constexpr Z divUp(X x, Y y) {
function Z (line 65) | constexpr Z roundUp(X x, Y y) {
function Z (line 69) | constexpr Z roundDown(X x, Y y) {
function Z (line 75) | constexpr Z alignUp(X x, Y a) {
function T (line 79) | T* alignUp(T* x, size_t a) {
function Z (line 86) | constexpr Z alignDown(X x, Y a) {
function T (line 91) | T* alignDown(T* x, size_t a) {
function isPow2 (line 97) | bool isPow2(Int x) {
function T (line 102) | T add4G(T base, int delta4G) {
function T (line 110) | T incWrap4G(T ptr, uint32_t delta4G, uint32_t lo4G, uint32_t hi4G) {
function T (line 119) | T decWrap4G(T ptr, uint32_t delta4G, uint32_t lo4G, uint32_t hi4G) {
function idivRcp32 (line 128) | uint32_t idivRcp32(uint32_t x) {
function idivRcp64 (line 131) | uint64_t idivRcp64(uint64_t x) {
function mul32hi (line 135) | uint32_t mul32hi(uint32_t a, uint32_t b) {
function mul64hi (line 142) | uint64_t mul64hi(uint64_t a, uint64_t b) {
function imulRcp32 (line 152) | uint32_t imulRcp32(uint32_t x, uint32_t xrcp, uint32_t y, uint32_t yrcp) {
function imulRcp64 (line 160) | uint64_t imulRcp64(uint64_t x, uint64_t xrcp, uint64_t y, uint64_t yrcp) {
function idivmodFast32 (line 171) | void idivmodFast32(uint32_t *quo, uint32_t *rem, uint32_t x, uint32_t y,...
function idivmodFast64 (line 181) | void idivmodFast64(uint64_t *quo, uint64_t *rem, uint64_t x, uint64_t y,...
function idivFast32 (line 192) | uint32_t idivFast32(uint32_t x, uint32_t y, uint32_t yrcp) {
function idivFast64 (line 197) | uint32_t idivFast64(uint64_t x, uint64_t y, uint64_t yrcp) {
function imodFast32 (line 203) | uint32_t imodFast32(uint32_t x, uint32_t y, uint32_t yrcp) {
function imodFast64 (line 208) | uint32_t imodFast64(uint64_t x, uint64_t y, uint64_t yrcp) {
function countOneBits (line 215) | int countOneBits(Int x) {
function firstOneBit (line 241) | int firstOneBit(Int mask) {
function popFirstOneBit (line 266) | int popFirstOneBit(Int* mask) {
function log2Down (line 273) | int log2Down(Int x) {
function log2Up (line 305) | int log2Up(Int x) {
function Int (line 338) | Int pow2Up(Int x) {
function Int (line 343) | Int pow2Down(Int x) {
function UInt (line 350) | UInt reverseSubBits(UInt x) {
function char (line 369) | struct ncclToUnsigned<char> { using type = unsigned char; }
function signed (line 370) | struct ncclToUnsigned<signed char> { using type = unsigned char; }
function unsigned (line 371) | struct ncclToUnsigned<unsigned char> { using type = unsigned char; }
function signed (line 372) | struct ncclToUnsigned<signed short> { using type = unsigned short; }
function unsigned (line 373) | struct ncclToUnsigned<unsigned short> { using type = unsigned short; }
function signed (line 374) | struct ncclToUnsigned<signed int> { using type = unsigned int; }
function unsigned (line 375) | struct ncclToUnsigned<unsigned int> { using type = unsigned int; }
function signed (line 376) | struct ncclToUnsigned<signed long> { using type = unsigned long; }
function unsigned (line 377) | struct ncclToUnsigned<unsigned long> { using type = unsigned long; }
type ncclToUnsigned (line 378) | struct ncclToUnsigned
type ncclToUnsigned (line 379) | struct ncclToUnsigned
function Int (line 383) | Int reverseBits(Int x, int nBits) {
function u32fpEncode (line 407) | uint32_t u32fpEncode(uint32_t x, int bitsPerPow2) {
function u32fpDecode (line 419) | uint32_t u32fpDecode(uint32_t x, int bitsPerPow2) {
function u32fp8MaxValue (line 426) | constexpr uint32_t u32fp8MaxValue() { return 0xf0000000; }
function u32fp8Encode (line 428) | uint8_t u32fp8Encode(uint32_t x) {
function u32fp8Decode (line 431) | uint32_t u32fp8Decode(uint8_t x) {
function eatHash (line 437) | void eatHash(uint64_t acc[2], const void* bytes, size_t size) {
function eatHash (line 460) | void eatHash(uint64_t acc[2], const T* bytes) {
function digestHash (line 464) | uint64_t digestHash(uint64_t const acc[2]) {
function getHash (line 474) | uint64_t getHash(const void* bytes, size_t size) {
function getHash (line 480) | uint64_t getHash(const T* bytes) {
FILE: src/include/bootstrap.h
type ncclBootstrapHandle (line 13) | struct ncclBootstrapHandle {
type ncclBootstrapHandle (line 17) | struct ncclBootstrapHandle
type ncclBootstrapHandle (line 20) | struct ncclBootstrapHandle
type ncclBootstrapHandle (line 21) | struct ncclBootstrapHandle
type ncclComm (line 22) | struct ncclComm
type ncclComm (line 23) | struct ncclComm
type ncclComm (line 23) | struct ncclComm
FILE: src/include/ce_coll.h
type ncclCeColl (line 18) | struct ncclCeColl {
type ncclCeInitTask (line 30) | struct ncclCeInitTask {
type alignas (line 35) | struct alignas
type ncclDevrWindow (line 42) | struct ncclDevrWindow
type ncclDevrWindow (line 43) | struct ncclDevrWindow
type ncclCeBatchOpsParams (line 46) | struct ncclCeBatchOpsParams {
type ncclComm (line 61) | struct ncclComm
type ncclComm (line 63) | struct ncclComm
type ncclComm (line 65) | struct ncclComm
type ncclComm (line 67) | struct ncclComm
type ncclKernelPlan (line 67) | struct ncclKernelPlan
type ncclComm (line 69) | struct ncclComm
type ncclCeCollArgs (line 69) | struct ncclCeCollArgs
type ncclComm (line 71) | struct ncclComm
type ncclCeCollArgs (line 71) | struct ncclCeCollArgs
type ncclComm (line 73) | struct ncclComm
type ncclCeCollArgs (line 73) | struct ncclCeCollArgs
type ncclComm (line 75) | struct ncclComm
type ncclCeCollArgs (line 75) | struct ncclCeCollArgs
FILE: src/include/channel.h
type ncclComm (line 14) | struct ncclComm
type ncclComm (line 15) | struct ncclComm
type ncclComm (line 15) | struct ncclComm
type ncclComm (line 16) | struct ncclComm
type ncclComm (line 16) | struct ncclComm
type ncclChannel (line 17) | struct ncclChannel
type ncclComm (line 19) | struct ncclComm
FILE: src/include/coll_net.h
type ncclComm (line 16) | struct ncclComm
function ncclResult_t (line 17) | static ncclResult_t collNetDevices(struct ncclComm* comm, int* ndev) { N...
function ncclResult_t (line 18) | static ncclResult_t collNetGetProperties(struct ncclComm* comm, int dev,...
function ncclResult_t (line 19) | static ncclResult_t collNetListen(struct ncclComm* comm, int dev, void* ...
function ncclResult_t (line 20) | static ncclResult_t collNetConnect(struct ncclComm* comm, void* handles[...
function ncclResult_t (line 21) | static ncclResult_t collNetReduceSupport(struct ncclComm* comm, ncclData...
function ncclResult_t (line 22) | static ncclResult_t collNetRegMr(struct ncclComm* comm, void* collComm, ...
function ncclResult_t (line 24) | static ncclResult_t collNetRegMrDmaBuf(struct ncclComm* comm, void* coll...
function ncclResult_t (line 25) | static ncclResult_t collNetDeregMr(struct ncclComm* comm, void* collComm...
function ncclResult_t (line 26) | static ncclResult_t collNetIallreduce(struct ncclComm* comm, void* collC...
function ncclResult_t (line 28) | static ncclResult_t collNetIflush(struct ncclComm* comm, void* collComm,...
function ncclResult_t (line 29) | static ncclResult_t collNetTest(struct ncclComm* comm, void* request, in...
function ncclResult_t (line 30) | static ncclResult_t collNetCloseColl(struct ncclComm* comm, void* collCo...
function ncclResult_t (line 31) | static ncclResult_t collNetCloseListen(struct ncclComm* comm, void* list...
function ncclResult_t (line 32) | static ncclResult_t collNetFinalize(struct ncclComm* comm, void* ctx) { ...
function collNetSupport (line 34) | static int collNetSupport(struct ncclComm* comm) { return comm->ncclColl...
FILE: src/include/collectives.h
function ncclTypeSize (line 56) | inline int ncclTypeSize(ncclDataType_t type) {
type ncclConnFifo (line 84) | struct ncclConnFifo {
function class (line 93) | class RingAlgorithm {
function class (line 127) | class RingARAlgorithm : public RingAlgorithm {
function class (line 242) | class RingAGAlgorithm : public RingAlgorithm {
function class (line 332) | class RingBCAlgorithm : public RingAlgorithm {
type ncclPatStep (line 416) | struct ncclPatStep {
type ncclPatPeer (line 421) | struct ncclPatPeer {
type ncclPatShmem (line 434) | struct ncclPatShmem {
function min (line 464) | ssize_t min(ssize_t a, ssize_t b) {
function getNelem (line 468) | int getNelem() {
function mirrorInvert (line 472) | int mirrorInvert(int i, int max) {
function firstBitSet (line 480) | int firstBitSet(int i, int max) {
function resetA (line 490) | void resetA() {
function reset (line 498) | void reset() {
function nBitsSet (line 506) | int nBitsSet(int i) {
function newPeer (line 518) | int newPeer(int i, int pow2) {
function getParallelFactor (line 547) | int getParallelFactor() {
function getNextOp (line 551) | void getNextOp(struct ncclPatStep* ps) {
function min (line 721) | ssize_t min(ssize_t a, ssize_t b) {
function getNelem (line 725) | int getNelem() {
function mirror (line 729) | int mirror(int i, int max) {
function firstBitSet (line 737) | int firstBitSet(int i, int max) {
function resetA (line 747) | void resetA() {
function reset (line 753) | void reset() {
function nextAs (line 766) | int nextAs() {
function getParallelFactor (line 812) | int getParallelFactor() {
function getNextOp (line 816) | void getNextOp(struct ncclPatStep* ps) {
FILE: src/include/comm.h
type cudaLaunchParams (line 38) | struct cudaLaunchParams {
type ncclSendMem (line 58) | struct ncclSendMem {
type ncclRecvMem (line 72) | struct ncclRecvMem {
type helperThreadState (line 84) | enum helperThreadState {ThreadStart, ThreadStop}
type ncclGraphHelperResources (line 88) | struct ncclGraphHelperResources {
type ncclUserRedOp (line 98) | struct ncclUserRedOp {
type ncclNodeRanks (line 104) | struct ncclNodeRanks {
type cliqueInfo (line 109) | struct cliqueInfo {
type ncclDestructor (line 115) | struct ncclDestructor {
type ncclCommCallback (line 121) | struct ncclCommCallback {
type ncclCommEventCallback (line 125) | struct ncclCommEventCallback {
type ncclSharedResources (line 131) | struct ncclSharedResources {
type ncclChannel (line 158) | struct ncclChannel {
type ncclWorkBatchList (line 182) | struct ncclWorkBatchList {
type alignas (line 186) | struct alignas
type ncclWorkList (line 187) | struct ncclWorkList
type ncclDevWorkType (line 188) | enum ncclDevWorkType
type ncclCollnetHandleList (line 193) | struct ncclCollnetHandleList {
type ncclTaskColl (line 201) | struct ncclTaskColl {
type ncclTaskP2p (line 251) | struct ncclTaskP2p {
type ncclKernelPlan (line 270) | struct ncclKernelPlan {
type ncclTaskCollSorter (line 317) | struct ncclTaskCollSorter {
function ncclTaskCollSorterInsert (line 337) | inline void ncclTaskCollSorterInsert(
function ncclTaskCollSorterEmpty (line 368) | inline bool ncclTaskCollSorterEmpty(struct ncclTaskCollSorter* me) {
type ncclTaskColl (line 373) | struct ncclTaskColl
type ncclTaskCollSorter (line 373) | struct ncclTaskCollSorter
type ncclTaskColl (line 374) | struct ncclTaskColl
type ncclCudaStreamList (line 381) | struct ncclCudaStreamList {
type ncclKernelPlanner (line 386) | struct ncclKernelPlanner {
type ncclPeerInfo (line 453) | struct ncclPeerInfo {
type ncclGroupTaskType_t (line 472) | typedef enum ncclGroupTaskType {
type ncclCommSymTeams (line 478) | struct ncclCommSymTeams
type ncclComm (line 480) | struct ncclComm {
type ncclComm (line 769) | struct ncclComm
type ncclComm (line 770) | struct ncclComm
type ncclComm (line 770) | struct ncclComm
type ncclLaunchMode (line 772) | enum ncclLaunchMode {
type ncclLaunchMode (line 777) | enum ncclLaunchMode
type ncclComm (line 779) | struct ncclComm
type ncclComm (line 780) | struct ncclComm
type ncclComm (line 781) | struct ncclComm
type ncclComm (line 782) | struct ncclComm
function ncclResult_t (line 784) | inline ncclResult_t ncclCommPollCallbacks(struct ncclComm* comm, bool wa...
function ncclResult_t (line 797) | inline ncclResult_t ncclCommPollEventCallbacks(struct ncclComm *comm, bo...
function ncclCommIntraBarrierIn (line 824) | inline void ncclCommIntraBarrierIn(struct ncclComm* comm, uint32_t x) {
function ncclCommIntraBarrierOut (line 842) | inline uint32_t ncclCommIntraBarrierOut(struct ncclComm* comm) {
function ncclRedOp_t (line 863) | static inline ncclRedOp_t ncclUserRedOpMangle(ncclComm *comm, ncclRedOp_...
FILE: src/include/cpuset.h
function ncclResult_t (line 21) | static ncclResult_t ncclStrToCpuset(const char* maskStr, cpu_set_t* set) {
function ncclResult_t (line 70) | static ncclResult_t ncclStrListToCpuset(const char* userStr, cpu_set_t* ...
function ncclResult_t (line 85) | static ncclResult_t ncclCpusetToStrList(cpu_set_t* mask, char* str, size...
FILE: src/include/cudawrap.h
function ncclResult_t (line 130) | inline ncclResult_t ncclCudaDriverVersion(int* driver) {
FILE: src/include/debug.h
function ncclResult_t (line 46) | static inline ncclResult_t rcclCudaErrorHandler(cudaError_t err) {
FILE: src/include/dev_runtime.h
type ncclDevrMemory (line 19) | struct ncclDevrMemory
type ncclDevrWindow (line 20) | struct ncclDevrWindow {
type ncclDevrWindowSorted (line 29) | struct ncclDevrWindowSorted
type ncclDevrTeam (line 30) | struct ncclDevrTeam
type ncclDevrRegTask (line 32) | struct ncclDevrRegTask {
type ncclDevrCommCreateTask (line 40) | struct ncclDevrCommCreateTask {
type ncclDevrState (line 46) | struct ncclDevrState {
type ncclComm (line 70) | struct ncclComm
type ncclComm (line 71) | struct ncclComm
type ncclComm (line 74) | struct ncclComm
type ncclDevrWindow (line 74) | struct ncclDevrWindow
type ncclComm (line 77) | struct ncclComm
type ncclComm (line 81) | struct ncclComm
type ncclDevCommRequirements (line 81) | struct ncclDevCommRequirements
type ncclDevComm (line 81) | struct ncclDevComm
type ncclDevCommRequirements (line 84) | struct ncclDevCommRequirements
type ncclComm (line 88) | struct ncclComm
type ncclDevrWindow (line 88) | struct ncclDevrWindow
type ncclComm (line 91) | struct ncclComm
type ncclDevrWindow (line 91) | struct ncclDevrWindow
type ncclTeam (line 91) | struct ncclTeam
FILE: src/include/device.h
type __hip_bfloat16 (line 21) | typedef __hip_bfloat16 hip_bfloat16;
type ncclDevRedOp_t (line 87) | enum ncclDevRedOp_t {
type ncclDevRedOpFull (line 92) | struct ncclDevRedOpFull {
type ncclConnInfo (line 193) | struct ncclConnInfo {
type ncclProxyConnector (line 219) | struct ncclProxyConnector {
type ncclConnector (line 229) | struct ncclConnector {
type ncclRing (line 239) | struct ncclRing {
type ncclTree (line 257) | struct ncclTree {
type ncclDirect (line 264) | struct ncclDirect {
type ncclNvls (line 280) | struct ncclNvls {
type ncclChannelPeer (line 297) | struct ncclChannelPeer {
type ncclKernelComm (line 303) | struct ncclKernelComm
type alignas (line 308) | struct alignas
function ncclP2pPartBounds (line 332) | void ncclP2pPartBounds(int nParts, int part, size_t bytes, size_t* partB...
type ncclComm (line 344) | struct ncclComm
function __host__ (line 348) | inline __host__ int ncclP2pChannelForPart(int nP2pChannels, int base, in...
function __device__ (line 358) | inline __device__ int ncclP2pChannelToPart(int nP2pChannels, int base, i...
type alignas (line 369) | struct alignas
function __device__ (line 415) | __device__ constexpr int ncclProtoGrainSize(int proto) {
function ncclCollCbdPart (line 423) | inline void ncclCollCbdPart(
type alignas (line 450) | struct alignas
type ncclDevWorkColl (line 451) | struct ncclDevWorkColl
type ncclDevWorkType (line 457) | enum ncclDevWorkType: uint8_t {
function ncclDevWorkSize (line 463) | constexpr size_t ncclDevWorkSize(enum ncclDevWorkType type) {
type alignas (line 472) | struct alignas
type ncclDevChannelPeer (line 492) | struct ncclDevChannelPeer {
type ncclProf (line 505) | struct ncclProf {
type ncclProf (line 513) | struct ncclProf
type ncclCollTraceDataType_t (line 517) | typedef enum {
type ncclCollTrace (line 528) | struct ncclCollTrace {
type ncclCollTrace (line 564) | struct ncclCollTrace
type alignas (line 574) | struct alignas
type ncclDevChannelPeer (line 575) | struct ncclDevChannelPeer
type ncclRing (line 576) | struct ncclRing
type ncclTree (line 577) | struct ncclTree
type ncclTree (line 578) | struct ncclTree
type ncclDirect (line 579) | struct ncclDirect
type ncclTree (line 580) | struct ncclTree
type ncclNvls (line 581) | struct ncclNvls
type ncclDevProfiler (line 587) | struct ncclDevProfiler {
type ncclKernelComm (line 594) | struct ncclKernelComm {
type alignas (line 642) | struct alignas
type ncclKernelComm (line 643) | struct ncclKernelComm
type ncclDevChannel (line 644) | struct ncclDevChannel
type ncclDevWorkStorageType (line 647) | enum ncclDevWorkStorageType: uint8_t {
type channelMasks (line 653) | struct channelMasks {
type alignas (line 657) | struct alignas
type ncclKernelComm (line 658) | struct ncclKernelComm
type channelMasks (line 659) | struct channelMasks
type ncclDevWorkStorageType (line 660) | enum ncclDevWorkStorageType
function ncclDevKernelArgsStorage (line 668) | alignas(16) ncclDevKernelArgsStorage {
type ncclDevKernelArgs5K (line 684) | typedef ncclDevKernelArgs5K ncclDevKernelArgsDefaultStorage;
type ncclDevKernelArgs4K (line 686) | typedef ncclDevKernelArgs4K ncclDevKernelArgsDefaultStorage;
function ncclMaxKernelArgsSize (line 688) | constexpr int ncclMaxKernelArgsSize(/*int cudaDriver, */int cudaArch=NCC...
function T (line 694) | constexpr T min_constexpr(T a) { return a; }
function T (line 696) | constexpr T min_constexpr(T a, T b, Ts ...c) {
function T (line 701) | constexpr T max_constexpr(T a) { return a; }
function T (line 703) | constexpr T max_constexpr(T a, T b, Ts ...c) {
function ncclDevMaxChannelsForArgsBytes (line 707) | constexpr int ncclDevMaxChannelsForArgsBytes(size_t argsBytes) {
function ncclCalcUnroll (line 715) | constexpr int ncclCalcUnroll(int bytePerPack, int insns, int bytes) {
function ncclCollUnroll (line 724) | constexpr int ncclCollUnroll(int cudaArch = NCCL_CUDA_ARCH) {
function ncclNvlsUnrollBytes (line 729) | constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { retur...
function ncclNvlsUnrollInsns (line 730) | constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { retur...
function ncclNvlsUnroll (line 732) | constexpr int ncclNvlsUnroll(int bytePerPack, int cudaArch = NCCL_CUDA_A...
function __device__ (line 737) | __device__ constexpr int ncclShmemScratchWarpSize(int cudaArch = NCCL_CU...
function __device__ (line 751) | __device__ constexpr int ncclShmemDynamicSize(int cudaArch = NCCL_CUDA_A...
type ncclDevRedOpFull (line 766) | struct ncclDevRedOpFull
function ncclNvlsSupported (line 769) | inline bool ncclNvlsSupported(int devRedOp, int type) {
function ncclDevFuncId_P2p (line 821) | inline int ncclDevFuncId_P2p() {
FILE: src/include/enqueue.h
type ncclTaskColl (line 20) | struct ncclTaskColl
type ncclInfo (line 23) | struct ncclInfo
type ncclComm (line 24) | struct ncclComm
type ncclComm (line 25) | struct ncclComm
type ncclKernelPlan (line 25) | struct ncclKernelPlan
type ncclComm (line 26) | struct ncclComm
type ncclKernelPlan (line 26) | struct ncclKernelPlan
type ncclComm (line 27) | struct ncclComm
type ncclKernelPlan (line 27) | struct ncclKernelPlan
type ncclComm (line 28) | struct ncclComm
type ncclComm (line 29) | struct ncclComm
type ncclComm (line 30) | struct ncclComm
function ncclFuncSendCount (line 32) | static inline size_t ncclFuncSendCount(ncclFunc_t func, int nRanks, size...
function ncclFuncRecvCount (line 35) | static inline size_t ncclFuncRecvCount(ncclFunc_t func, int nRanks, size...
function rccl_static (line 38) | rccl_static inline size_t ncclFuncMaxSendRecvCount(ncclFunc_t func, int ...
FILE: src/include/gdrwrap.h
function wc_store_fence (line 42) | static inline void wc_store_fence(void) { asm volatile("sync") ; }
function wc_store_fence (line 45) | static inline void wc_store_fence(void) { _mm_sfence(); }
function wc_store_fence (line 49) | static inline void wc_store_fence(void) { std::atomic_thread_fence(std::...
function wc_store_fence (line 52) | static inline void wc_store_fence(void) { atomic_thread_fence(memory_ord...
function ncclResult_t (line 63) | static ncclResult_t wrap_gdr_symbols(void) { return ncclSuccess; }
function gdr_t (line 64) | static gdr_t wrap_gdr_open(void) { gdr_t g = gdr_open(); return g; }
function ncclResult_t (line 65) | static ncclResult_t wrap_gdr_close(gdr_t g) { GDRCHECK(gdr_close(g)); re...
function ncclResult_t (line 66) | static ncclResult_t wrap_gdr_pin_buffer(gdr_t g, unsigned long addr, siz...
function ncclResult_t (line 70) | static ncclResult_t wrap_gdr_unpin_buffer(gdr_t g, gdr_mh_t handle) {
function ncclResult_t (line 74) | static ncclResult_t wrap_gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info...
function ncclResult_t (line 78) | static ncclResult_t wrap_gdr_map(gdr_t g, gdr_mh_t handle, void **va, si...
function ncclResult_t (line 82) | static ncclResult_t wrap_gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, s...
function wrap_gdr_runtime_get_version (line 86) | static void wrap_gdr_runtime_get_version(int *major, int *minor) {
function wrap_gdr_driver_get_version (line 90) | static void wrap_gdr_driver_get_version(gdr_t g, int *major, int *minor) {
function ncclResult_t (line 94) | static ncclResult_t wrap_gdr_copy_to_mapping(gdr_mh_t handle, void *map_...
function ncclResult_t (line 98) | static ncclResult_t wrap_gdr_copy_from_mapping(gdr_mh_t handle, void *h_...
type gdr (line 113) | struct gdr
type gdr (line 114) | struct gdr
type gdr_mh_t (line 116) | typedef struct gdr_mh_s {
type gdr_info (line 120) | struct gdr_info {
type gdr_info_t (line 129) | typedef struct gdr_info gdr_info_t;
type gdr_mem_desc_t (line 154) | typedef struct gdr_mem_desc {
function gdr_t (line 163) | static gdr_t ncclGdrInit() {
function ncclResult_t (line 183) | ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void**...
function ncclResult_t (line 219) | ncclResult_t ncclGdrCudaCopy(void *gdrHandle, T* dst, T* src, size_t nel...
function ncclResult_t (line 225) | static ncclResult_t ncclGdrCudaFree(void* gdrHandle) {
function gdr_t (line 233) | static gdr_t ncclGdrInit() {
function ncclResult_t (line 264) | ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void**...
function ncclResult_t (line 310) | ncclResult_t ncclGdrCudaCopy(void *gdrHandle, T* dst, T* src, size_t nel...
function ncclResult_t (line 316) | static ncclResult_t ncclGdrCudaFree(void* gdrHandle) {
FILE: src/include/graph.h
type ncclTopoSystem (line 21) | struct ncclTopoSystem
type ncclComm (line 23) | struct ncclComm
type ncclTopoSystem (line 23) | struct ncclTopoSystem
type ncclTopoSystem (line 24) | struct ncclTopoSystem
type ncclTopoSystem (line 25) | struct ncclTopoSystem
type ncclTopoSystem (line 27) | struct ncclTopoSystem
type ncclComm (line 27) | struct ncclComm
type ncclTopoSystem (line 28) | struct ncclTopoSystem
type ncclTopoSystem (line 29) | struct ncclTopoSystem
type ncclComm (line 29) | struct ncclComm
type ncclComm (line 30) | struct ncclComm
type ncclTopoSystem (line 31) | struct ncclTopoSystem
type ncclTopoSystem (line 32) | struct ncclTopoSystem
type ncclComm (line 34) | struct ncclComm
type ncclComm (line 37) | struct ncclComm
type ncclTopoGraph (line 37) | struct ncclTopoGraph
type ncclComm (line 38) | struct ncclComm
type ncclTopoSystem (line 38) | struct ncclTopoSystem
type ncclTopoSystem (line 39) | struct ncclTopoSystem
type ncclPeerInfo (line 39) | struct ncclPeerInfo
type ncclPeerInfo (line 39) | struct ncclPeerInfo
type ncclTopoGdrMode (line 40) | enum ncclTopoGdrMode {
type ncclTopoSystem (line 46) | struct ncclTopoSystem
type ncclTopoGdrMode (line 46) | enum ncclTopoGdrMode
type ncclComm (line 47) | struct ncclComm
type ncclTopoSystem (line 48) | struct ncclTopoSystem
type ncclTopoSystem (line 49) | struct ncclTopoSystem
type ncclComm (line 50) | struct ncclComm
type ncclComm (line 51) | struct ncclComm
type ncclTopoSystem (line 52) | struct ncclTopoSystem
type ncclTopoSystem (line 57) | struct ncclTopoSystem
type ncclTopoGraph (line 57) | struct ncclTopoGraph
type ncclTopoSystem (line 58) | struct ncclTopoSystem
type ncclTopoSystem (line 61) | struct ncclTopoSystem
type ncclTopoSystem (line 78) | struct ncclTopoSystem
type ncclTopoSystem (line 79) | struct ncclTopoSystem
type ncclTopoSystem (line 80) | struct ncclTopoSystem
type ncclTopoSystem (line 81) | struct ncclTopoSystem
type ncclTopoSystem (line 82) | struct ncclTopoSystem
type ncclTopoSystem (line 83) | struct ncclTopoSystem
type ncclTopoSystem (line 84) | struct ncclTopoSystem
type ncclTopoSystem (line 88) | struct ncclTopoSystem
type ncclTopoSystem (line 91) | struct ncclTopoSystem
type ncclTopoGraph (line 99) | struct ncclTopoGraph {
type ncclTopoSystem (line 122) | struct ncclTopoSystem
type ncclTopoGraph (line 122) | struct ncclTopoGraph
type ncclTopoSystem (line 124) | struct ncclTopoSystem
type ncclTopoGraph (line 124) | struct ncclTopoGraph
type ncclTopoSystem (line 125) | struct ncclTopoSystem
type ncclTopoGraph (line 125) | struct ncclTopoGraph
type ncclTopoRanks (line 127) | struct ncclTopoRanks {
type ncclComm (line 140) | struct ncclComm
type ncclTopoGraph (line 140) | struct ncclTopoGraph
type ncclTopoRanks (line 140) | struct ncclTopoRanks
type ncclComm (line 142) | struct ncclComm
type ncclTopoRanks (line 143) | struct ncclTopoRanks
type ncclTopoGraph (line 143) | struct ncclTopoGraph
type ncclComm (line 143) | struct ncclComm
type ncclComm (line 144) | struct ncclComm
type ncclTopoGraph (line 144) | struct ncclTopoGraph
type ncclComm (line 146) | struct ncclComm
type ncclComm (line 147) | struct ncclComm
type ncclTopoGraph (line 147) | struct ncclTopoGraph
type ncclComm (line 148) | struct ncclComm
FILE: src/include/group.h
type ncclComm (line 17) | struct ncclComm
type ncclComm (line 18) | struct ncclComm
type ncclComm (line 19) | struct ncclComm
type ncclGroupJob (line 20) | struct ncclGroupJob
type ncclGroupJob (line 21) | struct ncclGroupJob
type ncclResult_t (line 23) | typedef ncclResult_t(*ncclInitFunc_t)(ncclComm_t* newcomm, int ndev, ncc...
type ncclGroupJobState_t (line 27) | typedef enum ncclGroupJobState {
type ncclAsyncJob (line 33) | struct ncclAsyncJob {
type ncclAsyncJob (line 51) | struct ncclAsyncJob
type ncclAsyncJob (line 52) | struct ncclAsyncJob
type ncclAsyncJob (line 53) | struct ncclAsyncJob
type ncclGroupJob (line 57) | struct ncclGroupJob {
type ncclAsyncJob (line 71) | struct ncclAsyncJob
type ncclComm (line 77) | struct ncclComm
type ncclComm (line 78) | struct ncclComm
function ncclResult_t (line 81) | inline ncclResult_t ncclGroupErrCheck(ncclResult_t ret) {
function ncclGroupCommJoin (line 89) | inline void ncclGroupCommJoin(struct ncclComm* comm, int type) {
function ncclGroupCommPreconnect (line 119) | inline void ncclGroupCommPreconnect(struct ncclComm* comm) {
function ncclResult_t (line 127) | inline ncclResult_t ncclGroupCommLeave(struct ncclComm* comm, int type) {
FILE: src/include/ibvcore.h
type ibv_node_type (line 45) | enum ibv_node_type {
type ibv_transport_type (line 59) | enum ibv_transport_type {
type ibv_device_cap_flags (line 71) | enum ibv_device_cap_flags {
type ibv_atomic_cap (line 91) | enum ibv_atomic_cap {
type ibv_device_attr (line 97) | struct ibv_device_attr {
type ibv_mtu (line 140) | enum ibv_mtu {
type ibv_port_state (line 148) | enum ibv_port_state {
type ibv_port_cap_flags (line 169) | enum ibv_port_cap_flags {
type ibv_port_attr (line 191) | struct ibv_port_attr {
type ibv_event_type (line 217) | enum ibv_event_type {
type ibv_async_event (line 246) | struct ibv_async_event {
type ibv_wc_status (line 259) | enum ibv_wc_status {
type ibv_wc_status (line 283) | enum ibv_wc_status
type ibv_wc_opcode (line 285) | enum ibv_wc_opcode {
type ibv_wc_flags (line 300) | enum ibv_wc_flags {
type ibv_wc (line 305) | struct ibv_wc {
type ibv_access_flags (line 321) | enum ibv_access_flags {
type ibv_pd (line 330) | struct ibv_pd {
type ibv_xrcd_init_attr_mask (line 335) | enum ibv_xrcd_init_attr_mask {
type ibv_xrcd_init_attr (line 341) | struct ibv_xrcd_init_attr {
type ibv_xrcd (line 347) | struct ibv_xrcd {
type ibv_rereg_mr_flags (line 351) | enum ibv_rereg_mr_flags {
type ibv_mr (line 358) | struct ibv_mr {
type ibv_mw_type (line 368) | enum ibv_mw_type {
type ibv_mw (line 373) | struct ibv_mw {
type ibv_global_route (line 379) | struct ibv_global_route {
type ibv_grh (line 387) | struct ibv_grh {
type ibv_rate (line 396) | enum ibv_rate {
type ibv_rate (line 423) | enum ibv_rate
type ibv_rate (line 429) | enum ibv_rate
type ibv_rate (line 436) | enum ibv_rate
type ibv_rate (line 442) | enum ibv_rate
type ibv_ah_attr (line 444) | struct ibv_ah_attr {
type ibv_srq_attr_mask (line 454) | enum ibv_srq_attr_mask {
type ibv_srq_attr (line 459) | struct ibv_srq_attr {
type ibv_srq_init_attr (line 465) | struct ibv_srq_init_attr {
type ibv_srq_type (line 470) | enum ibv_srq_type {
type ibv_srq_init_attr_mask (line 475) | enum ibv_srq_init_attr_mask {
type ibv_srq_init_attr_ex (line 483) | struct ibv_srq_init_attr_ex {
type ibv_qp_type (line 494) | enum ibv_qp_type {
type ibv_qp_cap (line 512) | struct ibv_qp_cap {
type ibv_qp_init_attr (line 520) | struct ibv_qp_init_attr {
type ibv_qp_init_attr_mask (line 532) | enum ibv_qp_init_attr_mask {
type ibv_qp_init_attr_ex (line 538) | struct ibv_qp_init_attr_ex {
type ibv_qp_open_attr_mask (line 552) | enum ibv_qp_open_attr_mask {
type ibv_qp_open_attr (line 560) | struct ibv_qp_open_attr {
type ibv_qp_attr_mask (line 568) | enum ibv_qp_attr_mask {
type ibv_qp_state (line 592) | enum ibv_qp_state {
type ibv_mig_state (line 603) | enum ibv_mig_state {
type ibv_qp_attr (line 609) | struct ibv_qp_attr {
type ibv_wr_opcode (line 637) | enum ibv_wr_opcode {
type ibv_send_flags (line 647) | enum ibv_send_flags {
type ibv_sge (line 654) | struct ibv_sge {
type ibv_send_wr (line 660) | struct ibv_send_wr {
type ibv_recv_wr (line 696) | struct ibv_recv_wr {
type ibv_mw_bind (line 703) | struct ibv_mw_bind {
type ibv_srq (line 712) | struct ibv_srq {
type ibv_event_flags (line 737) | enum ibv_event_flags {
type ibv_qp (line 743) | struct ibv_qp {
type ibv_comp_channel (line 760) | struct ibv_comp_channel {
type ibv_cq (line 766) | struct ibv_cq {
type ibv_ah (line 779) | struct ibv_ah {
type ibv_flow_flags (line 785) | enum ibv_flow_flags {
type ibv_flow_attr_type (line 790) | enum ibv_flow_attr_type {
type ibv_flow_spec_type (line 803) | enum ibv_flow_spec_type {
type ibv_flow_eth_filter (line 810) | struct ibv_flow_eth_filter {
type ibv_flow_spec_eth (line 820) | struct ibv_flow_spec_eth {
type ibv_flow_ipv4_filter (line 827) | struct ibv_flow_ipv4_filter {
type ibv_flow_spec_ipv4 (line 832) | struct ibv_flow_spec_ipv4 {
type ibv_flow_tcp_udp_filter (line 839) | struct ibv_flow_tcp_udp_filter {
type ibv_flow_spec_tcp_udp (line 844) | struct ibv_flow_spec_tcp_udp {
type ibv_flow_spec (line 851) | struct ibv_flow_spec {
type ibv_flow_attr (line 863) | struct ibv_flow_attr {
type ibv_flow (line 877) | struct ibv_flow {
type ibv_device (line 883) | struct ibv_device
type ibv_context (line 884) | struct ibv_context
type ibv_device_ops (line 886) | struct ibv_device_ops {
type ibv_device (line 896) | struct ibv_device {
type verbs_device (line 910) | struct verbs_device {
type ibv_context_ops (line 921) | struct ibv_context_ops {
type ibv_context (line 979) | struct ibv_context {
type verbs_context_mask (line 989) | enum verbs_context_mask {
type verbs_context (line 997) | struct verbs_context {
type verbs_context (line 1028) | struct verbs_context
type ibv_context (line 1028) | struct ibv_context
type verbs_context (line 1034) | struct verbs_context
type verbs_context (line 1035) | struct verbs_context
type verbs_device (line 1049) | struct verbs_device
type ibv_device (line 1049) | struct ibv_device
function ibv_post_send (line 1055) | static inline int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *w...
type ibv_ece (line 1059) | struct ibv_ece {
function ibv_query_port_ex (line 1077) | static inline int ibv_query_port_ex(struct ibv_context *context,
FILE: src/include/ibvsymbols.h
type ncclIbvSymbols (line 13) | struct ncclIbvSymbols {
type ncclIbvSymbols (line 44) | struct ncclIbvSymbols
FILE: src/include/ibvwrap.h
type ibv_return_t (line 27) | typedef enum ibv_return_enum
type ibv_device (line 35) | struct ibv_device
type ibv_device (line 36) | struct ibv_device
type ibv_device (line 37) | struct ibv_device
type ibv_context (line 38) | struct ibv_context
type ibv_device (line 38) | struct ibv_device
type ibv_context (line 39) | struct ibv_context
type ibv_context (line 40) | struct ibv_context
type ibv_async_event (line 40) | struct ibv_async_event
type ibv_async_event (line 41) | struct ibv_async_event
type ibv_context (line 42) | struct ibv_context
type ibv_device_attr (line 42) | struct ibv_device_attr
type ibv_context (line 43) | struct ibv_context
type ibv_port_attr (line 43) | struct ibv_port_attr
type ibv_context (line 44) | struct ibv_context
type ibv_qp (line 45) | struct ibv_qp
type ibv_qp_attr (line 45) | struct ibv_qp_attr
type ibv_qp_init_attr (line 45) | struct ibv_qp_init_attr
type ibv_pd (line 46) | struct ibv_pd
type ibv_context (line 46) | struct ibv_context
type ibv_pd (line 47) | struct ibv_pd
type ibv_mr (line 48) | struct ibv_mr
type ibv_pd (line 48) | struct ibv_pd
type ibv_mr (line 49) | struct ibv_mr
type ibv_pd (line 49) | struct ibv_pd
type ibv_mr (line 50) | struct ibv_mr
type ibv_pd (line 50) | struct ibv_pd
type ibv_mr (line 52) | struct ibv_mr
type ibv_pd (line 52) | struct ibv_pd
type ibv_mr (line 53) | struct ibv_mr
type ibv_pd (line 53) | struct ibv_pd
type ibv_mr (line 54) | struct ibv_mr
type ibv_comp_channel (line 55) | struct ibv_comp_channel
type ibv_context (line 55) | struct ibv_context
type ibv_comp_channel (line 56) | struct ibv_comp_channel
type ibv_cq (line 57) | struct ibv_cq
type ibv_context (line 57) | struct ibv_context
type ibv_comp_channel (line 57) | struct ibv_comp_channel
type ibv_cq (line 58) | struct ibv_cq
function ncclResult_t (line 59) | static inline ncclResult_t wrap_ibv_poll_cq(struct ibv_cq *cq, int num_e...
type ibv_qp (line 68) | struct ibv_qp
type ibv_pd (line 68) | struct ibv_pd
type ibv_qp_init_attr (line 68) | struct ibv_qp_init_attr
type ibv_qp (line 69) | struct ibv_qp
type ibv_qp_attr (line 69) | struct ibv_qp_attr
type ibv_qp (line 70) | struct ibv_qp
type ibv_qp (line 71) | struct ibv_qp
type ibv_ece (line 71) | struct ibv_ece
type ibv_qp (line 72) | struct ibv_qp
type ibv_ece (line 72) | struct ibv_ece
function ncclResult_t (line 74) | static inline ncclResult_t wrap_ibv_post_send(struct ibv_qp *qp, struct ...
function ncclResult_t (line 83) | static inline ncclResult_t wrap_ibv_post_recv(struct ibv_qp *qp, struct ...
type ibv_event_type (line 92) | enum ibv_event_type
type in6_addr (line 100) | struct in6_addr
FILE: src/include/info.h
type ncclInfo (line 17) | struct ncclInfo {
FILE: src/include/ionic/ionicdvcore.h
type ionicdv_reg_udma_mask (line 15) | enum ionicdv_reg_udma_mask {
FILE: src/include/ionic/ionicdvsymbols.h
type ncclIonicdvSymbols (line 8) | struct ncclIonicdvSymbols {
type ncclIonicdvSymbols (line 14) | struct ncclIonicdvSymbols
FILE: src/include/ionic/ionicdvwrap.h
type ibv_qp (line 14) | struct ibv_qp
type ibv_pd (line 15) | struct ibv_pd
FILE: src/include/ipcsocket.h
type ncclIpcSocket (line 25) | struct ncclIpcSocket {
type ncclIpcSocket (line 31) | struct ncclIpcSocket
type ncclIpcSocket (line 32) | struct ncclIpcSocket
type ncclIpcSocket (line 33) | struct ncclIpcSocket
type ncclIpcSocket (line 35) | struct ncclIpcSocket
type ncclIpcSocket (line 36) | struct ncclIpcSocket
FILE: src/include/latency_profiler/CollTrace.h
type ncclComm (line 15) | struct ncclComm
type CollStats (line 19) | struct CollStats
function ncclComm (line 55) | ncclComm* comm_{nullptr};
FILE: src/include/latency_profiler/CollTraceEvent.h
function namespace (line 16) | namespace latency_profiler {
FILE: src/include/latency_profiler/CollTraceFunc.h
function namespace (line 13) | namespace latency_profiler {
FILE: src/include/latency_profiler/CollTraceUtils.h
type CollStats (line 16) | struct CollStats {
FILE: src/include/latency_profiler/EventQueue.h
function namespace (line 12) | namespace latency_profiler {
FILE: src/include/mlx5/mlx5dvcore.h
type mlx5dv_reg_dmabuf_access (line 17) | enum mlx5dv_reg_dmabuf_access {
FILE: src/include/mlx5/mlx5dvsymbols.h
type ncclMlx5dvSymbols (line 16) | struct ncclMlx5dvSymbols {
type ncclMlx5dvSymbols (line 24) | struct ncclMlx5dvSymbols
FILE: src/include/mlx5/mlx5dvwrap.h
type mlx5dv_return_t (line 28) | typedef enum mlx5dv_return_enum
type ibv_device (line 35) | struct ibv_device
type ibv_context (line 36) | struct ibv_context
type ibv_mr (line 38) | struct ibv_mr
type ibv_pd (line 38) | struct ibv_pd
type ibv_mr (line 39) | struct ibv_mr
type ibv_pd (line 39) | struct ibv_pd
FILE: src/include/mnnvl.h
type ncclComm (line 13) | struct ncclComm
FILE: src/include/msccl/msccl_parser.h
type mscclXmlNode (line 30) | struct mscclXmlNode {
type mscclXml (line 43) | struct mscclXml {
function ncclResult_t (line 48) | static ncclResult_t mscclXmlGetAttrIndex(struct mscclXmlNode* node, cons...
function ncclResult_t (line 60) | static ncclResult_t mscclXmlGetAttr(struct mscclXmlNode* node, const cha...
function ncclResult_t (line 67) | static ncclResult_t mscclXmlGetAttrStr(struct mscclXmlNode* node, const ...
function ncclResult_t (line 75) | static ncclResult_t mscclXmlGetAttrInt(struct mscclXmlNode* node, const ...
function ncclResult_t (line 82) | static ncclResult_t mscclXmlGetAttrInt64(struct mscclXmlNode* node, cons...
function ncclResult_t (line 89) | static ncclResult_t mscclXmlFindTag(struct mscclXml* xml, const char* ta...
type mscclAlgo (line 101) | struct mscclAlgo
type mscclAlgoMeta (line 103) | struct mscclAlgoMeta
FILE: src/include/msccl/msccl_scheduler.h
type mscclFunc_t (line 9) | typedef enum { mscclFuncReduce = 0,
type mscclSchedulerParam (line 22) | struct mscclSchedulerParam {
type mscclSchedulerInterface (line 42) | typedef struct {
FILE: src/include/msccl/msccl_setup.h
type mscclAlgo (line 16) | struct mscclAlgo
type mscclAlgo (line 20) | struct mscclAlgo
type mscclAlgo (line 22) | struct mscclAlgo
type mscclAlgo (line 24) | struct mscclAlgo
type mscclAlgo (line 27) | struct mscclAlgo
type mscclAlgo (line 27) | struct mscclAlgo
FILE: src/include/msccl/msccl_struct.h
type mscclTransmission (line 41) | struct mscclTransmission {
type mscclThreadBlock (line 57) | struct mscclThreadBlock {
type mscclThreadBlock (line 69) | struct mscclThreadBlock
type mscclFlag (line 71) | struct mscclFlag {
type mscclChannelPeerInfo (line 76) | struct mscclChannelPeerInfo {
type mscclChannelInfo (line 84) | struct mscclChannelInfo {
type mscclAlgoMeta (line 91) | struct mscclAlgoMeta {
type mscclAlgo (line 114) | struct mscclAlgo {
type mscclGroupStatus (line 153) | enum mscclGroupStatus {
type mscclSavedSchedulerParam (line 159) | struct mscclSavedSchedulerParam {
type mscclCaptureStatus (line 169) | enum mscclCaptureStatus {
type mscclAlgo (line 177) | struct mscclAlgo
type mscclAlgo (line 179) | struct mscclAlgo
type mscclThreadLocalStatus (line 185) | struct mscclThreadLocalStatus {
type mscclWorkFifoStatus (line 195) | struct mscclWorkFifoStatus {
type mscclStatus (line 206) | struct mscclStatus {
type mscclWork (line 237) | struct mscclWork {
type mscclWork (line 253) | struct mscclWork
type mscclShmemData (line 257) | struct mscclShmemData {
type mscclShmemData (line 261) | struct mscclShmemData
FILE: src/include/mscclpp/mscclpp_nccl.h
type mscclppComm (line 14) | struct mscclppComm
type ncclUniqueId (line 16) | typedef ncclUniqueId mscclppUniqueId;
function namespace (line 55) | namespace std {
FILE: src/include/nccl_common.h
type ncclDebugLogLevel (line 18) | typedef enum {
type ncclDebugLogSubSys (line 28) | typedef enum {
type ncclResult_t (line 59) | typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type,...
type ncclFunc_t (line 62) | typedef enum {
FILE: src/include/nccl_device/coop.h
function NCCL_DEVICE_INLINE (line 25) | NCCL_DEVICE_INLINE int thread_rank() const {
function NCCL_DEVICE_INLINE (line 34) | NCCL_DEVICE_INLINE void sync() {
type ncclCoopTile (line 41) | typedef ncclCoopTile<1> ncclCoopThread;
type ncclCoopTile (line 42) | typedef ncclCoopTile<32> ncclCoopWarp;
type ncclCoopLanes (line 46) | struct ncclCoopLanes { // Some lanes of this warp.
function NCCL_DEVICE_INLINE (line 60) | NCCL_DEVICE_INLINE void sync() {
type ncclCoopWarpSpan (line 70) | struct ncclCoopWarpSpan {
function NCCL_DEVICE_INLINE (line 87) | NCCL_DEVICE_INLINE void sync() {
type ncclCoopCta (line 95) | struct ncclCoopCta {
function NCCL_DEVICE_INLINE (line 99) | NCCL_DEVICE_INLINE void sync() { __syncthreads(); }
function NCCL_DEVICE_INLINE (line 108) | NCCL_DEVICE_INLINE uint32_t ncclCoopLaneMask(ncclCoopLanes coop) {
function NCCL_DEVICE_INLINE (line 111) | NCCL_DEVICE_INLINE uint32_t ncclCoopLaneMask(ncclCoopWarpSpan coop) {
function NCCL_DEVICE_INLINE (line 114) | NCCL_DEVICE_INLINE uint32_t ncclCoopLaneMask(ncclCoopCta coop) {
function ncclCoopIsThread (line 123) | constexpr bool ncclCoopIsThread(ncclCoopTile<nThreads>) {
function NCCL_DEVICE_INLINE (line 126) | NCCL_DEVICE_INLINE constexpr bool ncclCoopIsThread(ncclCoopLanes) { retu...
function NCCL_DEVICE_INLINE (line 127) | NCCL_DEVICE_INLINE constexpr bool ncclCoopIsThread(ncclCoopWarpSpan) { r...
function NCCL_DEVICE_INLINE (line 128) | NCCL_DEVICE_INLINE constexpr bool ncclCoopIsThread(ncclCoopCta) { return...
function NCCL_DEVICE_INLINE (line 145) | NCCL_DEVICE_INLINE ncclCoopLanes ncclCoopCoalesced(ncclCoopLanes coop) {
FILE: src/include/nccl_device/core.h
type ncclDevComm (line 15) | struct ncclDevComm
type ncclDevComm_t (line 16) | typedef struct ncclDevComm ncclDevComm_t;
type ncclTeam (line 18) | struct ncclTeam
type ncclTeam_t (line 19) | typedef struct ncclTeam ncclTeam_t;
type ncclMultimemHandle (line 23) | struct ncclMultimemHandle
type ncclMultimemHandle_t (line 24) | typedef struct ncclMultimemHandle ncclMultimemHandle_t;
type ncclDevResourceHandle (line 26) | typedef uint32_t ncclDevResourceHandle;
type ncclDevResourceHandle (line 27) | typedef ncclDevResourceHandle ncclDevResourceHandle_t;
type ncclLsaBarrierHandle (line 29) | struct ncclLsaBarrierHandle
type ncclLsaBarrierHandle_t (line 30) | typedef struct ncclLsaBarrierHandle ncclLsaBarrierHandle_t;
type ncclLLA2AHandle (line 32) | struct ncclLLA2AHandle
type ncclLLA2AHandle_t (line 33) | typedef struct ncclLLA2AHandle ncclLLA2AHandle_t;
type ncclTeam (line 35) | struct ncclTeam {
type ncclTeamTagWorld (line 44) | struct ncclTeamTagWorld {}
type ncclTeamTagLsa (line 45) | struct ncclTeamTagLsa {}
type ncclTeamTagRail (line 46) | struct ncclTeamTagRail {}
type ncclDevCommRequirements (line 49) | struct ncclDevCommRequirements
type ncclDevCommRequirements_t (line 50) | typedef struct ncclDevCommRequirements ncclDevCommRequirements_t;
type ncclDevResourceRequirements (line 52) | struct ncclDevResourceRequirements
type ncclDevResourceRequirements_t (line 53) | typedef struct ncclDevResourceRequirements ncclDevResourceRequirements_t;
type ncclTeamRequirements (line 55) | struct ncclTeamRequirements
type ncclTeamRequirements_t (line 56) | typedef struct ncclTeamRequirements ncclTeamRequirements_t;
type ncclDevCommRequirements (line 58) | struct ncclDevCommRequirements {
type ncclDevResourceRequirements (line 67) | struct ncclDevResourceRequirements {
type ncclTeamRequirements (line 73) | struct ncclTeamRequirements {
FILE: src/include/nccl_device/impl/comm__types.h
type ncclDevCommWindowTable (line 14) | struct ncclDevCommWindowTable
type ncclDevCommWindowTable (line 16) | struct ncclDevCommWindowTable {
type ncclDevComm (line 25) | struct ncclDevComm {
FILE: src/include/nccl_device/impl/core__funcs.h
function NCCL_HOST_DEVICE_INLINE (line 16) | NCCL_HOST_DEVICE_INLINE ncclTeam ncclTeamWorld(ncclDevComm const &comm) {
function NCCL_HOST_DEVICE_INLINE (line 26) | NCCL_HOST_DEVICE_INLINE ncclTeam ncclTeamLsa(ncclDevComm const &comm) {
function NCCL_HOST_DEVICE_INLINE (line 36) | NCCL_HOST_DEVICE_INLINE ncclTeam ncclTeamRail(ncclDevComm const& comm) {
function NCCL_HOST_DEVICE_INLINE (line 45) | NCCL_HOST_DEVICE_INLINE bool ncclTeamRankIsMember(ncclTeam_t a, ncclTeam...
function NCCL_HOST_DEVICE_INLINE (line 53) | NCCL_HOST_DEVICE_INLINE int ncclTeamRankToTeam(ncclTeam_t a, ncclTeam_t ...
function NCCL_HOST_DEVICE_INLINE (line 62) | NCCL_HOST_DEVICE_INLINE int ncclTeamRankToWorld(ncclDevComm const& comm,...
function NCCL_HOST_DEVICE_INLINE (line 68) | NCCL_HOST_DEVICE_INLINE int ncclTeamRankToLsa(ncclDevComm const& comm, n...
function NCCL_HOST_DEVICE_INLINE (line 73) | NCCL_HOST_DEVICE_INLINE ncclTeam_t ncclTeamInnerFactor(ncclTeam_t parent...
function NCCL_HOST_DEVICE_INLINE (line 81) | NCCL_HOST_DEVICE_INLINE ncclTeam_t ncclTeamOuterFactor(ncclTeam_t parent...
function NCCL_HOST_DEVICE_INLINE (line 89) | NCCL_HOST_DEVICE_INLINE int ncclTeamRankInDifference(ncclTeam_t parent, ...
function NCCL_DEVICE_INLINE (line 106) | NCCL_DEVICE_INLINE void* ncclGetLocalPointer(ncclWindow_t w, size_t offs...
function NCCL_DEVICE_INLINE (line 115) | NCCL_DEVICE_INLINE void* ncclGetLsaPointer(ncclWindow_t w, size_t offset...
function NCCL_DEVICE_INLINE (line 124) | NCCL_DEVICE_INLINE void* ncclGetPeerPointer(ncclWindow_t w, size_t offse...
function NCCL_DEVICE_INLINE (line 135) | NCCL_DEVICE_INLINE void* ncclGetPeerPointer(ncclWindow_t w, size_t offse...
function NCCL_DEVICE_INLINE (line 145) | NCCL_DEVICE_INLINE void* ncclGetMultimemPointer(ncclWindow_t w, size_t o...
function NCCL_DEVICE_INLINE (line 153) | NCCL_DEVICE_INLINE void* ncclGetLsaMultimemPointer(ncclWindow_t w, size_...
function NCCL_HOST_DEVICE_INLINE (line 158) | NCCL_HOST_DEVICE_INLINE size_t ncclGetResourceBufferOffset(ncclDevResour...
function NCCL_DEVICE_INLINE (line 163) | NCCL_DEVICE_INLINE void* ncclGetResourceBufferLocalPointer(ncclDevComm c...
function NCCL_DEVICE_INLINE (line 172) | NCCL_DEVICE_INLINE void* ncclGetResourceBufferLsaPointer(ncclDevComm con...
function NCCL_DEVICE_INLINE (line 182) | NCCL_DEVICE_INLINE void* ncclGetResourceBufferPeerPointer(ncclDevComm co...
function NCCL_DEVICE_INLINE (line 192) | NCCL_DEVICE_INLINE void* ncclGetResourceBufferMultimemPointer(ncclDevCom...
function NCCL_DEVICE_INLINE (line 201) | NCCL_DEVICE_INLINE void* ncclGetResourceBufferLsaMultimemPointer(ncclDev...
function NCCL_DEVICE_INLINE (line 207) | NCCL_DEVICE_INLINE ncclSymPtr<char> ncclGetResourceBuffer(ncclDevComm co...
FILE: src/include/nccl_device/impl/core__types.h
type ncclWindow_vidmem (line 12) | struct ncclWindow_vidmem {
type ncclMultimemHandle (line 22) | struct ncclMultimemHandle {
FILE: src/include/nccl_device/impl/ll_a2a__types.h
type ncclLLA2AHandle (line 14) | struct ncclLLA2AHandle {
FILE: src/include/nccl_device/impl/mem_barrier__types.h
type ncclLsaBarrierHandle (line 14) | struct ncclLsaBarrierHandle {
function NCCL_DEVICE_INLINE (line 31) | NCCL_DEVICE_INLINE uint32_t* mcInbox(bool multimem) {
function NCCL_DEVICE_INLINE (line 41) | NCCL_DEVICE_INLINE uint32_t* ucInbox(int owner, int peer) {
FILE: src/include/nccl_device/ll_a2a.h
type ncclLLA2AHandle (line 13) | struct ncclLLA2AHandle
FILE: src/include/nccl_device/mem_barrier.h
type ncclLsaBarrierHandle (line 14) | struct ncclLsaBarrierHandle
FILE: src/include/nccl_device/utility.h
function namespace (line 38) | namespace utility {
type Present (line 301) | struct Present<> {}
function NCCL_HOST_DEVICE_INLINE (line 347) | NCCL_HOST_DEVICE_INLINE ~Optional() {
FILE: src/include/net.h
type ncclComm (line 19) | struct ncclComm
type ncclComm (line 20) | struct ncclComm
type ncclComm (line 27) | struct ncclComm
FILE: src/include/net_device.h
type ncclNetDeviceType (line 17) | typedef enum {NCCL_NET_DEVICE_HOST=0, NCCL_NET_DEVICE_UNPACK=1} ncclNetD...
type ncclNetDeviceHandle_v7_t (line 19) | typedef struct {
type ncclNetDeviceHandle_v7_t (line 27) | typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t;
type ncclNetDeviceHandle_v8_t (line 28) | typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t;
type ncclNetDeviceHandle_v9_t (line 29) | typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_v10_t;
type ncclNetDeviceHandle_v10_t (line 30) | typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_v11_t;
type ncclNetDeviceHandle_v11_t (line 31) | typedef ncclNetDeviceHandle_v11_t ncclNetDeviceHandle_t;
FILE: src/include/npkit/npkit.h
function class (line 23) | class NpKit {
FILE: src/include/npkit/npkit_struct.h
type NpKitEventCollectContext (line 23) | struct NpKitEventCollectContext {
FILE: src/include/nvmlwrap.h
type nvmlDevice_st (line 29) | struct nvmlDevice_st
type nvmlEnableState_t (line 32) | typedef enum nvmlEnableState_enum
type nvmlNvLinkCapability_t (line 38) | typedef enum nvmlNvLinkCapability_enum
type nvmlReturn_t (line 50) | typedef enum nvmlReturn_enum
type nvmlPciInfo_t (line 75) | typedef struct nvmlPciInfo_st
type nvmlGpuP2PStatus_t (line 94) | typedef enum nvmlGpuP2PStatus_enum
type nvmlGpuP2PCapsIndex_t (line 106) | typedef enum nvmlGpuP2PCapsIndex_enum
type nvmlValueType_t (line 119) | typedef enum nvmlValueType_enum
type nvmlValue_t (line 135) | typedef union nvmlValue_st
type nvmlFieldValue_t (line 179) | typedef struct nvmlFieldValue_st
type nvmlGpuFabricState_t (line 198) | typedef unsigned char nvmlGpuFabricState_t;
type nvmlGpuFabricInfo_t (line 200) | typedef struct {
type nvmlGpuFabricInfo_v2_t (line 240) | typedef struct {
type nvmlGpuFabricInfo_v2_t (line 249) | typedef nvmlGpuFabricInfo_v2_t nvmlGpuFabricInfoV_t;
type nvmlPlatformInfo_v2_t (line 259) | typedef struct
type nvmlPlatformInfo_v2_t (line 271) | typedef nvmlPlatformInfo_v2_t nvmlPlatformInfo_t;
type nvmlConfComputeSystemState_t (line 280) | typedef struct nvmlConfComputeSystemState_st {
type nvmlSystemConfComputeSettings_v1_t (line 296) | typedef struct {
type nvmlSystemConfComputeSettings_v1_t (line 304) | typedef nvmlSystemConfComputeSettings_v1_t nvmlSystemConfComputeSettings_t;
type ncclNvmlDeviceInfo (line 311) | struct ncclNvmlDeviceInfo {
type ncclNvmlDevicePairInfo (line 315) | struct ncclNvmlDevicePairInfo {
type ncclNvmlCCStatus (line 322) | struct ncclNvmlCCStatus {
type ncclNvmlCCStatus (line 344) | struct ncclNvmlCCStatus
FILE: src/include/nvtx.h
type nccl_domain (line 50) | struct nccl_domain{static constexpr char const* name{"NCCL"};}
function class (line 55) | class payload_schema {
function setPayloadData (line 177) | void setPayloadData(const uint64_t schemaId) noexcept
function PayloadType (line 189) | PayloadType payload{}
FILE: src/include/nvtx3/nvToolsExt.h
type nvtxRangeId_t (line 239) | typedef uint64_t nvtxRangeId_t;
type nvtxDomainRegistration_st (line 242) | struct nvtxDomainRegistration_st
type nvtxDomainRegistration (line 243) | typedef struct nvtxDomainRegistration_st nvtxDomainRegistration;
type nvtxDomainRegistration (line 253) | typedef nvtxDomainRegistration* nvtxDomainHandle_t;
type nvtxStringRegistration_st (line 256) | struct nvtxStringRegistration_st
type nvtxStringRegistration (line 257) | typedef struct nvtxStringRegistration_st nvtxStringRegistration;
type nvtxStringRegistration (line 267) | typedef nvtxStringRegistration* nvtxStringHandle_t;
type nvtxColorType_t (line 277) | typedef enum nvtxColorType_t
type nvtxMessageType_t (line 286) | typedef enum nvtxMessageType_t
type nvtxMessageValue_t (line 297) | typedef union nvtxMessageValue_t
type nvtxPayloadType_t (line 343) | typedef enum nvtxPayloadType_t
type nvtxEventAttributes_v2 (line 432) | typedef struct nvtxEventAttributes_v2
type nvtxEventAttributes_t (line 525) | typedef struct nvtxEventAttributes_v2 nvtxEventAttributes_t;
type nvtxResourceGenericType_t (line 953) | typedef enum nvtxResourceGenericType_t
type nvtxResourceAttributes_v0 (line 1039) | typedef struct nvtxResourceAttributes_v0
type nvtxResourceAttributes_t (line 1098) | typedef struct nvtxResourceAttributes_v0 nvtxResourceAttributes_t;
type nvtxResourceHandle (line 1104) | struct nvtxResourceHandle
FILE: src/include/nvtx3/nvToolsExtCounters.h
type nvtxSemanticsCounter_t (line 123) | typedef struct nvtxSemanticsCounter_v1
type nvtxCountersAttr_t (line 154) | typedef struct nvtxCountersAttr_v1
type nvtxCountersRegistration_st (line 181) | struct nvtxCountersRegistration_st
type nvtxCountersRegistration (line 182) | typedef struct nvtxCountersRegistration_st nvtxCountersRegistration;
type nvtxCountersRegistration (line 190) | typedef nvtxCountersRegistration* nvtxCountersHandle_t;
type nvtxCountersBatch_t (line 192) | typedef struct nvtxCountersBatch_v1
FILE: src/include/nvtx3/nvToolsExtCuda.h
type nvtxResourceCUDAType_t (line 43) | typedef enum nvtxResourceCUDAType_t
FILE: src/include/nvtx3/nvToolsExtCudaRt.h
type nvtxResourceCUDARTType_t (line 44) | typedef enum nvtxResourceCUDARTType_t
FILE: src/include/nvtx3/nvToolsExtMem.h
type nvtxMemHeap_v1 (line 129) | struct nvtxMemHeap_v1
type nvtxMemHeap_t (line 130) | typedef struct nvtxMemHeap_v1 nvtxMemHeap_t;
type nvtxMemHeap_t (line 133) | typedef nvtxMemHeap_t* nvtxMemHeapHandle_t;
type nvtxMemRegion_v1 (line 138) | struct nvtxMemRegion_v1
type nvtxMemRegion_t (line 139) | typedef struct nvtxMemRegion_v1 nvtxMemRegion_t;
type nvtxMemRegion_t (line 142) | typedef nvtxMemRegion_t* nvtxMemRegionHandle_t;
type nvtxMemRegionRef_t (line 147) | typedef union nvtxMemRegionRef_t
type nvtxMemPermissions_v1 (line 156) | struct nvtxMemPermissions_v1
type nvtxMemPermissions_t (line 157) | typedef struct nvtxMemPermissions_v1 nvtxMemPermissions_t;
type nvtxMemPermissions_t (line 160) | typedef nvtxMemPermissions_t* nvtxMemPermissionsHandle_t;
type nvtxMemVirtualRangeDesc_v1 (line 163) | typedef struct nvtxMemVirtualRangeDesc_v1
type nvtxMemVirtualRangeDesc_v1 (line 168) | typedef nvtxMemVirtualRangeDesc_v1 nvtxMemVirtualRangeDesc_t;
type nvtxMemHeapDesc_v1 (line 172) | typedef struct nvtxMemHeapDesc_v1
type nvtxMemHeapDesc_v1 (line 253) | typedef nvtxMemHeapDesc_v1 nvtxMemHeapDesc_t;
type nvtxMemRegionsRegisterBatch_v1 (line 304) | typedef struct nvtxMemRegionsRegisterBatch_v1
type nvtxMemRegionsRegisterBatch_v1 (line 319) | typedef nvtxMemRegionsRegisterBatch_v1 nvtxMemRegionsRegisterBatch_t;
type nvtxMemRegionsResizeBatch_v1 (line 346) | typedef struct nvtxMemRegionsResizeBatch_v1
type nvtxMemRegionsResizeBatch_v1 (line 358) | typedef nvtxMemRegionsResizeBatch_v1 nvtxMemRegionsResizeBatch_t;
type nvtxMemRegionsUnregisterBatch_v1 (line 389) | typedef struct nvtxMemRegionsUnregisterBatch_v1
type nvtxMemRegionsUnregisterBatch_v1 (line 401) | typedef nvtxMemRegionsUnregisterBatch_v1 nvtxMemRegionsUnregisterBatch_t;
type nvtxMemRegionNameDesc_v1 (line 413) | typedef struct nvtxMemRegionNameDesc_v1
type nvtxMemRegionNameDesc_v1 (line 424) | typedef nvtxMemRegionNameDesc_v1 nvtxMemRegionNameDesc_t;
type nvtxMemRegionsNameBatch_v1 (line 427) | typedef struct nvtxMemRegionsNameBatch_v1
type nvtxMemRegionsNameBatch_v1 (line 439) | typedef nvtxMemRegionsNameBatch_v1 nvtxMemRegionsNameBatch_t;
type nvtxMemPermissionsAssignRegionDesc_v1 (line 470) | typedef struct nvtxMemPermissionsAssignRegionDesc_v1
type nvtxMemPermissionsAssignRegionDesc_v1 (line 477) | typedef nvtxMemPermissionsAssignRegionDesc_v1 nvtxMemPermissionsAssignRe...
type nvtxMemPermissionsAssignBatch_v1 (line 480) | typedef struct nvtxMemPermissionsAssignBatch_v1
type nvtxMemPermissionsAssignBatch_v1 (line 495) | typedef nvtxMemPermissionsAssignBatch_v1 nvtxMemPermissionsAssignBatch_t;
type NvtxExtMemCallbackId (line 645) | typedef enum NvtxExtMemCallbackId
FILE: src/include/nvtx3/nvToolsExtMemCudaRt.h
type nvtxMemCudaArrayRangeDesc_v1 (line 35) | typedef struct nvtxMemCudaArrayRangeDesc_v1
type nvtxMemCudaArrayRangeDesc_v1 (line 44) | typedef nvtxMemCudaArrayRangeDesc_v1 nvtxMemCudaArrayRangeDesc_t;
type nvtxMemCuArrayRangeDesc_v1 (line 61) | typedef struct nvtxMemCuArrayRangeDesc_v1
type nvtxMemCuArrayRangeDesc_v1 (line 70) | typedef nvtxMemCuArrayRangeDesc_v1 nvtxMemCuArrayRangeDesc_t;
FILE: src/include/nvtx3/nvToolsExtOpenCL.h
type nvtxResourceOpenCLType_t (line 43) | typedef enum nvtxResourceOpenCLType_t
FILE: src/include/nvtx3/nvToolsExtPayload.h
type nvtxPayloadEntryTypeInfo_t (line 497) | typedef struct nvtxPayloadEntryTypeInfo_v1
type nvtxPayloadData_t (line 519) | typedef struct nvtxPayloadData_v1
type nvtxSemanticsHeader_t (line 546) | typedef struct nvtxSemanticsHeader_v1
type nvtxPayloadSchemaEntry_t (line 570) | typedef struct nvtxPayloadSchemaEntry_v1
type nvtxPayloadSchemaExtension_t (line 654) | typedef struct nvtxPayloadSchemaExtension_v1
type nvtxPayloadSchemaAttr_t (line 666) | typedef struct nvtxPayloadSchemaAttr_v1
type nvtxPayloadEnum_t (line 749) | typedef struct nvtxPayloadEnum_v1
type nvtxPayloadEnumAttr_t (line 773) | typedef struct nvtxPayloadEnumAttr_v1
type nvtxScopeAttr_t (line 811) | typedef struct nvtxScopeAttr_v1
FILE: src/include/nvtx3/nvToolsExtSemanticsCounters.h
type nvtxSemanticsCounter_t (line 61) | typedef struct nvtxSemanticsCounter_v1 {
FILE: src/include/nvtx3/nvToolsExtSemanticsScope.h
type nvtxSemanticsScope_t (line 22) | typedef struct nvtxSemanticsScope_v1
FILE: src/include/nvtx3/nvToolsExtSync.h
type nvtxResourceSyncPosixThreadType_t (line 101) | typedef enum nvtxResourceSyncPosixThreadType_t
type nvtxResourceSyncWindowsType_t (line 113) | typedef enum nvtxResourceSyncWindowsType_t
type nvtxResourceSyncLinuxType_t (line 126) | typedef enum nvtxResourceSyncLinuxType_t
type nvtxResourceSyncAndroidType_t (line 142) | typedef enum nvtxResourceSyncLinuxType_t nvtxResourceSyncAndroidType_t;
type nvtxSyncUser (line 152) | struct nvtxSyncUser
type nvtxSyncUserAttributes_v0 (line 223) | typedef struct nvtxSyncUserAttributes_v0
type nvtxSyncUserAttributes_t (line 259) | typedef struct nvtxSyncUserAttributes_v0 nvtxSyncUserAttributes_t;
FILE: src/include/nvtx3/nvtx3.hpp
type nvtx3 (line 652) | namespace nvtx3 {
function NVTX3_INLINE_IF_REQUESTED (line 654) | NVTX3_INLINE_IF_REQUESTED namespace NVTX3_VERSION_NAMESPACE
class payload (line 1768) | class payload {
method payload (line 1777) | payload(int64_t value) noexcept
method payload (line 1788) | payload(int32_t value) noexcept
method payload (line 1799) | payload(uint64_t value) noexcept
method payload (line 1810) | payload(uint32_t value) noexcept
method payload (line 1822) | payload(float value) noexcept
method payload (line 1834) | payload(double value) noexcept
method payload (line 1846) | constexpr payload(
method value_type (line 1857) | constexpr value_type get_value() const noexcept { return value_; }
method nvtxPayloadType_t (line 1863) | constexpr nvtxPayloadType_t get_type() const noexcept { return type_; }
class event_attributes (line 1928) | class event_attributes {
method event_attributes (line 1936) | constexpr event_attributes() noexcept
method NVTX3_CONSTEXPR_IF_CPP14 (line 1960) | NVTX3_CONSTEXPR_IF_CPP14 explicit event_attributes(category const& c...
method NVTX3_CONSTEXPR_IF_CPP14 (line 1974) | NVTX3_CONSTEXPR_IF_CPP14 explicit event_attributes(color const& c, A...
method NVTX3_CONSTEXPR_IF_CPP14 (line 1989) | NVTX3_CONSTEXPR_IF_CPP14 explicit event_attributes(payload const& p,...
method NVTX3_CONSTEXPR_IF_CPP14 (line 2004) | NVTX3_CONSTEXPR_IF_CPP14 explicit event_attributes(message const& m,...
method NVTX3_CONSTEXPR_IF_CPP14 (line 2019) | NVTX3_CONSTEXPR_IF_CPP14 explicit event_attributes(nvtxPayloadData_t...
method event_attributes (line 2028) | event_attributes(event_attributes const&) = default;
method event_attributes (line 2029) | event_attributes& operator=(event_attributes const&) = default;
method event_attributes (line 2030) | event_attributes(event_attributes&&) = default;
method event_attributes (line 2031) | event_attributes& operator=(event_attributes&&) = default;
method value_type (line 2037) | constexpr value_type const* get() const noexcept { return &attribute...
class scoped_range_in (line 2091) | class scoped_range_in {
method scoped_range_in (line 2107) | explicit scoped_range_in(event_attributes const& attr) noexcept
method scoped_range_in (line 2137) | explicit scoped_range_in(Args const&... args) noexcept
method scoped_range_in (line 2147) | scoped_range_in() noexcept : scoped_range_in{event_attributes{}} {}
method scoped_range_in (line 2157) | scoped_range_in(scoped_range_in const&) = delete;
method scoped_range_in (line 2158) | scoped_range_in& operator=(scoped_range_in const&) = delete;
method scoped_range_in (line 2159) | scoped_range_in(scoped_range_in&&) = delete;
method scoped_range_in (line 2160) | scoped_range_in& operator=(scoped_range_in&&) = delete;
type detail (line 2179) | namespace detail {
class optional_scoped_range_in (line 2183) | class optional_scoped_range_in
method optional_scoped_range_in (line 2186) | optional_scoped_range_in() = default;
method begin (line 2188) | void begin(event_attributes const& attr) noexcept
method optional_scoped_range_in (line 2210) | optional_scoped_range_in(optional_scoped_range_in const&) = delete;
method optional_scoped_range_in (line 2211) | optional_scoped_range_in& operator=(optional_scoped_range_in const...
method optional_scoped_range_in (line 2212) | optional_scoped_range_in(optional_scoped_range_in&&) = delete;
method optional_scoped_range_in (line 2213) | optional_scoped_range_in& operator=(optional_scoped_range_in&&) = ...
type range_handle (line 2230) | struct range_handle {
method range_handle (line 2239) | constexpr explicit range_handle(value_type id) noexcept : _range_id{...
method range_handle (line 2248) | constexpr range_handle() noexcept = default;
method range_handle (line 2269) | constexpr range_handle(std::nullptr_t) noexcept {}
method value_type (line 2276) | constexpr value_type get_value() const noexcept { return _range_id; }
function range_handle (line 2334) | inline range_handle start_range_in(event_attributes const& attr) noexcept
method range_handle (line 2239) | constexpr explicit range_handle(value_type id) noexcept : _range_id{...
method range_handle (line 2248) | constexpr range_handle() noexcept = default;
method range_handle (line 2269) | constexpr range_handle(std::nullptr_t) noexcept {}
method value_type (line 2276) | constexpr value_type get_value() const noexcept { return _range_id; }
function range_handle (line 2375) | inline range_handle start_range_in(Args const&... args) noexcept
method range_handle (line 2239) | constexpr explicit range_handle(value_type id) noexcept : _range_id{...
method range_handle (line 2248) | constexpr range_handle() noexcept = default;
method range_handle (line 2269) | constexpr range_handle(std::nullptr_t) noexcept {}
method value_type (line 2276) | constexpr value_type get_value() const noexcept { return _range_id; }
function range_handle (line 2410) | inline range_handle start_range(event_attributes const& attr) noexcept
method range_handle (line 2239) | constexpr explicit range_handle(value_type id) noexcept : _range_id{...
method range_handle (line 2248) | constexpr range_handle() noexcept = default;
method range_handle (line 2269) | constexpr range_handle(std::nullptr_t) noexcept {}
method value_type (line 2276) | constexpr value_type get_value() const noexcept { return _range_id; }
function range_handle (line 2448) | inline range_handle start_range(Args const&... args) noexcept
method range_handle (line 2239) | constexpr explicit range_handle(value_type id) noexcept : _range_id{...
method range_handle (line 2248) | constexpr range_handle() noexcept = default;
method range_handle (line 2269) | constexpr range_handle(std::nullptr_t) noexcept {}
method value_type (line 2276) | constexpr value_type get_value() const noexcept { return _range_id; }
function end_range_in (line 2473) | inline void end_range_in(range_handle r) noexcept
function end_range (line 2495) | inline void end_range(range_handle r) noexcept
class unique_range_in (line 2526) | class unique_range_in {
method unique_range_in (line 2541) | explicit unique_range_in(event_attributes const& attr) noexcept
method unique_range_in (line 2566) | explicit unique_range_in(Args const&... args) noexcept
method unique_range_in (line 2576) | constexpr unique_range_in() noexcept : unique_range_in{event_attribu...
method unique_range_in (line 2590) | unique_range_in(unique_range_in&& other) noexcept = default;
method unique_range_in (line 2598) | unique_range_in& operator=(unique_range_in&& other) noexcept = default;
method unique_range_in (line 2602) | unique_range_in(unique_range_in const&) = delete;
method unique_range_in (line 2606) | unique_range_in& operator=(unique_range_in const&) = delete;
type end_range_handle (line 2610) | struct end_range_handle {
function mark_in (line 2649) | inline void mark_in(event_attributes const& attr) noexcept
function mark_in (line 2686) | inline void mark_in(Args const&... args) noexcept
function mark (line 2713) | inline void mark(event_attributes const& attr) noexcept
function mark (line 2745) | inline void mark(Args const&... args) noexcept
FILE: src/include/nvtx3/nvtxDetail/nvtxExtImpl.h
type nvtxExtGlobals1_t (line 77) | typedef struct nvtxExtGlobals1_t
FILE: src/include/nvtx3/nvtxDetail/nvtxExtImplCounters_v1.h
function NVTX_LINKONCE_DEFINE_FUNCTION (line 55) | NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_EXT_COUNTERS_VERSIONED_ID(nvtxEx...
FILE: src/include/nvtx3/nvtxDetail/nvtxExtImplMem_v1.h
function NVTX_LINKONCE_DEFINE_FUNCTION (line 45) | NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemI...
FILE: src/include/nvtx3/nvtxDetail/nvtxExtImplPayload_v1.h
function NVTX_LINKONCE_DEFINE_FUNCTION (line 57) | NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExt...
FILE: src/include/nvtx3/nvtxDetail/nvtxExtInit.h
function NVTX_LINKONCE_DEFINE_FUNCTION (line 109) | NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadI...
function NVTX_LINKONCE_DEFINE_FUNCTION (line 285) | NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxExtInit...
FILE: src/include/nvtx3/nvtxDetail/nvtxExtTypes.h
type nvtxExtModuleSegment_t (line 23) | typedef struct nvtxExtModuleSegment_t
type nvtxExtModuleInfo_t (line 30) | typedef struct nvtxExtModuleInfo_t
FILE: src/include/nvtx3/nvtxDetail/nvtxImpl.h
type nvtxGlobals_t (line 89) | typedef struct nvtxGlobals_t
function NVTX_API (line 359) | NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiGetModuleFunctionTable)(
function NVTX_LINKONCE_DEFINE_FUNCTION (line 405) | NVTX_LINKONCE_DEFINE_FUNCTION const void* NVTX_API NVTX_VERSIONED_IDENTI...
functi
Copy disabled (too large)
Download .json
Condensed preview — 772 files, each showing path, character count, and a content snippet. Download the .json file for the full structured content (22,834K chars).
[
{
"path": ".azuredevops/multinode-ci-nightly.yml",
"chars": 1956,
"preview": "resources:\n repositories:\n - repository: pipelines_repo\n type: github\n endpoint: ROCm\n name: ROCm/ROCm\n\nvaria"
},
{
"path": ".azuredevops/multinode-ci-pr.yml",
"chars": 2063,
"preview": "resources:\n repositories:\n - repository: pipelines_repo\n type: github\n endpoint: ROCm\n name: ROCm/ROCm\n\nvaria"
},
{
"path": ".azuredevops/multinode-ci-slurm-nightly.yml",
"chars": 992,
"preview": "resources:\n repositories:\n - repository: pipelines_repo\n type: github\n endpoint: ROCm\n name: ROCm/ROCm\n\nvaria"
},
{
"path": ".azuredevops/multinode-ci-slurm-pr.yml",
"chars": 1030,
"preview": "resources:\n repositories:\n - repository: pipelines_repo\n type: github\n endpoint: ROCm\n name: ROCm/ROCm\n\nvaria"
},
{
"path": ".azuredevops/rocm-ci.yml",
"chars": 1432,
"preview": "variables:\n- group: common\n- template: /.azuredevops/variables-global.yml@pipelines_repo\n\nparameters:\n- name: pipelinesR"
},
{
"path": ".azuredevops/slurm/build.sh",
"chars": 1660,
"preview": "#!/bin/bash\n#SBATCH --job-name=rccl-build\n#SBATCH --output=rccl-build-%j.out\n#SBATCH --error=rccl-build-%j.out\n#SBATCH -"
},
{
"path": ".azuredevops/slurm/test_rccl-UnitTests.sh",
"chars": 567,
"preview": "#!/bin/bash\n#SBATCH --job-name=rccl-UnitTests\n#SBATCH --output=%x-%j.out\n#SBATCH --error=%x-%j.out\n#SBATCH --time=180\n#S"
},
{
"path": ".azuredevops/slurm/test_rccl-tests.sh",
"chars": 2509,
"preview": "#!/bin/bash\n#SBATCH --job-name=rccl-tests\n#SBATCH --output=%x-%j.out\n#SBATCH --error=%x-%j.out\n#SBATCH --time=60\n#SBATCH"
},
{
"path": ".azuredevops/templates/build.yml",
"chars": 2977,
"preview": "# small subset of files to check for install to determine pass/fail\nparameters:\n- name: expectedInstallFiles\n type: obj"
},
{
"path": ".azuredevops/templates/test_rccl-UnitTests.yml",
"chars": 2603,
"preview": "steps:\n - task: Bash@3\n displayName: RCCL UnitTests\n env:\n BINARIES_DIR: $(Build.BinariesDirectory)\n PI"
},
{
"path": ".azuredevops/templates/test_rccl-tests.yml",
"chars": 2934,
"preview": "steps:\n - task: Bash@3\n displayName: RCCL-Tests\n env:\n BINARIES_DIR: $(Build.BinariesDirectory)\n PIPELI"
},
{
"path": ".azuredevops/tests/pytest/HelloWorld.py",
"chars": 108,
"preview": "import pytest\n\ndef test_HelloWorld():\n greeting = \"Hello, World!\"\n assert greeting == \"Hello, World!\"\n"
},
{
"path": ".clang-format",
"chars": 3829,
"preview": "# Style file for MLSE Libraries based on the modified rocBLAS style\n\n# Common settings\nBasedOnStyle: WebKit\nTabWidth: "
},
{
"path": ".github/CODEOWNERS",
"chars": 231,
"preview": "* @ROCm/rccl-reviewers\n\n# Documentation files\ndocs/ @ROCm/rocm-documentation\n*.md @ROCm/rocm-documentation\n*.rst @ROCm/r"
},
{
"path": ".github/PULL_REQUEST_TEMPLATE.md",
"chars": 822,
"preview": "## Details\n___Do not mention proprietary info or link to internal work items in this PR.___\n\n**Work item:** _\"Internal\","
},
{
"path": ".github/dependabot.yml",
"chars": 0,
"preview": ""
},
{
"path": ".github/scripts/therock_configure_ci.py",
"chars": 4591,
"preview": "# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.\n# SPDX-License-Identifier: MIT \n\nimport fnmatch\nimport "
},
{
"path": ".github/workflows/therock-ci-linux.yml",
"chars": 4910,
"preview": "name: TheRock CI Linux\n\non:\n workflow_call:\n inputs:\n amdgpu_families:\n type: string\n artifact_grou"
},
{
"path": ".github/workflows/therock-ci.yml",
"chars": 2675,
"preview": "name: TheRock CI for rccl\n\non:\n push:\n branches:\n - develop\n pull_request:\n types:\n - labeled\n - "
},
{
"path": ".github/workflows/therock-test-packages-multi-node.yml",
"chars": 3268,
"preview": "name: TheRock Test Packages multi-node\n\non:\n workflow_call:\n inputs:\n amdgpu_families:\n type: string\n "
},
{
"path": ".github/workflows/therock-test-packages-single-node.yml",
"chars": 2525,
"preview": "name: TheRock Test Packages single-node\n\non:\n workflow_call:\n inputs:\n amdgpu_families:\n type: string\n "
},
{
"path": ".gitignore",
"chars": 156,
"preview": "# Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.\n*.gcov\n/coverage/\nbuild/\next/\nsrc/transport/net_ib_r"
},
{
"path": ".gitmodules",
"chars": 376,
"preview": "[submodule \"ext-src/mscclpp\"]\n\tpath = ext-src/mscclpp\n\turl = https://github.com/microsoft/mscclpp.git\n\tignore = dirty\n\ts"
},
{
"path": ".readthedocs.yaml",
"chars": 322,
"preview": "# Read the Docs configuration file\n# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details\n\nversion:"
},
{
"path": "CHANGELOG.md",
"chars": 18165,
"preview": "# Changelog for RCCL\n\nFull documentation for RCCL is available at [https://rccl.readthedocs.io](https://rccl.readthedocs"
},
{
"path": "CMakeLists.txt",
"chars": 63190,
"preview": "# Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.\n# Modifications Copyright (c) Microsoft Corp"
},
{
"path": "CppCheckSuppressions.txt",
"chars": 3337,
"preview": "arrayIndexThenCheck:src/bootstrap.cc:304\narrayIndexThenCheck:src/debug.cc:88\narrayIndexThenCheck:src/graph/search.cc:844"
},
{
"path": "LICENSE.txt",
"chars": 2089,
"preview": "\nAttributions\n\nContains contributions from NVIDIA.\n\nCopyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.\nMo"
},
{
"path": "Makefile",
"chars": 662,
"preview": "#\n# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.\n#\n# See LICENSE.txt for license information\n#\n.PHO"
},
{
"path": "NOTICES.txt",
"chars": 6071,
"preview": "Notices and Licenses file\n_______________________________________________________________\n\nDependencies on nvidia-nccl v"
},
{
"path": "README.md",
"chars": 8233,
"preview": "# RCCL\n\n> [!CAUTION]\n> The rccl repository is retired, please use the [ROCm/rocm-systems](https://github.com/ROCm/rocm-s"
},
{
"path": "cmake/CheckSymbolExistsNoWarn.cmake",
"chars": 1750,
"preview": "# MIT License\n#\n# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.\n#\n# Permission is hereby granted,"
},
{
"path": "cmake/Dependencies.cmake",
"chars": 8267,
"preview": "# MIT License\n#\n# Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.\n#\n# Permission is hereby granted,"
},
{
"path": "cmake/DownloadProject.CMakeLists.cmake.in",
"chars": 495,
"preview": "# Distributed under the OSI-approved MIT License. See accompanying\n# file LICENSE or https://github.com/Crascit/Downloa"
},
{
"path": "cmake/DownloadProject.cmake",
"chars": 7637,
"preview": "# Distributed under the OSI-approved MIT License. See accompanying\n# file LICENSE or https://github.com/Crascit/Downloa"
},
{
"path": "cmake/FindIBVerbs.cmake",
"chars": 1605,
"preview": "# MIT License\n#\n# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.\n#\n# Permission is hereby granted,"
},
{
"path": "cmake/Findmscclpp_nccl.cmake",
"chars": 1532,
"preview": "# MIT License\n#\n# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.\n#\n# Permission is hereby granted,"
},
{
"path": "cmake/Findrocshmem_static.cmake",
"chars": 1664,
"preview": "# MIT License\n#\n# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.\n#\n# Permission is hereby granted,"
},
{
"path": "cmake/MSCCLPP.cmake",
"chars": 9465,
"preview": "# MIT License\n#\n# Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.\n#\n# Permission is hereby granted,"
},
{
"path": "cmake/ROCSHMEM.cmake",
"chars": 5050,
"preview": "# MIT License\n#\n# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.\n#\n# Permission is hereby granted,"
},
{
"path": "cmake/rcclRAS.cmake",
"chars": 791,
"preview": "# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.\n\ncmake_minimum_required(VERSION 3.16)\n\nmessage(\"Buildin"
},
{
"path": "cmake/rocmIb.cmake",
"chars": 9328,
"preview": "# MIT License\n#\n# Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.\n#\n# Permission is hereby granted,"
},
{
"path": "cmake/scripts/add_faults.sh",
"chars": 1329,
"preview": "# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.\n#\n# Permission is hereby granted, free of charge,"
},
{
"path": "cmake/scripts/add_unroll.sh",
"chars": 3044,
"preview": "# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.\n#\n# Permission is hereby granted, free of charge,"
},
{
"path": "cmake/scripts/extract_metadata.cmake",
"chars": 3398,
"preview": "# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.\n#\n# Permission is hereby granted, free of charge,"
},
{
"path": "cmake/scripts/git_version.cmake",
"chars": 3145,
"preview": "# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.\n#\n# Permission is hereby granted, free of charge,"
},
{
"path": "docker/Dockerfile.ubuntu",
"chars": 3457,
"preview": "## base docker image\nARG ROCM_IMAGE_NAME=rocm/dev-ubuntu-22.04\nARG ROCM_IMAGE_TAG=latest\nFROM \"${ROCM_IMAGE_NAME}:${ROCM"
},
{
"path": "docker/README.md",
"chars": 1845,
"preview": "# Using RCCL/RCCL-Tests in a docker environment\n\n## Docker build\n\nAssuming you have docker installed on your system:\n\n##"
},
{
"path": "docs/.gitignore",
"chars": 64,
"preview": "_build/\r\n_doxygen/\r\ndoxygen/html\r\ndoxygen/xml\r\nsphinx/_toc.yml\r\n"
},
{
"path": "docs/api-reference/api-library.rst",
"chars": 1134,
"preview": ".. meta::\n :description: RCCL is a stand-alone library that provides multi-GPU and multi-node collective communication"
},
{
"path": "docs/api-reference/env-variables.rst",
"chars": 5072,
"preview": ".. meta::\n :description: RCCL is a stand-alone library that provides multi-GPU and multi-node collective communication"
},
{
"path": "docs/api-reference/library-specification.rst",
"chars": 2931,
"preview": ".. meta::\n :description: RCCL is a stand-alone library that provides multi-GPU and multi-node collective communication"
},
{
"path": "docs/attributions.rst",
"chars": 2129,
"preview": ".. meta::\n :description: RCCL attributions information\n :keywords: RCCL, ROCm, library, API, attributions\n\n.. toctre"
},
{
"path": "docs/conf.py",
"chars": 1304,
"preview": "# Configuration file for the Sphinx documentation builder.\n#\n# This file only contains a selection of the most common op"
},
{
"path": "docs/doxygen/Doxyfile",
"chars": 112470,
"preview": "# Doxyfile 1.8.17\n\n# This file describes the settings to be used by the documentation system\n# doxygen (www.doxygen.org)"
},
{
"path": "docs/how-to/rccl-usage-tips.rst",
"chars": 12249,
"preview": ".. meta::\n :description: Usage tips for the RCCL library of collective communication primitives\n :keywords: RCCL, RO"
},
{
"path": "docs/how-to/troubleshooting-rccl.rst",
"chars": 8103,
"preview": ".. meta::\n :description: A guide to troubleshooting the RCCL library of multi-GPU and multi-node collective communicat"
},
{
"path": "docs/how-to/using-nccl.rst",
"chars": 20266,
"preview": ".. meta::\n :description: How to use the NCCL Net API\n :keywords: RCCL, ROCm, library, API, NCCL Net, plugin\n\n.. _usi"
},
{
"path": "docs/how-to/using-rccl-tuner-plugin-api.rst",
"chars": 5937,
"preview": ".. meta::\n :description: How to use the RCCL Tuner plugin API\n :keywords: RCCL, ROCm, library, API, Tuner, plugin\n\n."
},
{
"path": "docs/index.rst",
"chars": 1838,
"preview": ".. meta::\n :description: RCCL is a stand-alone library that provides multi-GPU and multi-node collective communication"
},
{
"path": "docs/install/building-installing.rst",
"chars": 3783,
"preview": ".. meta::\r\n :description: Information on how to build the RCCL library from source code\r\n :keywords: RCCL, ROCm, lib"
},
{
"path": "docs/install/docker-install.rst",
"chars": 2317,
"preview": ".. meta::\n :description: Instruction on how to install the RCCL library for collective communication primitives using "
},
{
"path": "docs/install/installation.rst",
"chars": 4477,
"preview": ".. meta::\r\n :description: Instruction on how to install the RCCL library for collective communication primitives using"
},
{
"path": "docs/license.rst",
"chars": 156,
"preview": ".. meta::\r\n :description: RCCL licensing information\r\n :keywords: RCCL, ROCm, library, API, license\r\n\r\nLicense\r\n===="
},
{
"path": "docs/sphinx/_toc.yml.in",
"chars": 1147,
"preview": "root: index\nsubtrees:\n\n- entries:\n - file: what-is-rccl.rst\n title: What is RCCL?\n\n- caption: Install\n entries:\n -"
},
{
"path": "docs/sphinx/requirements.in",
"chars": 23,
"preview": "rocm-docs-core==1.29.0\n"
},
{
"path": "docs/sphinx/requirements.txt",
"chars": 5257,
"preview": "#\n# This file is autogenerated by pip-compile with Python 3.10\n# by the following command:\n#\n# pip-compile requiremen"
},
{
"path": "docs/what-is-rccl.rst",
"chars": 1610,
"preview": ".. meta::\n :description: RCCL is a stand-alone library that provides multi-GPU and multi-node collective communication"
},
{
"path": "ext-net/README.md",
"chars": 23114,
"preview": "# NCCL Net Plugin Documentation\n\nThis page describes the NCCL Net plugin API and how to implement a network plugin for N"
},
{
"path": "ext-net/example/CMakeLists.txt",
"chars": 491,
"preview": "set(SRC_FILES\n ${CMAKE_CURRENT_SOURCE_DIR}/plugin.c\n)\n\n# Create shared library\nadd_library(nccl-net-example SHARED ${"
},
{
"path": "ext-net/example/Makefile",
"chars": 512,
"preview": "#\n# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.\n#\n# See LICENSE.txt for license information\n#\n.DEF"
},
{
"path": "ext-net/example/nccl/common.h",
"chars": 1125,
"preview": "/*************************************************************************\n * Copyright (c) 2024, NVIDIA CORPORATION. Al"
},
{
"path": "ext-net/example/nccl/err.h",
"chars": 513,
"preview": "/*\n * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.\n */\n\n#ifndef NCCL_ERR_H_\n#define NCCL_ERR_H_\n\n/*"
},
{
"path": "ext-net/example/nccl/net.h",
"chars": 982,
"preview": "/*\n * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.\n */\n\n#ifndef NET_H_\n#define NET_H_\n\n#include <st"
},
{
"path": "ext-net/example/nccl/net_device.h",
"chars": 1309,
"preview": "/*************************************************************************\n * Copyright (c) 2023-2023, NVIDIA CORPORATIO"
},
{
"path": "ext-net/example/nccl/net_v10.h",
"chars": 5312,
"preview": "/*\n * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.\n */\n\n#ifndef NET_V10_H_\n#define NET_V10_H_\n\ntype"
},
{
"path": "ext-net/example/nccl/net_v11.h",
"chars": 5923,
"preview": "/*\n * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.\n */\n\n#ifndef NET_V11_H_\n#define NET_V11_H_\n\ntype"
},
{
"path": "ext-net/example/nccl/net_v2.h",
"chars": 2580,
"preview": "/*\n * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.\n */\n\n#ifndef NET_V2_H_\n#define NET_V2_H_\n\ntypede"
},
{
"path": "ext-net/example/nccl/net_v3.h",
"chars": 2370,
"preview": "/*\n * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.\n */\n\n#ifndef NET_V3_H_\n#define NET_V3_H_\n\n#defin"
},
{
"path": "ext-net/example/nccl/net_v4.h",
"chars": 2867,
"preview": "/*\n * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.\n */\n\n#ifndef NET_V4_H_\n#define NET_V4_H_\n\n#defin"
},
{
"path": "ext-net/example/nccl/net_v5.h",
"chars": 2816,
"preview": "/*\n * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.\n */\n\n#ifndef NET_V5_H_\n#define NET_V5_H_\n\ntypede"
},
{
"path": "ext-net/example/nccl/net_v6.h",
"chars": 3490,
"preview": "/*\n * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.\n */\n\n#ifndef NET_V6_H_\n#define NET_V6_H_\n\ntypede"
},
{
"path": "ext-net/example/nccl/net_v7.h",
"chars": 4173,
"preview": "/*\n * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.\n */\n\n#ifndef NET_V7_H_\n#define NET_V7_H_\n\ntypede"
},
{
"path": "ext-net/example/nccl/net_v8.h",
"chars": 4466,
"preview": "/*\n * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.\n */\n\n#ifndef NET_V8_H_\n#define NET_V8_H_\n\ntypede"
},
{
"path": "ext-net/example/nccl/net_v9.h",
"chars": 5063,
"preview": "/*\n * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.\n */\n\n#ifndef NET_V9_H_\n#define NET_V9_H_\n\ntypede"
},
{
"path": "ext-net/example/nccl/types.h",
"chars": 687,
"preview": "/*\n * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.\n */\n\n#ifndef NCCL_TYPES_H_\n#define NCCL_TYPES_H_"
},
{
"path": "ext-net/example/plugin.c",
"chars": 19121,
"preview": "/*************************************************************************\n * Copyright (c) 2015-2024, NVIDIA CORPORATIO"
},
{
"path": "ext-net/google-fastsocket/Makefile",
"chars": 504,
"preview": "CUDA_HOME?=/usr/local/cuda\nINC:=-I$(CUDA_HOME)/include\nPLUGIN_SO:=libnccl-net.so\n\ndefault: $(PLUGIN_SO)\n\n$(PLUGIN_SO): n"
},
{
"path": "ext-profiler/README.md",
"chars": 24567,
"preview": "# NCCL Profiler Plugin Documentation\n\nThis page describes the NCCL Profiler plugin API and how to implement a profiler p"
},
{
"path": "ext-profiler/example/CMakeLists.txt",
"chars": 1259,
"preview": "# Find all C source files in current directory\nset(SRC_FILES\n ${CMAKE_CURRENT_SOURCE_DIR}/plugin.cc\n ${CMAKE_CURRE"
},
{
"path": "ext-profiler/example/Makefile",
"chars": 886,
"preview": "#\n# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.\n#\n# See LICENSE.txt for license information\n#\n.DEFAULT_"
},
{
"path": "ext-profiler/example/README.md",
"chars": 12618,
"preview": "# NCCL Example Profiler Plugin Usage\n\nThis page describes how to use the NCCL example profiler plugin\n\n# Overview\n\nThe e"
},
{
"path": "ext-profiler/example/event.h",
"chars": 8760,
"preview": "/*************************************************************************\n * Copyright (c) 2024, NVIDIA CORPORATION. Al"
},
{
"path": "ext-profiler/example/nccl/common.h",
"chars": 851,
"preview": "/*************************************************************************\n * Copyright (c) 2024, NVIDIA CORPORATION. Al"
},
{
"path": "ext-profiler/example/nccl/err.h",
"chars": 697,
"preview": "/*************************************************************************\n * Copyright (c) 2024, NVIDIA CORPORATION. Al"
},
{
"path": "ext-profiler/example/nccl/net_ib_v1.h",
"chars": 1100,
"preview": "/*************************************************************************\n * Copyright (c) 2024, NVIDIA CORPORATION. Al"
},
{
"path": "ext-profiler/example/nccl/net_socket_v1.h",
"chars": 959,
"preview": "/*************************************************************************\n * Copyright (c) 2024, NVIDIA CORPORATION. Al"
},
{
"path": "ext-profiler/example/nccl/profiler.h",
"chars": 3366,
"preview": "/*************************************************************************\n * Copyright (c) 2024, NVIDIA CORPORATION. Al"
},
{
"path": "ext-profiler/example/nccl/profiler_net.h",
"chars": 729,
"preview": "/*************************************************************************\n * Copyright (c) 2024, NVIDIA CORPORATION. Al"
},
{
"path": "ext-profiler/example/nccl/profiler_v1.h",
"chars": 3353,
"preview": "/*************************************************************************\n * Copyright (c) 2024, NVIDIA CORPORATION. Al"
},
{
"path": "ext-profiler/example/nccl/profiler_v2.h",
"chars": 3319,
"preview": "/*************************************************************************\n * Copyright (c) 2024, NVIDIA CORPORATION. Al"
},
{
"path": "ext-profiler/example/nccl/profiler_v3.h",
"chars": 3414,
"preview": "/*************************************************************************\n * Copyright (c) 2024, NVIDIA CORPORATION. Al"
},
{
"path": "ext-profiler/example/nccl/profiler_v4.h",
"chars": 3876,
"preview": "/*************************************************************************\n * Copyright (c) 2024, NVIDIA CORPORATION. Al"
},
{
"path": "ext-profiler/example/nccl/profiler_v5.h",
"chars": 4441,
"preview": "/*************************************************************************\n * Copyright (c) 2024, NVIDIA CORPORATION. Al"
},
{
"path": "ext-profiler/example/nccl/types.h",
"chars": 615,
"preview": "/*\n * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.\n */\n\n#ifndef NCCL_TYPES_H_\n#define NCCL_TYPES_H_"
},
{
"path": "ext-profiler/example/plugin.cc",
"chars": 34615,
"preview": "/*************************************************************************\n * Copyright (c) 2024, NVIDIA CORPORATION. Al"
},
{
"path": "ext-profiler/example/plugin.h",
"chars": 481,
"preview": "/*************************************************************************\n * Copyright (c) 2024, NVIDIA CORPORATION. Al"
},
{
"path": "ext-profiler/example/print_event.cc",
"chars": 22261,
"preview": "/*************************************************************************\n * Copyright (c) 2024, NVIDIA CORPORATION. Al"
},
{
"path": "ext-profiler/example/print_event.h",
"chars": 464,
"preview": "/*************************************************************************\n * Copyright (c) 2024, NVIDIA CORPORATION. Al"
},
{
"path": "ext-profiler/example/queue.h",
"chars": 1276,
"preview": "/*************************************************************************\n * Copyright (c) 2025, NVIDIA CORPORATION. Al"
},
{
"path": "ext-profiler/google-CoMMA/Makefile",
"chars": 370,
"preview": ".PHONY: build-CoMMA\n\nall: build-CoMMA\n\nbuild-CoMMA: clone-CoMMA\n\tcd CoMMA && cargo build\n\nclone-CoMMA:\n\t@if [ ! -d CoMMA"
},
{
"path": "ext-profiler/inspector/Makefile",
"chars": 1674,
"preview": "#\n# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.\n#\n# See LICENSE.txt for license information\n#\n\n# Variab"
},
{
"path": "ext-profiler/inspector/README.md",
"chars": 6662,
"preview": "# NCCL Inspector Plugin\n\nThe NCCL Inspector is a plugin for the NVIDIA Collective Communications Library (NCCL) that pro"
},
{
"path": "ext-profiler/inspector/exporter/example/README.md",
"chars": 5047,
"preview": "# NCCL Inspector Performance Summary Exporter\n\nThis tool processes NCCL Inspector log files and generates comprehensive "
},
{
"path": "ext-profiler/inspector/exporter/example/perf_summary_exporter.py",
"chars": 20278,
"preview": "from pathlib import Path\nimport argparse\nimport glob\nimport gzip\nimport sys\nimport pandas as pd\nfrom concurrent.futures "
},
{
"path": "ext-profiler/inspector/exporter/example/requirements.txt",
"chars": 88,
"preview": "pandas>=1.3.0\ntqdm>=4.60.0\nduckdb>=0.8.0\nmatplotlib>=3.3.0\npyarrow>=5.0.0\nnumpy>=1.21.0\n"
},
{
"path": "ext-profiler/inspector/inspector.cc",
"chars": 47950,
"preview": "#include \"inspector.h\"\n\n#include <assert.h>\n#include <stdio.h>\n#include <stdlib.h>\n#include <strings.h>\n#include <sys/st"
},
{
"path": "ext-profiler/inspector/inspector.h",
"chars": 5465,
"preview": "#pragma once\n\n#include <pthread.h>\n\n#include \"json.h\"\n#include \"common.h\"\n#include \"version.h\"\n\n#define MAX_CHANNELS "
},
{
"path": "ext-profiler/inspector/inspector_plugin.cc",
"chars": 16381,
"preview": "/*************************************************************************\n * Copyright (c) 2025, NVIDIA CORPORATION. Al"
},
{
"path": "ext-profiler/inspector/json.cc",
"chars": 12881,
"preview": "#include \"json.h\"\n#include <assert.h>\n#include <math.h>\n#include <pthread.h>\n#include <stdio.h>\n#include <stdlib.h>\n#inc"
},
{
"path": "ext-profiler/inspector/json.h",
"chars": 2163,
"preview": "#pragma once\n\n#include <stdbool.h>\n#include <stdint.h>\n#include <stddef.h>\n\ntypedef enum {\n JSON_NONE, // A pseudo-stat"
},
{
"path": "ext-profiler/inspector/nccl/common.h",
"chars": 2526,
"preview": "/*************************************************************************\n * Copyright (c) 2024, NVIDIA CORPORATION. Al"
},
{
"path": "ext-profiler/inspector/nccl/profiler.h",
"chars": 3366,
"preview": "/*************************************************************************\n * Copyright (c) 2024, NVIDIA CORPORATION. Al"
},
{
"path": "ext-profiler/inspector/nccl/profiler_net.h",
"chars": 678,
"preview": "/*************************************************************************\n * Copyright (c) 2024, NVIDIA CORPORATION. Al"
},
{
"path": "ext-profiler/inspector/nccl/profiler_v1.h",
"chars": 3397,
"preview": "/*************************************************************************\n * Copyright (c) 2024, NVIDIA CORPORATION. Al"
},
{
"path": "ext-profiler/inspector/nccl/profiler_v2.h",
"chars": 3362,
"preview": "/*************************************************************************\n * Copyright (c) 2024, NVIDIA CORPORATION. Al"
},
{
"path": "ext-profiler/inspector/nccl/profiler_v3.h",
"chars": 3457,
"preview": "/*************************************************************************\n * Copyright (c) 2024, NVIDIA CORPORATION. Al"
},
{
"path": "ext-profiler/inspector/nccl/profiler_v4.h",
"chars": 3940,
"preview": "/*************************************************************************\n * Copyright (c) 2024, NVIDIA CORPORATION. Al"
},
{
"path": "ext-profiler/inspector/nccl/profiler_v5.h",
"chars": 4441,
"preview": "/*************************************************************************\n * Copyright (c) 2024, NVIDIA CORPORATION. Al"
},
{
"path": "ext-profiler/inspector/nccl/types.h",
"chars": 615,
"preview": "/*\n * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.\n */\n\n#ifndef NCCL_TYPES_H_\n#define NCCL_TYPES_H_"
},
{
"path": "ext-profiler/inspector/version.h",
"chars": 178,
"preview": "#ifndef VERSION_H\n#define VERSION_H\n\n#ifdef __cplusplus\nextern \"C\" {\n#endif\nconst char* get_git_version_info();\n#ifdef _"
},
{
"path": "ext-src/bf16-tuning.patch",
"chars": 1090,
"preview": "diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp\nindex 7a2cd4a..a14dfbc 100644\n--- a/apps/nccl/src"
},
{
"path": "ext-src/check_ibv_access_relaxed_ordering.cc",
"chars": 248,
"preview": "#include <stdio.h>\n#include <infiniband/verbs.h>\n\nint main(void) {\n enum ibv_access_flags has_ibv_access_relaxed_orderi"
},
{
"path": "ext-src/cpx.patch",
"chars": 401,
"preview": "diff --git a/src/numa.cc b/src/numa.cc\nindex d72c99e..16c903d 100644\n--- a/src/numa.cc\n+++ b/src/numa.cc\n@@ -26,6 +26,7 "
},
{
"path": "ext-src/device-flag.patch",
"chars": 9767,
"preview": "diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp\nindex 9f46ff9..fac105a 100644\n--- a/apps/nccl/src"
},
{
"path": "ext-src/disable-executor.patch",
"chars": 12625,
"preview": "diff --git a/CMakeLists.txt b/CMakeLists.txt\nindex a94b634..fee3bb2 100644\n--- a/CMakeLists.txt\n+++ b/CMakeLists.txt\n@@ "
},
{
"path": "ext-src/disable-format-checks.patch",
"chars": 389,
"preview": "diff --git a/CMakeLists.txt b/CMakeLists.txt\nindex a94b634..09ca1fa 100644\n--- a/CMakeLists.txt\n+++ b/CMakeLists.txt\n@@ "
},
{
"path": "ext-src/mem-reg.patch",
"chars": 5831,
"preview": "diff --git a/apps/nccl/include/nccl.h b/apps/nccl/include/nccl.h\nindex bfdb226..70d15cf 100644\n--- a/apps/nccl/include/n"
},
{
"path": "ext-src/mscclpp_ibv_access_relaxed_ordering.patch",
"chars": 2230,
"preview": "diff --git a/CMakeLists.txt b/CMakeLists.txt\nindex a95a8e5..62b4f22 100644\n--- a/CMakeLists.txt\n+++ b/CMakeLists.txt\n@@ "
},
{
"path": "ext-src/no-cache.patch",
"chars": 19949,
"preview": "diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp\nindex a14dfbc..66596f3 100644\n--- a/apps/nccl/src"
},
{
"path": "ext-src/non-multiple-128-fix.patch",
"chars": 665,
"preview": "diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp\nindex 76674ba..7a2cd4a 100644\n--- a/apps/nccl/src"
},
{
"path": "ext-src/read-allred.patch",
"chars": 24000,
"preview": "diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp\nindex 4134241..76674ba 100644\n--- a/apps/nccl/src"
},
{
"path": "ext-src/reg-fix.patch",
"chars": 1837,
"preview": "diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu\nindex 5c19dc6..5fb99ef 100644\n--- a/apps/nccl/src/nccl.cu\n+++"
},
{
"path": "ext-src/remove-clip.patch",
"chars": 2081,
"preview": "diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp\nindex fac105a..9ef93ce 100644\n--- a/apps/nccl/src"
},
{
"path": "ext-src/rocm_netib.patch",
"chars": 35909,
"preview": "diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc\nindex 9bfd8dcf..4d3f0a08 100644\n--- a/src/transport/net_i"
},
{
"path": "ext-tuner/README.md",
"chars": 6306,
"preview": "# NCCL Tuner Plugin Development\n\nThis directory contains resources and examples for developing NCCL tuner plugins. Tuner"
},
{
"path": "ext-tuner/basic/Makefile",
"chars": 555,
"preview": "#\n# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.\n#\n# See LICENSE.txt for license information\n#\n.DEF"
},
{
"path": "ext-tuner/basic/README.md",
"chars": 6060,
"preview": "# Basic NCCL Tuner Plugin\n\nThis directory contains a minimal placeholder implementation of an NCCL tuner plugin. It serv"
},
{
"path": "ext-tuner/basic/nccl/common.h",
"chars": 851,
"preview": "/*************************************************************************\n * Copyright (c) 2024, NVIDIA CORPORATION. Al"
},
{
"path": "ext-tuner/basic/nccl/err.h",
"chars": 513,
"preview": "/*\n * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.\n */\n\n#ifndef NCCL_ERR_H_\n#define NCCL_ERR_H_\n\n/*"
},
{
"path": "ext-tuner/basic/nccl/tuner.h",
"chars": 3407,
"preview": "/*************************************************************************\n * Copyright (c) 2023, NVIDIA CORPORATION. Al"
},
{
"path": "ext-tuner/basic/plugin.c",
"chars": 1364,
"preview": "/*************************************************************************\n * Copyright (c) 2015-2019, NVIDIA CORPORATIO"
},
{
"path": "ext-tuner/example/.gitignore",
"chars": 533,
"preview": "# Compiled shared objects and binaries\n*.so\n*.o\n*.a\n*.out\n*.exe\n*.dll\n*.dylib\n*.bin\n*.elf\n\n# Python cache\n__pycache__/\n*"
},
{
"path": "ext-tuner/example/CMakeLists.txt",
"chars": 794,
"preview": "# Find all C source files in current directory\nset(SRC_FILES\n ${CMAKE_CURRENT_SOURCE_DIR}/plugin.c\n)\n\n# Create shared"
},
{
"path": "ext-tuner/example/Makefile",
"chars": 1464,
"preview": "#\n# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.\n#\n# See LICENSE.txt for license information\n#\n\n.DE"
},
{
"path": "ext-tuner/example/README.md",
"chars": 6896,
"preview": "# NCCL Example Tuner Plugin\n\nThis example plugin shows a practical example of a CSV file-based tuning approach, allowing"
},
{
"path": "ext-tuner/example/nccl/common.h",
"chars": 851,
"preview": "/*************************************************************************\n * Copyright (c) 2024, NVIDIA CORPORATION. Al"
},
{
"path": "ext-tuner/example/nccl/err.h",
"chars": 513,
"preview": "/*\n * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.\n */\n\n#ifndef NCCL_ERR_H_\n#define NCCL_ERR_H_\n\n/*"
},
{
"path": "ext-tuner/example/nccl/tuner.h",
"chars": 4929,
"preview": "/*************************************************************************\n * Copyright (c) 2023, NVIDIA CORPORATION. Al"
},
{
"path": "ext-tuner/example/nccl_tuner.conf",
"chars": 2288,
"preview": "# NCCL Tuner Configuration File (CSV Format)\n# Format: collective_type,min_bytes,max_bytes,algorithm,protocol,channels,n"
},
{
"path": "ext-tuner/example/plugin.c",
"chars": 20227,
"preview": "/*************************************************************************\n * Copyright (c) 2015-2019, NVIDIA CORPORATIO"
},
{
"path": "ext-tuner/example/scripts/README.md",
"chars": 3576,
"preview": "# NCCL Tuner Configuration Scripts\n\nThis directory contains scripts for optimizing NCCL tuner configurations based on pe"
},
{
"path": "ext-tuner/example/scripts/optimize_config.py",
"chars": 19475,
"preview": "#!/usr/bin/env python3\n\"\"\"\nNCCL Tuner Configuration Optimizer\n\nReads a CSV file containing performance data across diffe"
},
{
"path": "ext-tuner/example/test/Makefile",
"chars": 582,
"preview": "#\n# Makefile for NCCL Tuner Plugin Unit Tests\n#\n\nCC := gcc\nCFLAGS := -Wall -Wextra -g -std=c99 -fPIC\nINC := -I. -I../ncc"
},
{
"path": "ext-tuner/example/test/README.md",
"chars": 4802,
"preview": "# NCCL Tuner Plugin Unit Tests\n\nThis directory contains comprehensive unit tests for the NCCL tuner plugin. The tests ve"
},
{
"path": "ext-tuner/example/test/test_plugin.c",
"chars": 37596,
"preview": "/*************************************************************************\n * Unit tests for NCCL Tuner Plugin\n ********"
},
{
"path": "ext-tuner/model_demo/Makefile",
"chars": 416,
"preview": "#\n# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.\n#\n# See LICENSE.txt for license information\n#\nRCCL"
},
{
"path": "ext-tuner/model_demo/README.md",
"chars": 4962,
"preview": "# RCCL Tuner Plugin API Overview\n\nThis document describes the API structure to be implemented by an external tuner plugi"
},
{
"path": "ext-tuner/model_demo/nccl/common.h",
"chars": 851,
"preview": "/*************************************************************************\n * Copyright (c) 2024, NVIDIA CORPORATION. Al"
},
{
"path": "ext-tuner/model_demo/nccl/err.h",
"chars": 513,
"preview": "/*\n * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.\n */\n\n#ifndef NCCL_ERR_H_\n#define NCCL_ERR_H_\n\n/*"
},
{
"path": "ext-tuner/model_demo/nccl/tuner.h",
"chars": 3407,
"preview": "/*************************************************************************\n * Copyright (c) 2023, NVIDIA CORPORATION. Al"
},
{
"path": "ext-tuner/model_demo/plugin.c",
"chars": 11292,
"preview": "/*************************************************************************\n * Copyright (c) 2015-2019, NVIDIA CORPORATIO"
},
{
"path": "install.sh",
"chars": 19368,
"preview": "#!/bin/bash\n# Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.\n\n# #############################"
},
{
"path": "makefiles/common.mk",
"chars": 5421,
"preview": "#\n# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.\n#\n# See LICENSE.txt for license information\n#\n\nCUD"
},
{
"path": "makefiles/formatting.mk",
"chars": 1182,
"preview": "#\n# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.\n#\n# See LICENSE.txt for license information\n#\n\n# Prereq"
},
{
"path": "makefiles/version.mk",
"chars": 103,
"preview": "##### version\nNCCL_MAJOR := 2\nNCCL_MINOR := 28\nNCCL_PATCH := 3\nNCCL_SUFFIX :=\nPKG_REVISION := 1\n"
},
{
"path": "pkg/Makefile",
"chars": 530,
"preview": "#\n# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.\n#\n# See LICENSE.txt for license information\n#\n.PHO"
},
{
"path": "pkg/debian/.gitignore",
"chars": 81,
"preview": "/*.debhelper.log\n/*.debhelper\n/*.substvars\n/tmp/\n/files\n/libnccl1/\n/libnccl-dev/\n"
},
{
"path": "pkg/debian/Makefile",
"chars": 1645,
"preview": "#\n# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.\n#\n# See LICENSE.txt for license information\n#\n\ninc"
},
{
"path": "pkg/debian/changelog.in",
"chars": 231,
"preview": "nccl (${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}) trusty; u"
},
{
"path": "pkg/debian/compat",
"chars": 2,
"preview": "9\n"
},
{
"path": "pkg/debian/control.in",
"chars": 1257,
"preview": "Source: nccl\nSection: libs\nMaintainer: cudatools <cudatools@nvidia.com>\nPriority: optional\nBuild-depends: debhelper(>=9)"
},
{
"path": "pkg/debian/gbp.conf",
"chars": 116,
"preview": "[DEFAULT]\ndebian-branch = master\nupstream-branch = master\n\nignore-new = True\n\n[git-buildpackage]\n\nno-purge = True\n"
},
{
"path": "pkg/debian/libnccl-dev.install.in",
"chars": 132,
"preview": "bin/ncclras /usr/bin\ninclude/* /usr/include\nlib/libnccl.so /usr/lib/${pkg:MultiArch}\nlib/libnccl_static.a /usr/lib/${pkg"
},
{
"path": "pkg/debian/libnccl2.install.in",
"chars": 138,
"preview": "lib/libnccl.so.${nccl:Major} /usr/lib/${pkg:MultiArch}\nlib/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} /usr/lib"
},
{
"path": "pkg/debian/rules",
"chars": 237,
"preview": "#!/usr/bin/make -f\n\n%:\n\tdh $@ --parallel\n\noverride_dh_auto_install:\n\tPREFIX=debian/tmp dh_auto_install\n\noverride_dh_auto"
},
{
"path": "pkg/debian/source/format",
"chars": 13,
"preview": "3.0 (native)\n"
},
{
"path": "pkg/redhat/Makefile",
"chars": 1988,
"preview": "#\n# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.\n#\n# See LICENSE.txt for license information\n#\n\ninc"
},
{
"path": "pkg/redhat/nccl.spec.in",
"chars": 2625,
"preview": "Name: libnccl\nVersion: ${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}\nRelease: ${pkg:Re"
},
{
"path": "pkg/srctxz/Makefile",
"chars": 1100,
"preview": "#\n# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.\n#\n# See LICENSE.txt for license information\n#\n\ninc"
},
{
"path": "pkg/srctxz/create_srctxz.sh.in",
"chars": 1513,
"preview": "#!/bin/bash\n#\n# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.\n#\n# See LICENSE.txt for license inform"
},
{
"path": "pkg/txz/Makefile",
"chars": 1213,
"preview": "#\n# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.\n#\n# See LICENSE.txt for license information\n#\n\ninc"
},
{
"path": "pkg/txz/create_txz.sh.in",
"chars": 666,
"preview": "#!/bin/bash\n#\n# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.\n#\n# See LICENSE.txt for license inform"
},
{
"path": "rtest.xml",
"chars": 499,
"preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<testset>\n <!-- Typically run with environment variables NCCL_DEBUG=INFO HSA_F"
},
{
"path": "src/CMakeLists.txt",
"chars": 4474,
"preview": "# Source files\nset(LIBSRCFILES\n bootstrap.cc\n channel.cc\n ce_coll.cc\n collectives.cc\n debug.cc\n enqueu"
},
{
"path": "src/Makefile",
"chars": 5748,
"preview": "#\n# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.\n#\n# See LICENSE.txt for license information\n#\nincl"
},
{
"path": "src/allocator.cc",
"chars": 16185,
"preview": "/*************************************************************************\n * Copyright (c) 2015-2025, NVIDIA CORPORATIO"
},
{
"path": "src/bootstrap.cc",
"chars": 49692,
"preview": "/*************************************************************************\n * Copyright (c) 2016-2022, NVIDIA CORPORATIO"
},
{
"path": "src/ce_coll.cc",
"chars": 23339,
"preview": "/*************************************************************************\n * Copyright (c) 2025, NVIDIA CORPORATION. Al"
},
{
"path": "src/channel.cc",
"chars": 8118,
"preview": "/*************************************************************************\n * Copyright (c) 2015-2022, NVIDIA CORPORATIO"
}
]
// ... and 572 more files (download for full content)
About this extraction
This page contains the full source code of the ROCmSoftwarePlatform/rccl GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 772 files (19.3 MB), approximately 5.1M tokens, and a symbol index with 5939 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.