Repository: ROCmSoftwarePlatform/rccl Branch: develop_deprecated Commit: 57e58688f44c Files: 772 Total size: 19.3 MB Directory structure: gitextract_sughb21j/ ├── .azuredevops/ │ ├── multinode-ci-nightly.yml │ ├── multinode-ci-pr.yml │ ├── multinode-ci-slurm-nightly.yml │ ├── multinode-ci-slurm-pr.yml │ ├── rocm-ci.yml │ ├── slurm/ │ │ ├── build.sh │ │ ├── test_rccl-UnitTests.sh │ │ └── test_rccl-tests.sh │ ├── templates/ │ │ ├── build.yml │ │ ├── test_rccl-UnitTests.yml │ │ └── test_rccl-tests.yml │ └── tests/ │ └── pytest/ │ └── HelloWorld.py ├── .clang-format ├── .github/ │ ├── CODEOWNERS │ ├── PULL_REQUEST_TEMPLATE.md │ ├── dependabot.yml │ ├── scripts/ │ │ └── therock_configure_ci.py │ └── workflows/ │ ├── therock-ci-linux.yml │ ├── therock-ci.yml │ ├── therock-test-packages-multi-node.yml │ └── therock-test-packages-single-node.yml ├── .gitignore ├── .gitmodules ├── .readthedocs.yaml ├── CHANGELOG.md ├── CMakeLists.txt ├── CppCheckSuppressions.txt ├── LICENSE.txt ├── Makefile ├── NOTICES.txt ├── README.md ├── cmake/ │ ├── CheckSymbolExistsNoWarn.cmake │ ├── Dependencies.cmake │ ├── DownloadProject.CMakeLists.cmake.in │ ├── DownloadProject.cmake │ ├── FindIBVerbs.cmake │ ├── Findmscclpp_nccl.cmake │ ├── Findrocshmem_static.cmake │ ├── MSCCLPP.cmake │ ├── ROCSHMEM.cmake │ ├── rcclRAS.cmake │ ├── rocmIb.cmake │ └── scripts/ │ ├── add_faults.sh │ ├── add_unroll.sh │ ├── extract_metadata.cmake │ └── git_version.cmake ├── docker/ │ ├── Dockerfile.ubuntu │ └── README.md ├── docs/ │ ├── .gitignore │ ├── api-reference/ │ │ ├── api-library.rst │ │ ├── env-variables.rst │ │ └── library-specification.rst │ ├── attributions.rst │ ├── conf.py │ ├── doxygen/ │ │ └── Doxyfile │ ├── how-to/ │ │ ├── rccl-usage-tips.rst │ │ ├── troubleshooting-rccl.rst │ │ ├── using-nccl.rst │ │ └── using-rccl-tuner-plugin-api.rst │ ├── index.rst │ ├── install/ │ │ ├── building-installing.rst │ │ ├── docker-install.rst │ │ └── installation.rst │ ├── license.rst │ ├── sphinx/ │ │ ├── _toc.yml.in 
│ │ ├── requirements.in │ │ └── requirements.txt │ └── what-is-rccl.rst ├── ext-net/ │ ├── README.md │ ├── example/ │ │ ├── CMakeLists.txt │ │ ├── Makefile │ │ ├── nccl/ │ │ │ ├── common.h │ │ │ ├── err.h │ │ │ ├── net.h │ │ │ ├── net_device.h │ │ │ ├── net_v10.h │ │ │ ├── net_v11.h │ │ │ ├── net_v2.h │ │ │ ├── net_v3.h │ │ │ ├── net_v4.h │ │ │ ├── net_v5.h │ │ │ ├── net_v6.h │ │ │ ├── net_v7.h │ │ │ ├── net_v8.h │ │ │ ├── net_v9.h │ │ │ └── types.h │ │ └── plugin.c │ └── google-fastsocket/ │ └── Makefile ├── ext-profiler/ │ ├── README.md │ ├── example/ │ │ ├── CMakeLists.txt │ │ ├── Makefile │ │ ├── README.md │ │ ├── event.h │ │ ├── nccl/ │ │ │ ├── common.h │ │ │ ├── err.h │ │ │ ├── net_ib_v1.h │ │ │ ├── net_socket_v1.h │ │ │ ├── profiler.h │ │ │ ├── profiler_net.h │ │ │ ├── profiler_v1.h │ │ │ ├── profiler_v2.h │ │ │ ├── profiler_v3.h │ │ │ ├── profiler_v4.h │ │ │ ├── profiler_v5.h │ │ │ └── types.h │ │ ├── plugin.cc │ │ ├── plugin.h │ │ ├── print_event.cc │ │ ├── print_event.h │ │ └── queue.h │ ├── google-CoMMA/ │ │ └── Makefile │ └── inspector/ │ ├── Makefile │ ├── README.md │ ├── exporter/ │ │ └── example/ │ │ ├── README.md │ │ ├── perf_summary_exporter.py │ │ └── requirements.txt │ ├── inspector.cc │ ├── inspector.h │ ├── inspector_plugin.cc │ ├── json.cc │ ├── json.h │ ├── nccl/ │ │ ├── common.h │ │ ├── profiler.h │ │ ├── profiler_net.h │ │ ├── profiler_v1.h │ │ ├── profiler_v2.h │ │ ├── profiler_v3.h │ │ ├── profiler_v4.h │ │ ├── profiler_v5.h │ │ └── types.h │ └── version.h ├── ext-src/ │ ├── bf16-tuning.patch │ ├── check_ibv_access_relaxed_ordering.cc │ ├── cpx.patch │ ├── device-flag.patch │ ├── disable-executor.patch │ ├── disable-format-checks.patch │ ├── mem-reg.patch │ ├── mscclpp_ibv_access_relaxed_ordering.patch │ ├── no-cache.patch │ ├── non-multiple-128-fix.patch │ ├── read-allred.patch │ ├── reg-fix.patch │ ├── remove-clip.patch │ └── rocm_netib.patch ├── ext-tuner/ │ ├── README.md │ ├── basic/ │ │ ├── Makefile │ │ ├── README.md │ │ ├── nccl/ │ 
│ │ ├── common.h │ │ │ ├── err.h │ │ │ └── tuner.h │ │ └── plugin.c │ ├── example/ │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── Makefile │ │ ├── README.md │ │ ├── nccl/ │ │ │ ├── common.h │ │ │ ├── err.h │ │ │ └── tuner.h │ │ ├── nccl_tuner.conf │ │ ├── plugin.c │ │ ├── scripts/ │ │ │ ├── README.md │ │ │ └── optimize_config.py │ │ └── test/ │ │ ├── Makefile │ │ ├── README.md │ │ └── test_plugin.c │ └── model_demo/ │ ├── Makefile │ ├── README.md │ ├── nccl/ │ │ ├── common.h │ │ ├── err.h │ │ └── tuner.h │ └── plugin.c ├── install.sh ├── makefiles/ │ ├── common.mk │ ├── formatting.mk │ └── version.mk ├── pkg/ │ ├── Makefile │ ├── debian/ │ │ ├── .gitignore │ │ ├── Makefile │ │ ├── changelog.in │ │ ├── compat │ │ ├── control.in │ │ ├── gbp.conf │ │ ├── libnccl-dev.install.in │ │ ├── libnccl2.install.in │ │ ├── rules │ │ └── source/ │ │ └── format │ ├── redhat/ │ │ ├── Makefile │ │ └── nccl.spec.in │ ├── srctxz/ │ │ ├── Makefile │ │ └── create_srctxz.sh.in │ └── txz/ │ ├── Makefile │ └── create_txz.sh.in ├── rtest.xml ├── src/ │ ├── CMakeLists.txt │ ├── Makefile │ ├── allocator.cc │ ├── bootstrap.cc │ ├── ce_coll.cc │ ├── channel.cc │ ├── collectives.cc │ ├── commDump.cc │ ├── debug.cc │ ├── dev_runtime.cc │ ├── device/ │ │ ├── CMakeLists.txt │ │ ├── Makefile │ │ ├── all_gather.h │ │ ├── all_reduce.h │ │ ├── alltoall_gda.h │ │ ├── alltoall_pivot.h │ │ ├── broadcast.h │ │ ├── common.cu │ │ ├── common.h │ │ ├── common_kernel.h │ │ ├── generate.py │ │ ├── msccl_kernel_impl.h │ │ ├── network/ │ │ │ └── unpack/ │ │ │ ├── unpack.h │ │ │ └── unpack_defs.h │ │ ├── onerank.cu │ │ ├── op128.h │ │ ├── primitives.h │ │ ├── prims_ll.h │ │ ├── prims_ll128.h │ │ ├── prims_simple.h │ │ ├── rccl_metadata.h │ │ ├── rccl_ptr.h │ │ ├── reduce.h │ │ ├── reduce_kernel.h │ │ ├── reduce_scatter.h │ │ ├── sendrecv.h │ │ └── symmetric/ │ │ ├── all_gather.cuh │ │ ├── all_reduce.cuh │ │ ├── generate.py │ │ ├── kernel.cuh │ │ ├── primitives.cuh │ │ └── reduce_scatter.cuh │ ├── enhcompat.cc │ 
├── enqueue.cc │ ├── graph/ │ │ ├── CMakeLists.txt │ │ ├── connect.cc │ │ ├── paths.cc │ │ ├── rings.cc │ │ ├── rings.h │ │ ├── rome_models.cc │ │ ├── rome_models.h │ │ ├── search.cc │ │ ├── topo.cc │ │ ├── topo.h │ │ ├── trees.cc │ │ ├── tuning.cc │ │ ├── xml.cc │ │ └── xml.h │ ├── group.cc │ ├── include/ │ │ ├── BfdBacktrace.hpp │ │ ├── alloc.h │ │ ├── allocator.h │ │ ├── alt_rsmi.h │ │ ├── amdsmi_wrap.h │ │ ├── api_trace.h │ │ ├── archinfo.h │ │ ├── argcheck.h │ │ ├── bitops.h │ │ ├── bootstrap.h │ │ ├── ce_coll.h │ │ ├── channel.h │ │ ├── checks.h │ │ ├── coll_net.h │ │ ├── collectives.h │ │ ├── comm.h │ │ ├── core.h │ │ ├── cpuset.h │ │ ├── cudawrap.h │ │ ├── debug.h │ │ ├── dev_runtime.h │ │ ├── device.h │ │ ├── enqueue.h │ │ ├── gdrwrap.h │ │ ├── git_version.h │ │ ├── graph.h │ │ ├── group.h │ │ ├── hip_rocm_version_info.h │ │ ├── ibvcore.h │ │ ├── ibvsymbols.h │ │ ├── ibvwrap.h │ │ ├── info.h │ │ ├── ionic/ │ │ │ ├── ionicdvcore.h │ │ │ ├── ionicdvsymbols.h │ │ │ └── ionicdvwrap.h │ │ ├── ipcsocket.h │ │ ├── latency_profiler/ │ │ │ ├── CollTrace.h │ │ │ ├── CollTraceEvent.h │ │ │ ├── CollTraceFunc.h │ │ │ ├── CollTraceUtils.h │ │ │ ├── EventQueue.h │ │ │ └── MIT-LICENSE.txt │ │ ├── mlx5/ │ │ │ ├── mlx5dvcore.h │ │ │ ├── mlx5dvsymbols.h │ │ │ └── mlx5dvwrap.h │ │ ├── mnnvl.h │ │ ├── msccl/ │ │ │ ├── msccl_kernel.h │ │ │ ├── msccl_lifecycle.h │ │ │ ├── msccl_parser.h │ │ │ ├── msccl_scheduler.h │ │ │ ├── msccl_setup.h │ │ │ ├── msccl_status.h │ │ │ └── msccl_struct.h │ │ ├── mscclpp/ │ │ │ └── mscclpp_nccl.h │ │ ├── nccl_common.h │ │ ├── nccl_device/ │ │ │ ├── README.md │ │ │ ├── comm.h │ │ │ ├── coop.h │ │ │ ├── core.h │ │ │ ├── impl/ │ │ │ │ ├── comm__funcs.h │ │ │ │ ├── comm__types.h │ │ │ │ ├── core__funcs.h │ │ │ │ ├── core__types.h │ │ │ │ ├── ll_a2a__funcs.h │ │ │ │ ├── ll_a2a__types.h │ │ │ │ ├── mem_barrier__funcs.h │ │ │ │ ├── mem_barrier__types.h │ │ │ │ ├── ptr__funcs.h │ │ │ │ └── ptr__types.h │ │ │ ├── ll_a2a.h │ │ │ ├── mem_barrier.h │ │ │ ├── 
ptr.h │ │ │ └── utility.h │ │ ├── nccl_device.h │ │ ├── net.h │ │ ├── net_device.h │ │ ├── npkit/ │ │ │ ├── npkit.h │ │ │ ├── npkit_event.h │ │ │ └── npkit_struct.h │ │ ├── nvmlwrap.h │ │ ├── nvtx.h │ │ ├── nvtx3/ │ │ │ ├── nvToolsExt.h │ │ │ ├── nvToolsExtCounters.h │ │ │ ├── nvToolsExtCuda.h │ │ │ ├── nvToolsExtCudaRt.h │ │ │ ├── nvToolsExtMem.h │ │ │ ├── nvToolsExtMemCudaRt.h │ │ │ ├── nvToolsExtOpenCL.h │ │ │ ├── nvToolsExtPayload.h │ │ │ ├── nvToolsExtPayloadHelper.h │ │ │ ├── nvToolsExtSemanticsCounters.h │ │ │ ├── nvToolsExtSemanticsScope.h │ │ │ ├── nvToolsExtSync.h │ │ │ ├── nvtx3.hpp │ │ │ └── nvtxDetail/ │ │ │ ├── nvtxExtHelperMacros.h │ │ │ ├── nvtxExtImpl.h │ │ │ ├── nvtxExtImplCounters_v1.h │ │ │ ├── nvtxExtImplMemCudaRt_v1.h │ │ │ ├── nvtxExtImplMem_v1.h │ │ │ ├── nvtxExtImplPayload_v1.h │ │ │ ├── nvtxExtInit.h │ │ │ ├── nvtxExtPayloadHelperInternal.h │ │ │ ├── nvtxExtPayloadTypeInfo.h │ │ │ ├── nvtxExtTypes.h │ │ │ ├── nvtxImpl.h │ │ │ ├── nvtxImplCore.h │ │ │ ├── nvtxImplCudaRt_v3.h │ │ │ ├── nvtxImplCuda_v3.h │ │ │ ├── nvtxImplOpenCL_v3.h │ │ │ ├── nvtxImplSync_v3.h │ │ │ ├── nvtxInit.h │ │ │ ├── nvtxInitDecls.h │ │ │ ├── nvtxInitDefs.h │ │ │ ├── nvtxLinkOnce.h │ │ │ └── nvtxTypes.h │ │ ├── nvtx_payload_schemas.h │ │ ├── nvtx_stub.h │ │ ├── p2p.h │ │ ├── param.h │ │ ├── plugin/ │ │ │ ├── nccl_net.h │ │ │ ├── nccl_profiler.h │ │ │ ├── nccl_tuner.h │ │ │ ├── net/ │ │ │ │ ├── net_v10.h │ │ │ │ ├── net_v11.h │ │ │ │ ├── net_v6.h │ │ │ │ ├── net_v7.h │ │ │ │ ├── net_v8.h │ │ │ │ └── net_v9.h │ │ │ ├── plugin.h │ │ │ ├── profiler/ │ │ │ │ ├── net_ib.h │ │ │ │ ├── net_ib_v1.h │ │ │ │ ├── net_socket.h │ │ │ │ ├── net_socket_v1.h │ │ │ │ ├── profiler_v1.h │ │ │ │ ├── profiler_v2.h │ │ │ │ ├── profiler_v3.h │ │ │ │ ├── profiler_v4.h │ │ │ │ └── profiler_v5.h │ │ │ └── tuner/ │ │ │ ├── tuner_v2.h │ │ │ ├── tuner_v3.h │ │ │ ├── tuner_v4.h │ │ │ └── tuner_v5.h │ │ ├── profiler.h │ │ ├── proxy.h │ │ ├── proxy_trace/ │ │ │ └── proxy_trace.h │ │ ├── ras.h │ │ ├── 
rccl_common.h │ │ ├── rccl_float8.h │ │ ├── rccl_vars.h │ │ ├── recorder.h │ │ ├── register.h │ │ ├── register_inline.h │ │ ├── rocm_smi_wrap.h │ │ ├── rocmwrap.h │ │ ├── roctx.h │ │ ├── scheduler.h │ │ ├── shm.h │ │ ├── shmutils.h │ │ ├── signals.h │ │ ├── socket.h │ │ ├── strongstream.h │ │ ├── sym_kernels.h │ │ ├── timer.h │ │ ├── transport.h │ │ ├── trees.h │ │ ├── tuner.h │ │ └── utils.h │ ├── init.cc │ ├── init_nvtx.cc │ ├── misc/ │ │ ├── CMakeLists.txt │ │ ├── alt_rsmi.cc │ │ ├── amdsmi_wrap.cc │ │ ├── api_trace.c │ │ ├── api_trace.cc │ │ ├── archinfo.cc │ │ ├── argcheck.cc │ │ ├── cudawrap.cc │ │ ├── gdrwrap.cc │ │ ├── ibvsymbols.cc │ │ ├── ibvwrap.cc │ │ ├── ionicdvsymbols.cc │ │ ├── ionicdvwrap.cc │ │ ├── ipcsocket.cc │ │ ├── latency_profiler/ │ │ │ ├── CollTrace.cc │ │ │ ├── CollTraceEvent.cc │ │ │ ├── CollTraceFunc.cc │ │ │ ├── CollTraceUtils.cc │ │ │ └── MIT-LICENSE.txt │ │ ├── mlx5dvsymbols.cc │ │ ├── mlx5dvwrap.cc │ │ ├── msccl/ │ │ │ ├── msccl_lifecycle.cc │ │ │ ├── msccl_parser.cc │ │ │ ├── msccl_setup.cc │ │ │ └── msccl_status.cc │ │ ├── mscclpp/ │ │ │ ├── mscclpp_nccl.cc │ │ │ └── mscclpp_nccl_syms.txt │ │ ├── npkit.cc │ │ ├── nvmlwrap.cc │ │ ├── nvmlwrap_stub.cc │ │ ├── param.cc │ │ ├── proxy_trace/ │ │ │ └── proxy_trace.cc │ │ ├── recorder.cc │ │ ├── rocm_smi_wrap.cc │ │ ├── rocmwrap.cc │ │ ├── roctx.cc │ │ ├── shmutils.cc │ │ ├── signals.cc │ │ ├── socket.cc │ │ ├── strongstream.cc │ │ └── utils.cc │ ├── mnnvl.cc │ ├── msccl.cc │ ├── nccl.h.in │ ├── nccl.pc.in │ ├── nccl_device/ │ │ ├── CMakeLists.txt │ │ ├── core.cc │ │ ├── ll_a2a.cc │ │ └── mem_barrier.cc │ ├── plugin/ │ │ ├── CMakeLists.txt │ │ ├── net/ │ │ │ ├── CMakeLists.txt │ │ │ ├── net_v10.cc │ │ │ ├── net_v11.cc │ │ │ ├── net_v6.cc │ │ │ ├── net_v7.cc │ │ │ ├── net_v8.cc │ │ │ └── net_v9.cc │ │ ├── net.cc │ │ ├── plugin_open.cc │ │ ├── profiler/ │ │ │ ├── CMakeLists.txt │ │ │ ├── profiler_v1.cc │ │ │ ├── profiler_v2.cc │ │ │ ├── profiler_v3.cc │ │ │ ├── profiler_v4.cc │ │ │ └── 
profiler_v5.cc │ │ ├── profiler.cc │ │ ├── tuner/ │ │ │ ├── CMakeLists.txt │ │ │ ├── tuner_v2.cc │ │ │ ├── tuner_v3.cc │ │ │ ├── tuner_v4.cc │ │ │ └── tuner_v5.cc │ │ └── tuner.cc │ ├── proxy.cc │ ├── ras/ │ │ ├── CMakeLists.txt │ │ ├── client.cc │ │ ├── client_support.cc │ │ ├── collectives.cc │ │ ├── peers.cc │ │ ├── ras.cc │ │ ├── ras_internal.h │ │ └── rasnet.cc │ ├── rccl_wrap.cc │ ├── register/ │ │ ├── CMakeLists.txt │ │ ├── coll_reg.cc │ │ ├── register.cc │ │ └── sendrecv_reg.cc │ ├── scheduler/ │ │ ├── CMakeLists.txt │ │ └── symmetric_sched.cc │ ├── sym_kernels.cc │ ├── transport/ │ │ ├── CMakeLists.txt │ │ ├── coll_net.cc │ │ ├── generic.cc │ │ ├── net.cc │ │ ├── net_ib.cc │ │ ├── net_socket.cc │ │ ├── nvls.cc │ │ ├── p2p.cc │ │ ├── profiler.cc │ │ └── shm.cc │ └── transport.cc ├── test/ │ ├── AllGatherTests.cpp │ ├── AllReduceTests.cpp │ ├── AllToAllTests.cpp │ ├── AllToAllVTests.cpp │ ├── AllocTests.cpp │ ├── AltRsmiTests.cpp │ ├── ArgCheckTests.cpp │ ├── BitOpsTests.cpp │ ├── BroadcastTests.cpp │ ├── CMakeLists.txt │ ├── CommTests.cpp │ ├── EnqueueTests.cpp │ ├── GatherTests.cpp │ ├── GroupCallTests.cpp │ ├── IpcsocketTests.cpp │ ├── NetSocketTests.cpp │ ├── NonBlockingTests.cpp │ ├── ParamTests.cpp │ ├── ParamTestsConfFile.txt │ ├── ProxyTests.cpp │ ├── README.md │ ├── RcclWrapTests.cpp │ ├── ReduceScatterTests.cpp │ ├── ReduceTests.cpp │ ├── RegisterTests.cpp │ ├── ScatterTests.cpp │ ├── SendRecvTests.cpp │ ├── StandaloneTests.cpp │ ├── TransportTests.cpp │ ├── _RecorderTests.cpp │ ├── common/ │ │ ├── CallCollectiveForked.cpp │ │ ├── CallCollectiveForked.hpp │ │ ├── CollectiveArgs.cpp │ │ ├── CollectiveArgs.hpp │ │ ├── DeviceBufferHelpers.hpp │ │ ├── EnvVars.cpp │ │ ├── EnvVars.hpp │ │ ├── ErrCode.hpp │ │ ├── MPIEnvironment.cpp │ │ ├── MPIEnvironment.hpp │ │ ├── MPIHelpers.cpp │ │ ├── MPIHelpers.hpp │ │ ├── MPIStandaloneTest.hpp │ │ ├── MPITestBase.hpp │ │ ├── MPITestCore.cpp │ │ ├── MPITestCore.hpp │ │ ├── MPITestRunner.md │ │ ├── PrepDataFuncs.cpp │ 
│ ├── PrepDataFuncs.hpp │ │ ├── ProcessIsolatedTestRunner.cpp │ │ ├── ProcessIsolatedTestRunner.hpp │ │ ├── ProcessIsolatedTestRunner.md │ │ ├── PtrUnion.cpp │ │ ├── PtrUnion.hpp │ │ ├── RcclMockFuncs.hpp │ │ ├── ResourceGuards.hpp │ │ ├── StandaloneUtils.cpp │ │ ├── StandaloneUtils.hpp │ │ ├── TestBed.cpp │ │ ├── TestBed.hpp │ │ ├── TestBedChild.cpp │ │ ├── TestBedChild.hpp │ │ ├── TestChecks.cpp │ │ ├── TestChecks.hpp │ │ ├── TransportUtils.hpp │ │ ├── main.cpp │ │ ├── main_fixtures.cpp │ │ └── main_mpi.cpp │ ├── ext-plugins/ │ │ ├── .gitignore │ │ ├── README.md │ │ ├── assets/ │ │ │ └── csv_confs/ │ │ │ ├── incorrect_values_config.conf │ │ │ ├── multinode_config.conf │ │ │ ├── no_matching_config.conf │ │ │ ├── singlenode_config.conf │ │ │ ├── unsupported_algo_proto_config.conf │ │ │ ├── valid_config_with_wildcards.conf │ │ │ └── valid_config_without_wildcards.conf │ │ ├── pytest.ini │ │ ├── requirements.txt │ │ └── tests/ │ │ ├── conftest.py │ │ ├── ext-profiler/ │ │ │ ├── test_allgather.py │ │ │ ├── test_allreduce.py │ │ │ ├── test_alltoall.py │ │ │ ├── test_broadcast.py │ │ │ ├── test_reduce.py │ │ │ ├── test_reducescatter.py │ │ │ └── test_sendrecv.py │ │ └── ext-tuner/ │ │ ├── test_allgather.py │ │ ├── test_allreduce.py │ │ ├── test_broadcast.py │ │ ├── test_reduce.py │ │ └── test_reducescatter.py │ ├── graph/ │ │ └── XmlTests.cpp │ ├── latency_profiler/ │ │ └── LatencyProfilerUnitTest.cpp │ ├── proxy_trace/ │ │ └── ProxyTraceUnitTests.cpp │ └── transport/ │ ├── NetIbMPITests.cpp │ ├── NetMPITests.cpp │ ├── P2pMPITests.cpp │ ├── ShmMPITests.cpp │ ├── TransportMPIBase.cpp │ └── TransportMPIBase.hpp ├── toolchain-linux.cmake └── tools/ ├── EmptyKernelTest/ │ ├── EmptyKernelTest.cpp │ ├── Makefile │ └── run.sh ├── GraphBench/ │ ├── GraphBench.cpp │ └── Makefile ├── HelloRccl/ │ ├── HelloRccl.cpp │ ├── HelloRccl.hpp │ ├── Makefile │ └── runTest.sh ├── JitterBench/ │ ├── Common.hpp │ ├── Compatibility.hpp │ ├── GetClosestNumaNode.hpp │ ├── JitterBench.cpp │ ├── 
Makefile │ ├── Timeline.hpp │ └── runSweep.sh ├── RcclReplayer/ │ ├── Makefile │ ├── README.md │ ├── rcclReplayer.cpp │ ├── rcclReplayer.hpp │ └── replay_log_converter.py ├── TopoVisual/ │ ├── README.md │ ├── extract_topo.awk │ └── topo_visual.sh ├── TransferBench/ │ └── README.md ├── ib-test/ │ ├── Makefile │ ├── ib_test.cpp │ ├── include/ │ │ └── nccl.h │ └── utils.cpp ├── msccl-algorithms/ │ ├── allgather_16n_direct_0_3m_ll128.xml │ ├── allgather_16n_direct_0_3m_ll128_op.xml │ ├── allgather_32n_direct_0_6m_ll128.xml │ ├── allgather_32n_direct_0_6m_ll128_op.xml │ ├── allreduce-allpairs-8n-ll-32tb-op.xml │ ├── allreduce-allpairs-8n-ll-32tb.xml │ ├── allreduce-allpairs-8n-ll-64tb-op.xml │ ├── allreduce-allpairs-8n-ll-64tb.xml │ ├── allreduce-allpairs-8n-simple-op.xml │ ├── allreduce-allpairs-8n-simple.xml │ ├── alltoall-8n-0-9kb.xml │ ├── alltoall-8n-190kb-512kb.xml │ ├── alltoall-8n-512kb-7mb.xml │ ├── alltoall-8n-7mb-43mb.xml │ └── alltoall-8n-9kb-190kb.xml ├── msccl-unit-test-algorithms/ │ ├── all-reduce-ring-ll.xml │ ├── all-reduce-ring-ll128.xml │ └── all-reduce-ring-simple.xml ├── p2p-latency-test/ │ ├── Makefile │ ├── README.md │ ├── build_and_run.sh │ ├── ll_latency_test.cpp │ ├── ll_latency_test.cu │ └── p2p_latency_test.cpp ├── rccl-prim-test/ │ ├── Makefile │ ├── copy_kernel.h │ └── rccl_prim_test.cpp ├── scripts/ │ ├── exclude_static_list.txt │ ├── npkit_trace_analysis.py │ ├── npkit_trace_generator.py │ ├── pytorch-all-reduce/ │ │ ├── README.md │ │ ├── all_reduce.py │ │ └── trace_runs.sh │ ├── pytorch-log-parser.py │ ├── rcclDiagnostics.py │ ├── rccl_bw_test.py │ ├── rocprof-log-parser.py │ ├── test_runner/ │ │ ├── README.md │ │ ├── configs/ │ │ │ ├── mi300x_mellanox_ib.json │ │ │ ├── rccl_perf_tests.json │ │ │ └── test_config_sample.json │ │ ├── lib/ │ │ │ ├── __init__.py │ │ │ ├── test_config.py │ │ │ ├── test_executor.py │ │ │ └── test_parser.py │ │ └── test_runner.py │ ├── topo_val.sh │ └── ucx_ompi_rccl_rccltests_TB_script.sh ├── time-trace/ │ ├── 
rccl-TimeTrace.sh │ └── time_trace_generator.py └── topo_expl/ ├── Makefile ├── README.md ├── include/ │ ├── device_table.h │ ├── model.h │ ├── nccl.h │ └── utils.h ├── model.cpp ├── models/ │ ├── topo_16p1h.xml │ ├── topo_16p1h_vm.xml │ ├── topo_16p_gio-1s-1rp-cascade.xml │ ├── topo_16p_gio-3s-1rp-split-flat.xml │ ├── topo_3p_pcie.xml │ ├── topo_3p_pcie_1.xml │ ├── topo_4p1h.xml │ ├── topo_4p1h_1.xml │ ├── topo_4p2h.xml │ ├── topo_4p2h_1.xml │ ├── topo_4p2h_2nic.xml │ ├── topo_4p3l.xml │ ├── topo_4p3l_2h.xml │ ├── topo_4p3l_ia.xml │ ├── topo_4p3l_n2.xml │ ├── topo_4p3l_n2_1.xml │ ├── topo_4p3l_n4.xml │ ├── topo_4p4h.xml │ ├── topo_4p_942.xml │ ├── topo_8p1h.xml │ ├── topo_8p1h_1.xml │ ├── topo_8p1h_2.xml │ ├── topo_8p1h_3.xml │ ├── topo_8p1h_4.xml │ ├── topo_8p1h_5.xml │ ├── topo_8p1h_n1.xml │ ├── topo_8p6l.xml │ ├── topo_8p6l_1nic.xml │ ├── topo_8p6l_2nic.xml │ ├── topo_8p6l_3nic.xml │ ├── topo_8p6l_4nic.xml │ ├── topo_8p6l_5nic.xml │ ├── topo_8p6l_6nic.xml │ ├── topo_8p_4nics.xml │ ├── topo_8p_90a.xml │ ├── topo_8p_90a_1.xml │ ├── topo_8p_942.xml │ ├── topo_8p_942vm.xml │ ├── topo_8p_950.xml │ ├── topo_8p_pcie.xml │ ├── topo_8p_pcie_1.xml │ ├── topo_8p_pcie_2nic.xml │ ├── topo_8p_rome.xml │ ├── topo_8p_rome_4n_1.xml │ ├── topo_8p_rome_4n_2.xml │ ├── topo_8p_rome_4nics.xml │ ├── topo_8p_rome_n2.xml │ ├── topo_8p_rome_n2_1.xml │ ├── topo_8p_rome_n2_2.xml │ ├── topo_8p_rome_n4.xml │ ├── topo_8p_rome_n4_1.xml │ ├── topo_8p_rome_pcie.xml │ ├── topo_8p_rome_vm1.xml │ ├── topo_8p_ts1.xml │ ├── topo_8p_ts1_1.xml │ ├── topo_8p_ts1_n4.xml │ ├── topo_8p_ts1_n4_1.xml │ ├── topo_8p_ts1_n4_2.xml │ ├── topo_collnet_n1.xml │ └── topo_collnet_n4.xml ├── topo_expl.cpp └── utils.cpp ================================================ FILE CONTENTS ================================================ ================================================ FILE: .azuredevops/multinode-ci-nightly.yml ================================================ resources: repositories: - repository: 
pipelines_repo type: github endpoint: ROCm name: ROCm/ROCm variables: - group: common - template: /.azuredevops/variables-global.yml@pipelines_repo - name: pytestFolder value: '.azuredevops/tests/pytest' parameters: - name: pytestList type: object default: - HelloWorld trigger: none pr: none schedules: - cron: "0 5 * 11-3 *" # 11 PM CST (November - March) displayName: "Nightly Build (CST)" branches: include: - develop always: false - cron: "0 4 * 4-10 *" # 11 PM CDT (April - October) displayName: "Nightly Build (CDT)" branches: include: - develop always: false jobs: - job: rccl timeoutInMinutes: 180 pool: rocm-ci_rccl_pool workspace: clean: all steps: - task: DeleteFiles@1 inputs: Contents: '**/*' - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml@pipelines_repo parameters: submoduleBehaviour: recursive - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml@pipelines_repo parameters: installEnabled: false printDiskSpace: false extraBuildFlags: >- -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTS=ON -GNinja - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml@pipelines_repo parameters: componentName: rccl testDir: $(Build.SourcesDirectory)/build/test testExecutable: 'LD_LIBRARY_PATH=$(Build.SourcesDirectory)/build:${LD_LIBRARY_PATH} NCCL_DEBUG=INFO RCCL_ENABLE_SIGNALHANDLER=1 ./rccl-UnitTests' testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes' - ${{ each pytestScript in parameters.pytestList }}: - task: Bash@3 displayName: Test ${{ pytestScript }} continueOnError: true inputs: targetType: inline workingDirectory: $(Build.SourcesDirectory)/$(pytestFolder) script: pytest ${{ pytestScript }}.py ================================================ FILE: .azuredevops/multinode-ci-pr.yml ================================================ resources: repositories: - repository: pipelines_repo type: github endpoint: ROCm name: ROCm/ROCm variables: - group: common - template: /.azuredevops/variables-global.yml@pipelines_repo - name: 
pytestFolder value: '.azuredevops/tests/pytest' parameters: - name: pytestList type: object default: - HelloWorld trigger: none pr: autoCancel: true branches: include: - develop paths: exclude: - .github - .jenkins - docs - '*.md' - LICENSE.txt - NOTICES.txt drafts: false stages: - stage: rcclStage displayName: 'RCCL develop PR' jobs: - deployment: rccl_pr_approval displayName: "CI Run Requires Approval" environment: rccl - job: rccl timeoutInMinutes: 180 pool: rocm-ci_rccl_pool workspace: clean: all steps: - task: DeleteFiles@1 inputs: Contents: '**/*' - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml@pipelines_repo parameters: submoduleBehaviour: recursive - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/build-cmake.yml@pipelines_repo parameters: installEnabled: false printDiskSpace: false extraBuildFlags: >- -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTS=ON -DGPU_TARGETS=gfx942 -GNinja - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/test.yml@pipelines_repo parameters: componentName: rccl testDir: $(Build.SourcesDirectory)/build/test testExecutable: 'LD_LIBRARY_PATH=$(Build.SourcesDirectory)/build:${LD_LIBRARY_PATH} NCCL_DEBUG=INFO RCCL_ENABLE_SIGNALHANDLER=1 ./rccl-UnitTests' testParameters: '--gtest_output=xml:./test_output.xml --gtest_color=yes' - ${{ each pytestScript in parameters.pytestList }}: - task: Bash@3 displayName: Test ${{ pytestScript }} continueOnError: true inputs: targetType: inline workingDirectory: $(Build.SourcesDirectory)/$(pytestFolder) script: pytest ${{ pytestScript }}.py ================================================ FILE: .azuredevops/multinode-ci-slurm-nightly.yml ================================================ resources: repositories: - repository: pipelines_repo type: github endpoint: ROCm name: ROCm/ROCm variables: - group: common - template: /.azuredevops/variables-global.yml@pipelines_repo trigger: none pr: none schedules: - cron: "0 5 * 11-3 *" # 11 PM CST (November - March) displayName: "Nightly Build (CST)" 
branches: include: - develop always: false - cron: "0 4 * 4-10 *" # 11 PM CDT (April - October) displayName: "Nightly Build (CDT)" branches: include: - develop always: false jobs: - job: rccl timeoutInMinutes: 240 pool: rocm-ci_rccl_slurm_pool workspace: clean: all steps: - task: DeleteFiles@1 inputs: Contents: '**/*' - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml@pipelines_repo parameters: submoduleBehaviour: recursive - template: templates/build.yml - template: templates/test_rccl-UnitTests.yml - template: templates/test_rccl-tests.yml
================================================
FILE: .azuredevops/multinode-ci-slurm-pr.yml
================================================
# Pull-request pipeline for the `develop` branch that builds and tests RCCL
# through the Slurm-backed agent pool, using the local step templates.

# Shared step templates are pulled from the ROCm/ROCm GitHub repository.
resources:
  repositories:
  - repository: pipelines_repo
    type: github
    endpoint: ROCm
    name: ROCm/ROCm

variables:
- group: common
- template: /.azuredevops/variables-global.yml@pipelines_repo

# No CI (push) trigger; this pipeline is driven by pull requests only.
trigger: none

pr:
  autoCancel: true
  branches:
    include:
    - develop
  # Changes limited to these paths do not start a run.
  paths:
    exclude:
    - .github
    - .jenkins
    - docs
    - '*.md'
    - LICENSE.txt
    - NOTICES.txt
  drafts: false

stages:
- stage: rcclStage
  displayName: 'RCCL develop PR'
  jobs:
  # Deployment job bound to the `rccl` environment.
  - deployment: rccl_pr_approval
    displayName: "CI Run Requires Approval"
    environment: rccl

  - job: rccl
    timeoutInMinutes: 240
    pool: rocm-ci_rccl_slurm_pool
    workspace:
      clean: all
    steps:
    # Remove everything left in the working directory before checkout.
    - task: DeleteFiles@1
      inputs:
        Contents: '**/*'
    - template: ${{ variables.CI_TEMPLATE_PATH }}/steps/checkout.yml@pipelines_repo
      parameters:
        submoduleBehaviour: recursive
    # Build, then run the unit tests and the rccl-tests suite.
    - template: templates/build.yml
    - template: templates/test_rccl-UnitTests.yml
    - template: templates/test_rccl-tests.yml

================================================
FILE: .azuredevops/rocm-ci.yml
================================================
variables: - group: common - template: /.azuredevops/variables-global.yml@pipelines_repo parameters: - name: pipelinesRepoRef type: string default: refs/heads/develop - name: systemsRepoRef type: string default: refs/heads/develop - name: systemsSparseCheckoutDir type: string
default: 'projects/rocprofiler-sdk' - name: triggerDownstreamJobs type: boolean default: true resources: repositories: - repository: pipelines_repo type: github endpoint: ROCm name: ROCm/ROCm ref: ${{ parameters.pipelinesRepoRef }} - repository: systems_repo type: github endpoint: ROCm name: ROCm/rocm-systems ref: ${{ parameters.systemsRepoRef }} trigger: batch: true branches: include: - develop - mainline paths: exclude: - .github - .jenkins - docs - '.*.y*ml' - '*.md' - LICENSE.txt - NOTICES.txt pr: autoCancel: true branches: include: - develop - mainline paths: exclude: - .github - .jenkins - docs - '.*.y*ml' - '*.md' - LICENSE.txt - NOTICES.txt drafts: false stages: - stage: rccl jobs: - template: ${{ variables.CI_COMPONENT_PATH }}/rccl.yml@pipelines_repo parameters: sparseCheckoutDir: '' systemsRepo: systems_repo systemsSparseCheckoutDir: ${{ parameters.systemsSparseCheckoutDir }} triggerDownstreamJobs: ${{ parameters.triggerDownstreamJobs }}
================================================
FILE: .azuredevops/slurm/build.sh
================================================
#!/bin/bash
#SBATCH --job-name=rccl-build
#SBATCH --output=rccl-build-%j.out
#SBATCH --error=rccl-build-%j.out
#SBATCH --time=60
#SBATCH --nodes=1
#SBATCH --exclusive
#SBATCH --partition=gt

# Slurm batch script that builds and installs RCCL, the RcclReplayer tool and
# rccl-tests.
#
# Environment read by this script:
#   BINARIES_DIR     - install prefix for RCCL and rccl-tests
#   ROCM_PATH        - passed to CMake as -DROCM_PATH and to RcclReplayer as ROCM_DIR
#   MPI_HOME         - MPI location for RcclReplayer and rccl-tests
#   SLURM_SUBMIT_DIR - source checkout root (falls back to $PWD when unset)

# Print an error to stderr and abort the job with a non-zero status.
fail() {
    echo "ERROR: $*" >&2
    exit 1
}

short_id=$(hostname | cut -d'.' -f1 | cut -d'-' -f3-)
echo "Node identifier: $short_id"

source /etc/profile.d/lmod.sh
module load rocm/6.4.1

# Setup local binary path
export PATH="$HOME/.local/bin:$PATH"
mkdir -p "$HOME/.local/bin"

# Install Ninja if not already available
if ! command -v ninja &>/dev/null; then
    echo "Ninja not found. Installing locally..."
    wget -q https://github.com/ninja-build/ninja/releases/download/v1.11.1/ninja-linux.zip -O /tmp/ninja.zip
    unzip -q /tmp/ninja.zip -d "$HOME/.local/bin"
    chmod +x "$HOME/.local/bin/ninja"
fi

echo "Using Ninja at: $(which ninja)"
ninja --version

# Define GPU target
export GPU_TARGETS="gfx942"

cd "${SLURM_SUBMIT_DIR:-$PWD}" || fail "cannot enter submit directory"

## Building RCCL
# Stop at the first failing step: everything below depends on the RCCL install,
# so continuing after a failure only buries the real error in later output.
mkdir -p build
cd build || fail "cannot enter RCCL build directory"
cmake -G Ninja -DCMAKE_INSTALL_PREFIX="$BINARIES_DIR" -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS="${GPU_TARGETS}" -DBUILD_TESTS=ON -DROCM_PATH="$ROCM_PATH" .. || fail "RCCL configure failed"
cmake --build . || fail "RCCL build failed"
cmake --build . --target install || fail "RCCL install failed"

# Building RCCL Replayer
# Left best-effort as in the original script: a failure here does not abort the job.
cd ../tools/RcclReplayer 2>/dev/null || cd ../RcclReplayer
RCCL_DIR="../../build" ROCM_DIR="$ROCM_PATH" MPI_DIR="$MPI_HOME" make

cd "${SLURM_SUBMIT_DIR:-$PWD}" || fail "cannot return to submit directory"

## Building RCCL-Tests
# Reuse an existing checkout instead of letting `git clone` fail on it.
if [ ! -d rccl-tests ]; then
    git clone https://github.com/ROCm/rccl-tests || fail "cloning rccl-tests failed"
fi
cd rccl-tests || fail "cannot enter rccl-tests"
mkdir -p build
cd build || fail "cannot enter rccl-tests build directory"
cmake -DCMAKE_PREFIX_PATH="$BINARIES_DIR;$MPI_HOME" -DUSE_MPI=ON -DCMAKE_INSTALL_PREFIX="$BINARIES_DIR" -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS="${GPU_TARGETS}" -DROCM_PATH="$ROCM_PATH" .. || fail "rccl-tests configure failed"
cmake --build . || fail "rccl-tests build failed"
cmake --build . --target install || fail "rccl-tests install failed"

================================================
FILE: .azuredevops/slurm/test_rccl-UnitTests.sh
================================================
#!/bin/bash #SBATCH --job-name=rccl-UnitTests #SBATCH --output=%x-%j.out #SBATCH --error=%x-%j.out #SBATCH --time=180 #SBATCH --nodes=1 #SBATCH --exclusive #SBATCH --partition=gt short_id=$(hostname | cut -d'.' 
-f1 | cut -d'-' -f3-) echo "Node identifier: $short_id" source /etc/profile.d/lmod.sh module load rocm/6.4.1 cd "$BINARIES_DIR/bin" LD_LIBRARY_PATH="$BINARIES_DIR/lib:$LD_LIBRARY_PATH" NCCL_DEBUG=INFO RCCL_ENABLE_SIGNALHANDLER=1 HSA_NO_SCRATCH_RECLAIM=1 ./rccl-UnitTests --gtest_output=xml:$PIPELINE_WORKSPACE/rccl-UnitTests_output.xml --gtest_color=yes ================================================ FILE: .azuredevops/slurm/test_rccl-tests.sh ================================================ #!/bin/bash #SBATCH --job-name=rccl-tests #SBATCH --output=%x-%j.out #SBATCH --error=%x-%j.out #SBATCH --time=60 #SBATCH --nodes=1 #SBATCH --exclusive #SBATCH --partition=gt short_id=$(hostname | cut -d'.' -f1 | cut -d'-' -f3-) echo "Node identifier: $short_id" source /etc/profile.d/lmod.sh module load rocm/6.4.1 cd ${PIPELINE_WORKSPACE}/TestResults mkdir -p ${PIPELINE_WORKSPACE}/TestResults/rccl-tests_logs export WORKDIR=${PIPELINE_WORKSPACE}/TestResults/rccl-tests_logs export PATH="$BINARIES_DIR/bin:$PATH" export LD_LIBRARY_PATH="$BINARIES_DIR/lib:$LD_LIBRARY_PATH" ### create hostlist #nodelist=($(scontrol show hostnames)) #echo "SLURM nodes:" #echo ${nodelist[@]} #echo "" # #hosts_8ppn=() #for node in "${nodelist[@]}" #do # hosts_8ppn+=("${node}:8") #done #echo ${hosts_8ppn[@]} ### Run multi- and single-node RCCL-Tests ## Run single-node RCCL-Tests for n in 1 do total=$((n*8)) #h_8ppn=`echo ${hosts_8ppn[@]:0:${n}} | tr ' ' ','` for coll in all_reduce all_gather reduce_scatter alltoall alltoallv broadcast gather reduce scatter sendrecv do for dtype in float bfloat16 half fp8_e5m2 do out_filename="${WORKDIR}/rccl-tests_${coll}_1KB-16GB_nodes${n}_gpus${total}_${dtype}.log" #cmd="${MPI_HOME}/bin/mpirun -np ${total} --host ${h_8ppn} -mca pml ob1 -mca btl ^openib -mca oob_tcp_if_exclude docker,lo -mca btl_tcp_if_exclude docker,lo -x PATH -x LD_LIBRARY_PATH -x NCCL_DEBUG=VERSION -x NCCL_IB_HCA=bnxt_re0,bnxt_re1,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re6,bnxt_re7 -x 
NCCL_IGNORE_CPU_AFFINITY=1 -x HSA_NO_SCRATCH_RECLAIM=1 -x NCCL_IB_GID_INDEX=3 ${BINARIES_DIR}/bin/${coll}_perf -b 1K -e 16G -f 2 -g 1 -n 100 -w 50 -d ${dtype} -Z json -x ${WORKDIR}/rccl-tests_${coll}_nodes${n}_gpus${total}_${dtype}.json" cmd="${MPI_HOME}/bin/mpirun -np ${total} -mca pml ^ucx -mca osc ^ucx -mca btl ^openib -mca oob_tcp_if_exclude docker,lo -mca btl_tcp_if_exclude docker,lo -x PATH -x LD_LIBRARY_PATH -x NCCL_DEBUG=VERSION -x NCCL_IGNORE_CPU_AFFINITY=1 -x HSA_NO_SCRATCH_RECLAIM=1 ${BINARIES_DIR}/bin/${coll}_perf -b 1K -e 16G -f 2 -g 1 -n 100 -w 50 -d ${dtype} -Z json -x ${WORKDIR}/rccl-tests_${coll}_nodes${n}_gpus${total}_${dtype}.json" echo "Running ${coll}" 2>&1 | tee ${out_filename} echo "Run cmd: ${cmd}" 2>&1 | tee -a ${out_filename} eval ${cmd} 2>&1 | tee -a ${out_filename} sleep 2 done done done ## To add ### Summarize results ### Convert to junit
================================================
FILE: .azuredevops/templates/build.yml
================================================
# Step template: submits .azuredevops/slurm/build.sh through sbatch, waits for
# the job to leave the queue, reads its final state from sacct, and verifies the
# install tree. A second step always prints the Slurm job log.

# small subset of files to check for install to determine pass/fail
parameters:
- name: expectedInstallFiles
  type: object
  default:
    - bin/rccl-UnitTests
    - include/rccl/rccl.h
    - lib/cmake/rccl/rccl-config.cmake
    - lib/librccl.so
    - share/doc/rccl/LICENSE.txt
    - share/rccl/msccl-algorithms
    - share/rccl/msccl-unit-test-algorithms

steps:
- task: Bash@3
  displayName: Build Job
  env:
    BINARIES_DIR: $(Build.BinariesDirectory)
  inputs:
    targetType: inline
    script: |
      echo "##[section]Starting build job..."
      rm -rf $(Build.BinariesDirectory)/*

      echo "Submitting build job..."
      mkdir -p $(Build.BinariesDirectory)
      BUILD_JOB_ID=$(sbatch --export=ALL --parsable $(Build.SourcesDirectory)/.azuredevops/slurm/build.sh)
      # Without a job ID every later squeue/sacct query is meaningless, so stop
      # here instead of polling until the time limit.
      if [ -z "$BUILD_JOB_ID" ]; then
        echo "##vso[task.logissue type=error]sbatch did not return a job ID; the build job was not submitted."
        exit 1
      fi
      echo "Submitted build job: $BUILD_JOB_ID"
      # Publish the job ID so the "Build Logs" step can locate the log file.
      echo "##vso[task.setvariable variable=BUILD_JOB_ID]$BUILD_JOB_ID"

      # Poll once a minute until the job is no longer listed by squeue.
      echo "Waiting for build job to start..."
      while squeue -j "$BUILD_JOB_ID" 2>/dev/null | grep -q "$BUILD_JOB_ID"; do
        echo "##[section]Build job $BUILD_JOB_ID is still running..."
        sleep 60
      done

      echo "Waiting for final status via sacct..."
      BUILD_FAILED=0
      LOOP_COUNT=0
      MAX_LOOPS=30 # Maximum of 30 loops (30 minutes)
      while true; do
        # State of the ".batch" step of the job; empty until sacct has a record.
        STATE=$(sacct -j "$BUILD_JOB_ID" --format=JobID,State --noheader | awk '$1 ~ /\.batch$/ { print $2; exit }' | xargs)
        echo "##[section]Build job state: $STATE"
        if [[ "$STATE" == "COMPLETED" ]]; then
          break
        # Prefix match (no trailing anchor): sacct truncates long state names in
        # its fixed-width column and marks them with '+', e.g. "OUT_OF_ME+".
        elif [[ "$STATE" =~ ^(FAILED|CANCELLED|TIMEOUT|OUT_OF_ME|NODE_FAIL|BOOT_FAIL|DEADLINE) ]]; then
          echo "Build failed with state $STATE"
          # Remember the failure; the file check below still runs for diagnostics.
          BUILD_FAILED=1
          break
        fi
        sleep 60
        LOOP_COUNT=$((LOOP_COUNT + 1))
        if [ $LOOP_COUNT -ge $MAX_LOOPS ]; then
          echo "Time limit reached while waiting for final status."
          exit 1 # Exit with an error code if time limit is reached
        fi
      done

      echo "Checking for expected installed files..."
      MISSING_FILES=0
      expectedFiles="${{ join(' ', parameters.expectedInstallFiles) }}"
      # Intentionally unquoted: the list is space-separated relative paths.
      for relpath in $expectedFiles; do
        fullpath="$BINARIES_DIR/$relpath"
        if [ ! -e "$fullpath" ]; then
          echo "##vso[task.logissue type=error]Missing expected file: $fullpath"
          MISSING_FILES=1
        fi
      done

      if [ "$MISSING_FILES" -eq 1 ]; then
        echo "One or more expected files are missing from the install directory."
      else
        echo "All expected files are present in the install directory."
      fi
      if [ "$BUILD_FAILED" -eq 1 ]; then
        echo "##vso[task.logissue type=error]Build job $BUILD_JOB_ID finished in state $STATE"
      fi
      # The step fails when the job itself failed, even if the files checked
      # above happen to be present (only part of the build may have succeeded).
      if [ "$MISSING_FILES" -eq 1 ] || [ "$BUILD_FAILED" -eq 1 ]; then
        exit 1
      fi

- task: Bash@3
  displayName: Build Logs
  condition: always()
  inputs:
    targetType: inline
    script: |
      cat rccl-build-${BUILD_JOB_ID}.out || echo "No log found"

================================================
FILE: .azuredevops/templates/test_rccl-UnitTests.yml
================================================
steps: - task: Bash@3 displayName: RCCL UnitTests env: BINARIES_DIR: $(Build.BinariesDirectory) PIPELINE_WORKSPACE: $(Pipeline.Workspace) inputs: targetType: inline script: | echo "Submitting test job..." 
TEST_JOB_ID=$(sbatch --export=ALL --parsable $(Build.SourcesDirectory)/.azuredevops/slurm/test_rccl-UnitTests.sh) echo "Submitted test job: $TEST_JOB_ID" echo "##vso[task.setvariable variable=TEST_JOB_ID]$TEST_JOB_ID" echo "Waiting for test job to start..." while squeue -j $TEST_JOB_ID 2>/dev/null | grep -q $TEST_JOB_ID; do echo "##[section]Test job $TEST_JOB_ID is still running..." sleep 60 done echo "Waiting for final status via sacct..." LOOP_COUNT=0 MAX_LOOPS=120 # Maximum of 120 loops (120 minutes) while true; do STATE=$(sacct -j "$TEST_JOB_ID" --format=JobID,State --noheader | awk '$1 ~ /\.batch$/ { print $2; exit }' | xargs) echo "##[section]Test job state: $STATE" if [[ "$STATE" == "COMPLETED" ]]; then break elif [[ "$STATE" =~ ^(FAILED|CANCELLED|TIMEOUT)$ ]]; then echo "Test failed with state $STATE" break fi sleep 60 LOOP_COUNT=$((LOOP_COUNT + 1)) if [ $LOOP_COUNT -ge $MAX_LOOPS ]; then echo "Time limit reached while waiting for final status." exit 1 # Exit with an error code if time limit is reached fi done echo "Checking test result XML for failures..." TEST_XML=$(find "$(Pipeline.Workspace)" -name 'rccl-UnitTests_output.xml' | head -n1) if [ -z "$TEST_XML" ]; then echo "##vso[task.logissue type=error]No $TEST_XML file found" echo "##vso[task.complete result=Failed;]DONE" exit 1 fi if grep -q 'failures="[^0]' "$TEST_XML"; then echo "##vso[task.logissue type=error]Test failures detected in $TEST_XML" echo "##vso[task.complete result=Failed;]DONE" exit 1 else echo "No test failures detected." 
fi - task: Bash@3 displayName: Test Logs condition: always() inputs: targetType: inline script: | cat rccl-UnitTests-${TEST_JOB_ID}.out || echo "No log found" - task: PublishTestResults@2 displayName: 'Publish Results' condition: succeededOrFailed() inputs: searchFolder: $(Pipeline.Workspace) testResultsFormat: JUnit testResultsFiles: '**/rccl-UnitTests_output.xml' ================================================ FILE: .azuredevops/templates/test_rccl-tests.yml ================================================ steps: - task: Bash@3 displayName: RCCL-Tests env: BINARIES_DIR: $(Build.BinariesDirectory) PIPELINE_WORKSPACE: $(Pipeline.Workspace) inputs: targetType: inline script: | echo "Submitting test job..." TEST_JOB_ID=$(sbatch --export=ALL --parsable $(Build.SourcesDirectory)/.azuredevops/slurm/test_rccl-tests.sh) echo "Submitted test job: $TEST_JOB_ID" echo "##vso[task.setvariable variable=TEST_JOB_ID]$TEST_JOB_ID" echo "Waiting for test job to start..." while squeue -j $TEST_JOB_ID 2>/dev/null | grep -q $TEST_JOB_ID; do echo "##[section]Test job $TEST_JOB_ID is still running..." sleep 60 done echo "Waiting for final status via sacct..." LOOP_COUNT=0 MAX_LOOPS=120 # Maximum of 120 loops (120 minutes) while true; do STATE=$(sacct -j "$TEST_JOB_ID" --format=JobID,State --noheader | awk '$1 ~ /\.batch$/ { print $2; exit }' | xargs) echo "##[section]Test job state: $STATE" if [[ "$STATE" == "COMPLETED" ]]; then break elif [[ "$STATE" =~ ^(FAILED|CANCELLED|TIMEOUT)$ ]]; then echo "Test failed with state $STATE" break fi sleep 60 LOOP_COUNT=$((LOOP_COUNT + 1)) if [ $LOOP_COUNT -ge $MAX_LOOPS ]; then echo "Time limit reached while waiting for final status." exit 1 # Exit with an error code if time limit is reached fi done echo "Checking test result json for failures..." 
TEST_JSON=$(find "$(Pipeline.Workspace)" -name 'rccl-tests*.json') if [ -z "$TEST_JSON" ]; then echo "##vso[task.logissue type=error]No rccl-tests*.json file(s) found" echo "##vso[task.complete result=Failed;]DONE" exit 1 fi #echo "Checking test result XML for failures..." #TEST_XML=$(find "$(Pipeline.Workspace)" -name 'rccl-tests_output.xml' | head -n1) #if [ -z "$TEST_XML" ]; then # echo "##vso[task.logissue type=error]No rccl-tests_output.xml file found" # echo "##vso[task.complete result=Failed;]DONE" # exit 1 #fi #if grep -q 'failures="[^0]' "$TEST_XML"; then # echo "##vso[task.logissue type=error]Test failures detected in $TEST_XML" # echo "##vso[task.complete result=Failed;]DONE" # exit 1 #else # echo "No test failures detected." #fi - task: Bash@3 displayName: Test Logs condition: always() inputs: targetType: inline script: | cat rccl-tests-${TEST_JOB_ID}.out || echo "No log found" # - task: PublishTestResults@2 # displayName: 'Publish Results' # condition: succeededOrFailed() # inputs: # searchFolder: $(Pipeline.Workspace) # testResultsFormat: JUnit # testResultsFiles: '**/rccl-tests_output.xml' ================================================ FILE: .azuredevops/tests/pytest/HelloWorld.py ================================================ import pytest def test_HelloWorld(): greeting = "Hello, World!" assert greeting == "Hello, World!" 
================================================ FILE: .clang-format ================================================ # Style file for MLSE Libraries based on the modified rocBLAS style # Common settings BasedOnStyle: WebKit TabWidth: 4 IndentWidth: 4 UseTab: Never ColumnLimit: 100 UseCRLF: false # Other languages JavaScript, Proto --- Language: Json DisableFormat: true --- Language: Cpp # http://releases.llvm.org/6.0.1/tools/clang/docs/ClangFormatStyleOptions.html#disabling-formatting-on-a-piece-of-code # int formatted_code; # // clang-format off # void unformatted_code ; # // clang-format on # void formatted_code_again; DisableFormat: false Standard: Cpp11 AccessModifierOffset: -4 AlignAfterOpenBracket: true AlignArrayOfStructures: Right AlignConsecutiveAssignments: true AlignConsecutiveDeclarations: true AlignEscapedNewlines: Left AlignOperands: true AlignTrailingComments: false AllowAllArgumentsOnNextLine: false AllowAllParametersOfDeclarationOnNextLine: true AllowShortBlocksOnASingleLine: Never AllowShortCaseLabelsOnASingleLine: true AllowShortFunctionsOnASingleLine: Empty AllowShortIfStatementsOnASingleLine: false AllowShortLoopsOnASingleLine: false AlwaysBreakAfterReturnType: None AlwaysBreakBeforeMultilineStrings: false AlwaysBreakTemplateDeclarations: Yes BinPackArguments: false BinPackParameters: false BitFieldColonSpacing: Both # Configure each individual brace in BraceWrapping BreakBeforeBraces: Custom # Control of individual brace wrapping cases BraceWrapping: AfterCaseLabel: true AfterClass: true AfterControlStatement: Always AfterEnum: true AfterExternBlock: false AfterFunction: true AfterNamespace: true AfterStruct: true AfterUnion: true BeforeCatch: true BeforeElse: true BeforeLambdaBody: true BeforeWhile: true IndentBraces: false SplitEmptyFunction: false SplitEmptyRecord: false SplitEmptyNamespace: false BreakBeforeBinaryOperators: All BreakBeforeTernaryOperators: true BreakConstructorInitializers: BeforeComma BreakInheritanceList: BeforeComma 
BreakStringLiterals: true CommentPragmas: '^ IWYU pragma:' CompactNamespaces: false ConstructorInitializerIndentWidth: 4 ContinuationIndentWidth: 4 Cpp11BracedListStyle: true DeriveLineEnding: false DerivePointerAlignment: false EmptyLineAfterAccessModifier: Never EmptyLineBeforeAccessModifier: Always ExperimentalAutoDetectBinPacking: false FixNamespaceComments: true ForEachMacros: [] IfMacros: [] IncludeBlocks: Preserve IndentAccessModifiers: false IndentCaseBlocks: true IndentCaseLabels: true IndentExternBlock: NoIndent IndentPPDirectives: BeforeHash IndentWrappedFunctionNames: true KeepEmptyLinesAtTheStartOfBlocks: true LambdaBodyIndentation: Signature MacroBlockBegin: '' MacroBlockEnd: '' MaxEmptyLinesToKeep: 1 NamespaceIndentation: None PPIndentWidth: -1 PackConstructorInitializers: NextLine PenaltyBreakBeforeFirstCallParameter: 19 PenaltyBreakComment: 300 PenaltyBreakFirstLessLess: 120 PenaltyBreakString: 1000 PenaltyExcessCharacter: 1000000 PenaltyReturnTypeOnItsOwnLine: 60 PointerAlignment: Left QualifierAlignment: Leave ReferenceAlignment: Pointer ReflowComments: false ShortNamespaceLines: 0 SortIncludes: CaseSensitive SortUsingDeclarations: true SpaceAfterCStyleCast: false SpaceAfterLogicalNot: false SpaceAfterTemplateKeyword: false SpaceAroundPointerQualifiers: Default SpaceBeforeAssignmentOperators: true SpaceBeforeCaseColon: false SpaceBeforeCpp11BracedList: false SpaceBeforeCtorInitializerColon: true SpaceBeforeInheritanceColon: true SpaceBeforeParens: Never SpaceBeforeRangeBasedForLoopColon: true SpaceBeforeSquareBrackets: false SpaceInEmptyBlock: false SpaceInEmptyParentheses: false SpacesBeforeTrailingComments: 1 SpacesInAngles: Never SpacesInCStyleCastParentheses: false SpacesInConditionalStatement: false SpacesInContainerLiterals: true SpacesInParentheses: false SpacesInSquareBrackets: false --- ================================================ FILE: .github/CODEOWNERS ================================================ * @ROCm/rccl-reviewers # 
Documentation files docs/ @ROCm/rocm-documentation *.md @ROCm/rocm-documentation *.rst @ROCm/rocm-documentation .readthedocs.yaml @ROCm/rocm-documentation src/include/api_trace.h @ROCm/ROCM-DevTools-Team ================================================ FILE: .github/PULL_REQUEST_TEMPLATE.md ================================================ ## Details ___Do not mention proprietary info or link to internal work items in this PR.___ **Work item:** _"Internal", or link to GitHub issue (if applicable)._ **What were the changes?** _One sentence describing the work done._ **Why were the changes made?** _Explain the motivation behind the work. Provide any publicly-available historical context._ **How was the outcome achieved?** _Technical details behind the work. Explain any publicly-available hardware peculiarities._ **Additional Documentation:** _What else should the reviewer know?_ ## Approval Checklist ___Do not approve until these items are satisfied.___ - [ ] Verify the CHANGELOG has been updated, if - there are any NCCL API version changes, - any changes impact library users, and/or - any changes impact any other ROCm library. ================================================ FILE: .github/dependabot.yml ================================================ ================================================ FILE: .github/scripts/therock_configure_ci.py ================================================ # Copyright (c) Advanced Micro Devices, Inc., or its affiliates. # SPDX-License-Identifier: MIT import fnmatch import json import os from pathlib import Path import subprocess import sys from typing import Iterable, Optional, Mapping def gha_set_output(vars: Mapping[str, str | Path]): """Sets values in a step's output parameters. This appends to the file located at the $GITHUB_OUTPUT environment variable. 
See * https://docs.github.com/en/actions/reference/workflow-commands-for-github-actions#setting-an-output-parameter * https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/passing-information-between-jobs """ print(f"Setting github output:\n{vars}") step_output_file = os.getenv("GITHUB_OUTPUT") if not step_output_file: print(" Warning: GITHUB_OUTPUT env var not set, can't set github outputs") return with open(step_output_file, "a") as f: f.writelines(f"{k}={str(v)}" + "\n" for k, v in vars.items()) def get_modified_paths(base_ref: str) -> Optional[Iterable[str]]: """Returns the paths of modified files relative to the base reference.""" try: return subprocess.run( ["git", "diff", "--name-only", base_ref], stdout=subprocess.PIPE, check=True, text=True, timeout=60, ).stdout.splitlines() except TimeoutError: print( "Computing modified files timed out. Not using PR diff to determine" " jobs to run.", file=sys.stderr, ) return None GITHUB_WORKFLOWS_CI_PATTERNS = [ "therock*.yml", ] def is_path_workflow_file_related_to_ci(path: str) -> bool: return any( fnmatch.fnmatch(path, ".github/workflows/" + pattern) for pattern in GITHUB_WORKFLOWS_CI_PATTERNS ) def check_for_workflow_file_related_to_ci(paths: Optional[Iterable[str]]) -> bool: if paths is None: return False return any(is_path_workflow_file_related_to_ci(p) for p in paths) # Paths matching any of these patterns are considered to have no influence over # build or test workflows so any related jobs can be skipped if all paths # modified by a commit/PR match a pattern in this list. 
SKIPPABLE_PATH_PATTERNS = [ "docs/*", "*.gitignore", "*.md", "*LICENSE*", "*NOTICES*", '.github/CODEOWNERS', '.github/*.md', '.github/dependabot.yml', '.azuredevops*', ] def is_path_skippable(path: str) -> bool: """Determines if a given relative path to a file matches any skippable patterns.""" return any(fnmatch.fnmatch(path, pattern) for pattern in SKIPPABLE_PATH_PATTERNS) def check_for_non_skippable_path(paths: Optional[Iterable[str]]) -> bool: """Returns true if at least one path is not in the skippable set.""" if paths is None: return False return any(not is_path_skippable(p) for p in paths) def should_ci_run_given_modified_paths(paths: Optional[Iterable[str]]) -> bool: """Returns true if CI workflows should run given a list of modified paths.""" if paths is None: print("No files were modified, skipping TheRock CI jobs") return False paths_set = set(paths) github_workflows_paths = set( [p for p in paths if p.startswith(".github/workflows")] ) other_paths = paths_set - github_workflows_paths related_to_ci = check_for_workflow_file_related_to_ci(github_workflows_paths) contains_other_non_skippable_files = check_for_non_skippable_path(other_paths) print("should_ci_run_given_modified_paths findings:") print(f" contains_other_non_skippable_files: {contains_other_non_skippable_files}") if related_to_ci: print("Enabling build jobs since a related workflow file was modified") return True elif contains_other_non_skippable_files: print("Enabling TheRock CI jobs since a non-skippable path was modified") return True else: print( "Only unrelated and/or skippable paths were modified, skipping TheRock CI jobs" ) return False def main(args): base_ref = args.get("base_ref") modified_paths = get_modified_paths(base_ref) print("modified_paths (max 200):", modified_paths[:200]) enable_jobs = should_ci_run_given_modified_paths(modified_paths) output = { 'enable_therock_ci': json.dumps(enable_jobs) } gha_set_output(output) if __name__ == "__main__": args = {} args["base_ref"] = 
os.environ.get("BASE_REF", "HEAD^1") main(args) ================================================ FILE: .github/workflows/therock-ci-linux.yml ================================================ name: TheRock CI Linux on: workflow_call: inputs: amdgpu_families: type: string artifact_group: type: string extra_cmake_options: type: string permissions: contents: read jobs: therock-build-linux: name: Build Linux Packages runs-on: azure-linux-scale-rocm permissions: id-token: write container: image: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:1f1ce0ab151146c7f86ee4345be74c42d8ca83200d9d26843e8a71df01ecad4e options: -v /runner/config:/home/awsconfig/ env: AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }} TEATIME_FORCE_INTERACTIVE: 0 AWS_SHARED_CREDENTIALS_FILE: /home/awsconfig/credentials.ini CACHE_DIR: ${{ github.workspace }}/.container-cache # The ccache.conf will be written by setup_ccache.py before this gets used. CCACHE_CONFIGPATH: ${{ github.workspace }}/.ccache/ccache.conf steps: - name: Checkout TheRock repository uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: repository: "ROCm/TheRock" ref: ff46daa79b4c826c4f4676893d0d6586de567dfa # 2026-01-12 commit - name: Checkout rccl repository uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: repository: "ROCm/rccl" path: rccl - name: Checkout rccl-tests repository uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: repository: "ROCm/rccl-tests" path: rccl-tests - name: Install python deps run: | pip install -r requirements.txt # safe.directory must be set before Runner Health Status - name: Adjust git config run: | git config --global --add safe.directory $PWD git config fetch.parallel 10 - name: Setup ccache run: | ./build_tools/setup_ccache.py \ --config-preset "github-oss-presubmit" \ --dir "$(dirname $CCACHE_CONFIGPATH)" \ --local-path "$CACHE_DIR/ccache" - name: Runner health status run: | ./build_tools/health_status.py - name: 
Fetch sources run: | ./build_tools/fetch_sources.py --jobs 12 - name: Configure Projects env: amdgpu_families: ${{ env.AMDGPU_FAMILIES }} package_version: ADHOCBUILD extra_cmake_options: ${{ inputs.extra_cmake_options }} BUILD_DIR: build run: | python3 build_tools/github_actions/build_configure.py - name: Build therock-dist run: cmake --build build - name: Build therock-archives run: cmake --build build --target therock-archives - name: Report #if: ${{ !cancelled() }} run: | echo "Full SDK du:" echo "------------" du -h -d 1 build/dist/rocm echo "Artifact Archives:" echo "------------------" ls -lh build/artifacts/*.tar.xz echo "Artifacts:" echo "----------" du -h -d 1 build/artifacts echo "CCache Stats:" echo "-------------" ccache -s -v tail -v -n +1 .ccache/compiler_check_cache/* > build/logs/ccache_compiler_check_cache.log - name: Configure AWS Credentials for non-forked repos if: ${{ always() && !github.event.pull_request.head.repo.fork }} uses: aws-actions/configure-aws-credentials@7474bc4690e29a8392af63c5b98e7449536d5c3a # v4.3.1 with: aws-region: us-east-2 role-to-assume: arn:aws:iam::692859939525:role/therock-artifacts-external - name: Post Build Upload if: always() run: | python3 build_tools/github_actions/post_build_upload.py \ --run-id ${{ github.run_id }} \ --artifact-group ${{ env.AMDGPU_FAMILIES }} \ --build-dir build \ --upload therock-test-linux-multi-node: name: "Test multi-node" if: ${{ inputs.amdgpu_families == 'gfx950-dcgpu' }} permissions: contents: read id-token: write needs: [therock-build-linux] uses: ./.github/workflows/therock-test-packages-multi-node.yml with: amdgpu_families: ${{ inputs.amdgpu_families }} artifact_group: ${{ inputs.artifact_group }} test_runs_on: nova-linux-slurm-scale-runner artifact_run_id: ${{ github.run_id }} therock-test-linux-single-node: name: "Test single-node" if: ${{ inputs.amdgpu_families == 'gfx94X-dcgpu' }} needs: [therock-build-linux] uses: ./.github/workflows/therock-test-packages-single-node.yml with: 
amdgpu_families: ${{ inputs.amdgpu_families }} artifact_group: ${{ inputs.artifact_group }} test_runs_on: linux-mi325-4gpu-ossci-rocm artifact_run_id: ${{ github.run_id }} ================================================ FILE: .github/workflows/therock-ci.yml ================================================ name: TheRock CI for rccl on: push: branches: - develop pull_request: types: - labeled - opened - synchronize workflow_dispatch: permissions: contents: read concurrency: # A PR number if a pull request and otherwise the commit hash. This cancels # queued and in-progress runs for the same PR (presubmit) or commit # (postsubmit). The workflow name is prepended to avoid conflicts between # different workflows. group: ${{ github.workflow }}-${{ github.event.number || github.sha }} cancel-in-progress: true jobs: setup: runs-on: ubuntu-24.04 env: # The commit being checked out is the merge commit for a PR. Its first # parent will be the tip of the base branch. BASE_REF: HEAD^ outputs: enable_therock_ci: ${{ steps.configure.outputs.enable_therock_ci }} steps: - name: "Checking out repository" uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: # We need the parent commit to do a diff fetch-depth: 2 - name: "Configuring CI options" id: configure run: python .github/scripts/therock_configure_ci.py therock-ci-linux: name: TheRock CI Linux needs: setup if: ${{ needs.setup.outputs.enable_therock_ci == 'true' }} permissions: contents: read id-token: write strategy: fail-fast: false matrix: amdgpu_family: [gfx94X-dcgpu, gfx950-dcgpu] uses: ./.github/workflows/therock-ci-linux.yml secrets: inherit with: amdgpu_families: ${{ matrix.amdgpu_family }} artifact_group: ${{ matrix.amdgpu_family }} extra_cmake_options: > -DTHEROCK_ENABLE_ALL=OFF -DTHEROCK_BUILD_TESTING=ON -DTHEROCK_BUNDLE_SYSDEPS=ON -DTHEROCK_ENABLE_COMM_LIBS=ON -DTHEROCK_ENABLE_ROCPROFV3=ON -DTHEROCK_USE_EXTERNAL_RCCL=ON -DTHEROCK_USE_EXTERNAL_RCCL_TESTS=ON -DTHEROCK_RCCL_SOURCE_DIR=./rccl 
-DTHEROCK_RCCL_TESTS_SOURCE_DIR=./rccl-tests -DTHEROCK_ENABLE_MPI=ON therock_ci_summary: name: TheRock CI Summary if: always() needs: - setup - therock-ci-linux runs-on: ubuntu-24.04 steps: - name: Output failed jobs run: | echo '${{ toJson(needs) }}' FAILED_JOBS="$(echo '${{ toJson(needs) }}' \ | jq --raw-output \ 'map_values(select(.result!="success" and .result!="skipped")) | keys | join(",")' \ )" if [[ "${FAILED_JOBS}" != "" ]]; then echo "The following jobs failed: ${FAILED_JOBS}" exit 1 fi ================================================ FILE: .github/workflows/therock-test-packages-multi-node.yml ================================================ name: TheRock Test Packages multi-node on: workflow_call: inputs: amdgpu_families: type: string artifact_group: type: string test_runs_on: type: string artifact_run_id: type: string workflow_dispatch: inputs: amdgpu_families: type: string artifact_group: type: string test_runs_on: type: string artifact_run_id: type: string permissions: contents: read id-token: write jobs: test_rccl_multi_node: name: 'Test multi-node' runs-on: ${{ inputs.test_runs_on }} defaults: run: shell: bash permissions: contents: read id-token: write env: VENV_DIR: ${{ github.workspace }}/.venv ARTIFACT_RUN_ID: "${{ inputs.artifact_run_id }}" OUTPUT_ARTIFACTS_DIR: /apps/cvs_tests/dist_new/dist/rocm THEROCK_BIN_DIR: "./build/bin" AWS_SHARED_CREDENTIALS_FILE: /apps/cvs_tests/awsconfig/credentials.ini steps: - name: Checkout Repository uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: repository: "ROCm/TheRock" ref: ff46daa79b4c826c4f4676893d0d6586de567dfa # 2026-01-12 commit - name: Run setup test environment workflow uses: './.github/actions/setup_test_environment' with: ARTIFACT_RUN_ID: ${{ env.ARTIFACT_RUN_ID }} AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }} ARTIFACT_GROUP: ${{ inputs.artifact_group }} OUTPUT_ARTIFACTS_DIR: ${{ env.OUTPUT_ARTIFACTS_DIR }} VENV_DIR: ${{ env.VENV_DIR }} FETCH_ARTIFACT_ARGS: "--rccl 
--tests" IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }} # The following step leverages slurm to run multi node rccl tests on the slurm mi350x cluster. # salloc will hold 4 nodes while the commands inside the block run. After the block completes, salloc automatically releases the nodes. # sbatch script runs rccl_heatmap_cvs script which validates and generates a bandwidth heatmap file for different rccl collectives - name: Test gfx950 if: ${{ inputs.amdgpu_families == 'gfx950-dcgpu' }} run: | SETUP_NODES=1 sbatch --wait -N4 /apps/cvs_tests/cvs-sbatch/sbatch/default.sbatch - name: Configure AWS Credentials for non-forked repos if: ${{ always() && !github.event.pull_request.head.repo.fork }} uses: aws-actions/configure-aws-credentials@7474bc4690e29a8392af63c5b98e7449536d5c3a # v4.3.1 with: aws-region: us-east-2 role-to-assume: arn:aws:iam::692859939525:role/therock-artifacts-external - name: Post test report upload if: always() working-directory: ${{ github.workspace }} run: | export PYTHONPATH="${PYTHONPATH}:${{ github.workspace }}/build_tools" python3 build_tools/github_actions/upload_test_report_script.py \ --run-id "${{ github.run_id }}" \ --amdgpu-family "${{ inputs.amdgpu_families }}" \ --report-path "/apps/cvs_tests/test_reports" \ --log-destination "/logs/gfx950-dcgpu" \ --index-file-name "index_rccl_test_report.html" ================================================ FILE: .github/workflows/therock-test-packages-single-node.yml ================================================ name: TheRock Test Packages single-node on: workflow_call: inputs: amdgpu_families: type: string artifact_group: type: string test_runs_on: type: string artifact_run_id: type: string workflow_dispatch: inputs: amdgpu_families: type: string artifact_group: type: string test_runs_on: type: string artifact_run_id: type: string permissions: contents: read jobs: test_rccl_single_node: name: 'Test single-node' runs-on: ${{ inputs.test_runs_on }} container: image: 
ghcr.io/rocm/no_rocm_image_ubuntu24_04@sha256:4150afe4759d14822f0e3f8930e1124f26e11f68b5c7b91ec9a02b20b1ebbb98 options: --ipc host --group-add video --device /dev/kfd --device /dev/dri --group-add 110 --ulimit memlock=-1:-1 --security-opt seccomp=unconfined --env-file /etc/podinfo/gha-gpu-isolation-settings --user 0:0 defaults: run: shell: bash env: VENV_DIR: ${{ github.workspace }}/.venv ARTIFACT_RUN_ID: "${{ inputs.artifact_run_id }}" OUTPUT_ARTIFACTS_DIR: "./build" THEROCK_BIN_DIR: "./build/bin" steps: - name: Checkout Repository uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: repository: "ROCm/TheRock" ref: ff46daa79b4c826c4f4676893d0d6586de567dfa # 2026-01-12 commit - name: Run setup test environment workflow uses: './.github/actions/setup_test_environment' with: ARTIFACT_RUN_ID: ${{ env.ARTIFACT_RUN_ID }} AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }} ARTIFACT_GROUP: ${{ inputs.artifact_group }} OUTPUT_ARTIFACTS_DIR: ${{ env.OUTPUT_ARTIFACTS_DIR }} VENV_DIR: ${{ env.VENV_DIR }} FETCH_ARTIFACT_ARGS: "--rccl --tests" IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }} - name: Test timeout-minutes: 15 # Currently, TheRock CI in RCCL always builds with MPI-supported enabled which causes the # RCCL correctness tests to fail on the mi325 runners which don't have MPI pre-installed. # TODO (geomin12): Rebuild rccl-tests without MPI to enable RCCL correctness tests. run: | pytest ./build_tools/github_actions/test_executable_scripts/test_rccl.py -v -s \ -k "not test_rccl_correctness_tests" \ --log-cli-level=info ================================================ FILE: .gitignore ================================================ # Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved. 
*.gcov /coverage/ build/ ext/ src/transport/net_ib_rocm.cc # Visual Studio Code .vscode ================================================ FILE: .gitmodules ================================================ [submodule "ext-src/mscclpp"] path = ext-src/mscclpp url = https://github.com/microsoft/mscclpp.git ignore = dirty shallow = true [submodule "ext-src/json"] path = ext-src/json url = https://github.com/nlohmann/json.git ignore = dirty shallow = true [submodule "ext-src/rocSHMEM"] path = ext-src/rocSHMEM url = https://github.com/ROCm/rocSHMEM.git branch = develop ================================================ FILE: .readthedocs.yaml ================================================ # Read the Docs configuration file # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details version: 2 build: os: ubuntu-22.04 tools: python: "3.10" sphinx: configuration: docs/conf.py formats: [htmlzip, pdf, epub] python: install: - requirements: docs/sphinx/requirements.txt ================================================ FILE: CHANGELOG.md ================================================ # Changelog for RCCL Full documentation for RCCL is available at [https://rccl.readthedocs.io](https://rccl.readthedocs.io) ## Unreleased - RCCL 2.28.3 for ROCm 7.11 ### Known issues * AllGather regression for small message sizes (less than 1 MB) due to the Direct algorithm. * ROCTx feature needs to be verified. * Profiler plugin needs to be verified. ### Changed * Compatibility with NCCL 2.28.3. * The MSCCL feature is now disabled by default. The `--disable-msccl-kernel` build flag is replaced with `--enable-msccl-kernel` in the `rccl/install.sh` script. * MSCCL and NPKIT are deprecated and will be removed in a future release of RCCL. ## Unreleased - RCCL 2.27.7 for ROCm 7.2.0 ### Changed * RCCL error messages have been made more verbose in several cases. RCCL now prints out fatal error messages by default. Fatal error messages can be suppressed by setting `NCCL_DEBUG=NONE`. 
* Disabled `reduceCopyPacks` pipelining for `gfx950`. * Experimental support for traffic shaping using warp specialization (also known as WarpSpeed) is now available for the Ring algorithm. * Enabling WarpSpeed in auto mode using RCCL_WARP_SPEED_AUTO optimizes performance and reduces the CU count by 50% on a single node for AllReduce, AllGather from 64MB, and ReduceScatter from 256MB. * The following configuration knobs control WarpSpeed behavior for debugging purposes: `RCCL_WARP_SPEED_ENABLE`, `RCCL_UNROLL_FACTOR`, `RCCL_WARP_SPEED_CU_COUNT`, and `RCCL_THREADS_PER_BLOCK`. Note that the effective unroll factor is calculated as 2 raised to the value of `RCCL_UNROLL_FACTOR`. ### Known issues * AllToAllv/AlltoAll for single GPU is hanging. ## Unreleased - RCCL 2.27.7 for ROCm 7.1.1 ### Changed * Enabling P2P batching with `RCCL_P2P_BATCH_ENABLE=1` is only applicable up to 32 nodes. ### Resolved Issues * Fixed crash when using the librccl-profiler plugin with the all-to-all collective after the 2.27 update. ## RCCL 2.27.7 for ROCm 7.1.0 ### Added * Added `RCCL_IB_QPS_PER_P2P` to set the number of QPs per connection for P2P operations. When set (≥1), P2P operations (Send/Recv) use `RCCL_IB_QPS_PER_P2P`, while other collective operations continue to use `NCCL_IB_QPS_PER_CONNECTION`. When not set, `NCCL_IB_QPS_PER_CONNECTION` applies to all operations. * Added `RCCL_FORCE_ENABLE_DMABUF` as a debugging feature if the user wants to explicitly enable DMABUF and forego system/kernel checks. * Added `RCCL_P2P_BATCH_THRESHOLD` to set the message size limit for batching P2P operations. This mainly affects small message performance for alltoall at a large scale but also applies to alltoallv. * Added `RCCL_P2P_BATCH_ENABLE` to enable batching P2P operations to receive performance gains for smaller messages up to 4MB for alltoall when the workload requires it. This is to avoid performance dips for larger messages. 
* Added `RCCL_CHANNEL_TUNING_ENABLE` to enable channel tuning that overrides RCCL's internal adjustments based on threadThreshold. ### Changed * The MSCCL++ feature is now disabled by default. The `--disable-mscclpp` build flag is replaced with `--enable-mscclpp` in the `rccl/install.sh` script. * Compatibility with NCCL 2.27.7. ### Optimized * Enabled and optimized batched P2P operations to improve small message performance for AllToAll and AllGather. * Optimized channel count selection to improve efficiency for small to medium message sizes in ReduceScatter. * Changed code inlining to improve latency for small message sizes for AllReduce, AllGather, and ReduceScatter. ### Known issues * Symmetric memory kernels are currently disabled due to ongoing CUMEM enablement work. * When running this version of RCCL using ROCm versions earlier than 6.4.0, the user must set the environment flag `HSA_NO_SCRATCH_RECLAIM=1`. ## RCCL 2.26.6 for ROCm 7.0.0 ### Resolved issues * Resolved an issue when using more than 64 channels when multiple collectives are used in the same `ncclGroup()` call. * Fixed unit test failures in tests ending with `ManagedMem` and `ManagedMemGraph` suffixes. * Suboptimal algorithmic switching point for AllReduce on MI300x. * Fixed the known issue "When splitting a communicator using `ncclCommSplit` in some GPU configurations, MSCCL initialization can cause a segmentation fault." with a design change to use `comm` instead of `rank` for `mscclStatus`. The Global map for `comm` to `mscclStatus` is still not thread safe but should be explicitly handled by mutexes for read writes. This is tested for correctness, but there is a plan to use a thread-safe map data structure in upcoming changes. * Fixed broken functionality within the LL protocol on gfx950 by disabling inlining of LLGenericOp kernels. ### Added * Added new GPU target `gfx950`. 
* Added support for `unroll=1` in device-code generation to improve performance. * Set a default of 112 channels for a single node with `8 * gfx950`. * Enabled LL128 protocol on `gfx950`. * Added MSCCL support for AllGather multinode gfx942/gfx950 (i.e., 16 and 32 GPUs). To enable, set the environment variable `RCCL_MSCCL_FORCE_ENABLE=1`. Max message size for MSCCL AllGather usage is `12292 * sizeof(datatype) * nGPUs`. * Thread thresholds for LL/LL128 are selected in Tuning Models for the MI300X. This impacts the number of channels used for AG and RS. Channel tuning model is bypassed if `NCCL_THREAD_THRESHOLDS`, `NCCL_MIN_NCHANNELS`, or `NCCL_MAX_NCHANNELS` are set. * Multi-node tuning for AllGather, AllReduce, and ReduceScatter that leverages LL/LL64/LL128 protocol to use nontemporal vector load/store for tunable message size ranges. * LL/LL128 usage ranges for AR, AG, and RS are part of the tuning models, which enable architecture-specific tuning in conjunction with the existing Rome Models scheme in RCCL. * Two new APIs are exposed as part of an initiative to separate RCCL code. These APIs are `rcclGetAlgoInfo` and `rcclFuncMaxSendRecvCount`. However, user-level invocation requires that RCCL be built with `RCCL_EXPOSE_STATIC` enabled. * Enabled double-buffering in `reduceCopyPacks` to trigger pipelining, especially to overlap `bf16` arithmetic and bridge the gap between `fp32` performance and `bf16` for both `gfx942` and `gfx950`. Pipelining has been made tunable via `rcclSetPipelining`, similar to algorithms/protocols so that regression is avoided in certain message sizes. * Added a direct allgather algorithm. This is enabled by default for multi-node if there are 16 nodes or fewer. The message size threshold is 4MB. * Added `RCCL_OVERRIDE_PROTO` and `RCCL_OVERRIDE_ALGO` to allow direct replacement of protocol and algorithm choices. 
Unlike `NCCL_PROTO` and `NCCL_ALGO`, which re-run the model across enabled combinations and may not guarantee the intended override, these new options enforce the specified selections explicitly. ### Changed * Compatibility with NCCL 2.23.4. * Compatibility with NCCL 2.24.3. * Compatibility with NCCL 2.25.1. * Compatibility with NCCL 2.26.6. ### Optimized * Improved the performance of the `FP8` Sum operation by upcasting to `FP16`. ### Known Issues * When running this version of RCCL using ROCm versions earlier than 6.4.0, the user must set the environment flag `HSA_NO_SCRATCH_RECLAIM=1`. ## RCCL 2.22.3 for ROCm 6.4.2 ### Added * Added support for the LL128 protocol on gfx942. ## RCCL 2.22.3 for ROCm 6.4.1 ### Resolved issues * Fixed the accuracy issue for MSCCLPP `allreduce7` kernel in graph mode. * Fixed IntraNet performance. * Fixed an issue where, in rare circumstances, the application could stop responding due to a proxy thread synchronization issue. ### Known issues * When splitting a communicator using `ncclCommSplit` in some GPU configurations, MSCCL initialization can cause a segmentation fault. The recommended workaround is to disable MSCCL with `export RCCL_MSCCL_ENABLE=0`. * Within the RCCL-UnitTests test suite, failures occur in tests ending with the `ManagedMem` and `ManagedMemGraph` suffixes. These failures only affect the test results and do not affect the RCCL component itself. This issue will be resolved in the next major release. ## RCCL 2.22.3 for ROCm 6.4.0 ### Added * `RCCL_SOCKET_REUSEADDR` and `RCCL_SOCKET_LINGER` environment parameters. * Setting `NCCL_DEBUG=TRACE NCCL_DEBUG_SUBSYS=VERBS` will generate traces for fifo and data `ibv_post_sends`. * Added `--log-trace` flag to enable traces through the install.sh script (e.g. `./install.sh --log-trace`). ### Changed * Compatibility with NCCL 2.22.3 * Added support for the rail-optimized tree algorithm for the MI300 series. This feature requires the use of all eight GPUs within each node. 
It limits NIC traffic to use only GPUs of the same index across nodes and should not impact performance on non-rail-optimized network topologies. The original method of building trees can be enabled by setting the environment variable `RCCL_DISABLE_RAIL_TREES=1`. * Additional debug information about how the trees are built can be logged to the GRAPH logging subsys by setting `RCCL_OUTPUT_TREES=1`. * Added documentation about the performance benefits of the NPS4 and CPX partition modes on the MI300X. ## RCCL 2.21.5 for ROCm 6.3.1 ### Added ### Changed * Enhanced user documentation ### Resolved issues * Corrected user help strings in `install.sh` ## RCCL 2.21.5 for ROCm 6.3.0 ### Added * MSCCL++ integration for AllReduce and AllGather on gfx942 * Performance collection to rccl_replayer * Tuner Plugin example for MI300 * Tuning table for large number of nodes * Support for amdclang++ * Allow NIC ID remapping using `NCCL_RINGS_REMAP` environment variable ### Changed * Compatibility with NCCL 2.21.5 * Increased channel count for MI300X multi-node * Enabled MSCCL for single-process multi-threaded contexts * Enabled gfx12 * Enabled CPX mode for MI300X * Enabled tracing with rocprof * Improved version reporting * Enabled GDRDMA for Linux kernel 6.4.0+ ### Resolved issues * Fixed model matching with PXN enable ## RCCL 2.20.5 for ROCm 6.2.1 ### Fixed - GDR support flag now set with DMABUF ### Known issues - On systems running Linux kernel 6.8.0, such as Ubuntu 24.04, Direct Memory Access (DMA) transfers between the GPU and NIC are disabled, which impacts multi-node RCCL performance. - This issue was reproduced with RCCL 2.20.5 (ROCm 6.2.0 and 6.2.1) on systems with Broadcom Thor-2 NICs and affects other systems with RoCE networks using Linux 6.8.0 or newer. - Older RCCL versions are also impacted. - This issue will be addressed in a future ROCm release. 
## RCCL 2.20.5 for ROCm 6.2.0 ### Changed - Compatibility with NCCL 2.20.5 - Compatibility with NCCL 2.19.4 - Performance tuning for some collective operations on MI300 - Enabled NVTX code in RCCL - Replaced rccl_bfloat16 with hip_bfloat16 - NPKit updates: - Warm-up iteration removal is no longer enabled by default; it must now be opted into - Doubled the size of buffers to accommodate more channels - Modified rings to be rail-optimized topology friendly - Replaced ROCmSoftwarePlatform links with ROCm links ### Added - Support for fp8 and rccl_bfloat8 - Support for using HIP contiguous memory - Implemented ROC-TX for host-side profiling - Enabled static build - Added new rome model - Added fp16 and fp8 cases to unit tests - New unit test for main kernel stack size - New -n option for topo_expl to override # of nodes - Improved debug messages of memory allocations ### Fixed - Bug when configuring RCCL for only LL128 protocol - Scratch memory allocation after API change for MSCCL ## RCCL 2.18.6 for ROCm 6.1.0 ### Changed - Compatibility with NCCL 2.18.6 ## RCCL 2.18.3 for ROCm 6.0.0 ### Changed - Compatibility with NCCL 2.18.3 ## RCCL 2.17.1-1 for ROCm 5.7.0 ### Changed - Compatibility with NCCL 2.17.1-1 - Performance tuning for some collective operations ### Added - Minor improvements to MSCCL codepath - NCCL_NCHANNELS_PER_PEER support - Improved compilation performance - Support for gfx94x ### Fixed - Potential race-condition during ncclSocketClose() ## RCCL 2.16.2 for ROCm 5.6.0 ### Changed - Compatibility with NCCL 2.16.2 ### Fixed - Remove workaround and use indirect function call ## RCCL 2.15.5 for ROCm 5.5.0 ### Changed - Compatibility with NCCL 2.15.5 - Unit test executable renamed to rccl-UnitTests ### Added - HW-topology aware binary tree implementation - Experimental support for MSCCL - New unit tests for hipGraph support - NPKit integration ### Fixed - rocm-smi ID conversion - Support for HIP_VISIBLE_DEVICES for unit tests - Support for p2p transfers to non (HIP) visible 
devices ### Removed - Removed TransferBench from tools. Exists in standalone repo: https://github.com/ROCm/TransferBench ## RCCL-2.13.4 for ROCm 5.4.0 ### Changed - Compatibility with NCCL 2.13.4 - Improvements to RCCL when running with hipGraphs - RCCL_ENABLE_HIPGRAPH environment variable is no longer necessary to enable hipGraph support - Minor latency improvements ### Fixed - Resolved potential memory access error due to asynchronous memset ## RCCL-2.12.10 for ROCm 5.3.0 ### Changed - Improvements to LL128 algorithms ### Added - Adding initial hipGraph support via opt-in environment variable RCCL_ENABLE_HIPGRAPH - Integrating with NPKit (https://github.com/microsoft/NPKit) profiling code ## RCCL-2.12.10 for ROCm 5.2.3 ### Added - Compatibility with NCCL 2.12.10 - Packages for test and benchmark executables on all supported OSes using CPack. - Adding custom signal handler - opt-in with RCCL_ENABLE_SIGNALHANDLER=1 - Additional details provided if Binary File Descriptor library (BFD) is pre-installed - Adding support for reusing ports in NET/IB channels - Opt-in with NCCL_IB_SOCK_CLIENT_PORT_REUSE=1 and NCCL_IB_SOCK_SERVER_PORT_REUSE=1 - When "Call to bind failed : Address already in use" error happens in large-scale AlltoAll (e.g., >=64 MI200 nodes), users are suggested to opt-in either one or both of the options to resolve the massive port usage issue - Avoid using NCCL_IB_SOCK_SERVER_PORT_REUSE when NCCL_NCHANNELS_PER_NET_PEER is tuned >1 ### Removed - Removed experimental clique-based kernels ## RCCL-2.11.4 for ROCm 5.2.0 ### Changed - Unit testing framework rework - Minor bug fixes ### Known issues - Managed memory is not currently supported for clique-based kernels ## RCCL-2.11.4 for ROCm 5.1.0 ### Added - Compatibility with NCCL 2.11.4 ### Known issues - Managed memory is not currently supported for clique-based kernels ## RCCL-2.10.3 for ROCm 5.0.0 ### Added - Compatibility with NCCL 2.10.3 ### Known issues - Managed memory is not currently supported for 
clique-based kernels ## RCCL-2.9.9 for ROCm 4.5.0 ### Changed - Packaging split into a runtime package called rccl and a development package called rccl-devel. The development package depends on runtime. The runtime package suggests the development package for all supported OSes except CentOS 7 to aid in the transition. The suggests feature in packaging is introduced as a deprecated feature and will be removed in a future ROCm release. ### Added - Compatibility with NCCL 2.9.9 ### Known issues - Managed memory is not currently supported for clique-based kernels ## [RCCL-2.8.4 for ROCm 4.3.0] ### Added - Ability to select the number of channels to use for clique-based all reduce (RCCL_CLIQUE_ALLREDUCE_NCHANNELS). This can be adjusted to tune for performance when computation kernels are being executed in parallel. ### Optimizations - Additional tuning for clique-based kernel AllReduce performance (still requires opt in with RCCL_ENABLE_CLIQUE=1) - Modification of default values for number of channels / byte limits for clique-based all reduce based on device architecture ### Changed - Replaced RCCL_FORCE_ENABLE_CLIQUE with RCCL_CLIQUE_IGNORE_TOPO - Clique-based kernels can now be enabled on topologies where all active GPUs are XGMI-connected - Topologies not normally supported by clique-based kernels require RCCL_CLIQUE_IGNORE_TOPO=1 ### Fixed - Install script '-r' flag invoked alone no longer incorrectly deletes any existing builds. 
### Known issues - Managed memory is not currently supported for clique-based kernels ## [RCCL-2.8.4 for ROCm 4.2.0] ### Added - Compatibility with NCCL 2.8.4 ### Optimizations - Additional tuning for clique-based kernels - Enabling GPU direct RDMA read from GPU - Fixing potential memory leak issue when re-creating multiple communicators within same process - Improved topology detection ### Known issues - None ## [RCCL-2.7.8 for ROCm 4.1.0] ### Added - Experimental support for clique-based kernels (opt in with RCCL_ENABLE_CLIQUE=1) - Clique-based kernels may offer better performance for smaller input sizes - Clique-based kernels are currently only enabled for AllReduce under a certain byte limit (controlled via RCCL_CLIQUE_ALLREDUCE_BYTE_LIMIT) ### Optimizations - Performance improvements for Rome-based systems ### Known issues - Clique-based kernels are currently experimental and have not been fully tested on all topologies. By default, clique-based kernels are disabled if the detected topology is not supported (override with RCCL_FORCE_ENABLE_CLIQUE) - Clique-based kernels may hang if there are differences between environment variables set across ranks. - Clique-based kernels may fail if the input / output device pointers are not the base device pointers returned by hipMalloc. 
## [RCCL-2.7.8 for ROCm 3.9.0] ### Added - Adding support for alltoallv RCCL kernel ### Optimizations - Modifications to topology based on XGMI links ### Known issues - None ## [RCCL-2.7.6 for ROCm 3.8.0] ### Added - Support for static library builds ### Known issues - None ## [RCCL-2.7.6 for ROCm 3.7.0] ### Added - Updated to RCCL API version of 2.7.6 - Added gather, scatter and all-to-all collectives ## [RCCL-2.7.0 for ROCm 3.6.0] ### Added - Updated to RCCL API version of 2.6.4 ## [RCCL-2.7.0 for ROCm 3.5.0] ### Added - Compatibility with NCCL 2.6 - Network interface improvements with API v3 ### Optimizations - Fixing issues and build time improvements for hip-clang - Network topology detection - Improved CPU type detection - Infiniband adaptive routing support ### Changed - Switched to hip-clang as default compiler ### Deprecated - Deprecated hcc build ================================================ FILE: CMakeLists.txt ================================================ # Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved. # Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License. 
# CMake version minimum requirements
#==================================================================================================
cmake_minimum_required(VERSION 3.16)

# CMake Toolchain file to define compilers and path to ROCm
#==================================================================================================
# Only applied when the caller did not already provide CMAKE_TOOLCHAIN_FILE.
# Must be set before project() for the toolchain file to take effect.
if (NOT CMAKE_TOOLCHAIN_FILE)
  set(CMAKE_TOOLCHAIN_FILE "${CMAKE_CURRENT_SOURCE_DIR}/toolchain-linux.cmake")
  message(STATUS "CMAKE_TOOLCHAIN_FILE: ${CMAKE_TOOLCHAIN_FILE}")
endif()

# RCCL project
#==================================================================================================
project(rccl CXX)

# Build options
#==================================================================================================
# Each option() below declares a boolean switch with the default shown in its last argument.
option(BUILD_ADDRESS_SANITIZER      "Enable address sanitizer"                      OFF)
option(BUILD_BFD                    "Enable custom backtrace (if bfd.h exists)"     OFF)
option(BUILD_LOCAL_GPU_TARGET_ONLY  "Build only for GPUs detected on this machine"  OFF)
option(BUILD_SHARED_LIBS            "Build as shared library"                       ON)
option(BUILD_TESTS                  "Build unit test programs"                      OFF)
option(COLLTRACE                    "Collective Trace Option"                       ON)
option(DUMP_ASM                     "Disassemble and dump"                          OFF)
option(ENABLE_CODE_COVERAGE         "Enable code coverage"                          OFF)
option(ENABLE_MSCCL_KERNEL          "Enable MSCCL while compiling"                  OFF)
option(ENABLE_MSCCLPP               "Enable MSCCL++"                                OFF)
option(ENABLE_MSCCLPP_CLIP          "Enable MSCCL++ CLIP"                           OFF)
option(ENABLE_MSCCLPP_EXECUTOR      "Enable MSCCL++ Executor"                       OFF)
option(ENABLE_MSCCLPP_FORMAT_CHECKS "Enable formatting checks in MSCCL++"           OFF)
option(ENABLE_NPKIT                 "Enable NPKit"                                  OFF)
option(ENABLE_IFC                   "Enable indirect function call"                 OFF)
option(GENERATE_SYM_KERNELS         "Generate symmetric memory kernels"             OFF)
option(INSTALL_DEPENDENCIES         "Force install dependencies"                    OFF)
option(REPORT_KERNEL_RESOURCE_USE   "Append -Rpass-analysis=kernel to CXX flags"    OFF)
option(ROCTX                        "Enable ROCTX"                                  ON)
option(PROFILE                      "Enable profiling"                              OFF)
option(TIMETRACE                    "Enable time-trace during compilation"          OFF)
option(TRACE                        "Enable additional tracing"                     OFF)
option(FAULT_INJECTION              "Enable fault injection"                        ON)
option(QUIET_WARNINGS               "Supress compiler warnings"                     OFF)
option(ENABLE_ROCSHMEM              "Enable rocSHMEM support in RCCL"               OFF)

# Default GPU architectures to build
#==================================================================================================
# Used as the default value of GPU_TARGETS below, and as the fallback when the
# compiler cannot be queried for supported targets.
set(DEFAULT_GPUS
  gfx906
  gfx908
  gfx90a
  gfx942
  gfx950
  gfx1030
  gfx1100
  gfx1101
  gfx1102
  gfx1200
  gfx1201)

# Load CMake modules
#==================================================================================================
include(CheckIncludeFiles)
include(CheckSymbolExists)
include(cmake/Dependencies.cmake) # GTest, rocm-cmake, rocm_local_targets
include(cmake/CheckSymbolExistsNoWarn.cmake)

# Include rocSHMEM build module only if enabled
if(ENABLE_ROCSHMEM)
  include(cmake/ROCSHMEM.cmake)
endif()

# Make the project's cmake/ directory searchable for include()/find_package() modules.
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")

# Build only for local GPU architecture
# rocm_local_targets is only called when that command exists; otherwise
# DEFAULT_GPUS is left untouched and a warning is printed.
if (BUILD_LOCAL_GPU_TARGET_ONLY)
  message(STATUS "Building only for local GPU target")
  if (COMMAND rocm_local_targets)
    rocm_local_targets(DEFAULT_GPUS)
  else()
    message(WARNING "Unable to determine local GPU targets. Falling back to default GPUs.")
  endif()
endif()

# Determine which GPU architectures to build for
# Cache entry: a GPU_TARGETS value already present in the cache (e.g. from -DGPU_TARGETS=...)
# is kept; DEFAULT_GPUS is only the initial value.
set(GPU_TARGETS "${DEFAULT_GPUS}" CACHE STRING "Target default GPUs if GPU_TARGETS is not defined.")

# ROCM NetIB patch
include(cmake/rocmIb.cmake)

# Modify GPU architectures for Address Sanitizer builds by appending "xnack+"
# Empty list entries are dropped; entries that already contain ":xnack+" are kept as-is.
if (BUILD_ADDRESS_SANITIZER)
  SET(amdgpu_targets "")
  foreach(amdgpu_target IN LISTS GPU_TARGETS)
    if(NOT amdgpu_target STREQUAL "")
      string(FIND "${amdgpu_target}" ":xnack+" HAS_XNACK_SUFFIX)
      if(HAS_XNACK_SUFFIX EQUAL -1)
        list(APPEND amdgpu_targets "${amdgpu_target}:xnack+")
      else()
        list(APPEND amdgpu_targets "${amdgpu_target}")
      endif()
    endif()
  endforeach()
  SET(GPU_TARGETS "${amdgpu_targets}")
endif()

# Check if clang compiler can offload to GPU_TARGETS
# Without rocm_check_target_ids the requested GPU_TARGETS are replaced by DEFAULT_GPUS.
if (COMMAND rocm_check_target_ids)
  message(STATUS "Checking for ROCm support for GPU targets: " "${GPU_TARGETS}")
  rocm_check_target_ids(SUPPORTED_GPUS TARGETS ${GPU_TARGETS})
else()
  message(WARNING "Unable to check for supported GPU targets. Falling back to default GPUs.")
  set(SUPPORTED_GPUS ${DEFAULT_GPUS})
endif()

# Normal (non-cache) variable; from here on it takes precedence over the GPU_TARGETS cache entry.
set(GPU_TARGETS "${SUPPORTED_GPUS}")
message(STATUS "Compiling for ${GPU_TARGETS}")

## NOTE: Reload rocm-cmake in order to update GPU_TARGETS
include(cmake/Dependencies.cmake) # Reloading to use desired GPU_TARGETS instead of defaults

# Try to establish ROCM_PATH (for find_package)
#==================================================================================================
if(NOT DEFINED ROCM_PATH)
  # Guess default location
  set(ROCM_PATH "/opt/rocm")
  message(WARNING "Unable to find ROCM_PATH: Falling back to ${ROCM_PATH}")
else()
  message(STATUS "ROCM_PATH found: ${ROCM_PATH}")
endif()
# Export to the environment of this CMake process (visible to child processes it launches).
set(ENV{ROCM_PATH} ${ROCM_PATH})

# Identify the compiler from CMAKE_CXX_COMPILER and record how to extract its version.
# COMPILER_EXE_NAME / COMPILER_GREP_STRING / COMPILER_AWK_CMD are consumed by the
# "Check for compiler version" step further down.
# Branch order matters: the unanchored ".*clang\+\+" pattern also matches paths
# containing "amdclang++", so the amdclang++ test has to come first.
if("${CMAKE_CXX_COMPILER}" MATCHES ".*amdclang\\+\\+")
  message(STATUS "Compiling with amdclang++")
  set(COMPILER_EXE_NAME amdclang++)
  set(COMPILER_GREP_STRING "AMD clang version")
  set(COMPILER_AWK_CMD "awk -F\" \" '{ printf $4}'")
elseif("${CMAKE_CXX_COMPILER}" MATCHES ".*clang\\+\\+")
  message(STATUS "Compiling with clang++")
  set(COMPILER_EXE_NAME clang++)
  set(COMPILER_GREP_STRING "AMD clang version")
  set(COMPILER_AWK_CMD "awk -F\" \" '{ printf $4}'")
elseif("${CMAKE_CXX_COMPILER}" MATCHES ".*hipcc$")
  message(STATUS "Compiling with hipcc")
  set(COMPILER_EXE_NAME hipcc)
  set(COMPILER_GREP_STRING "HIP version")
  set(COMPILER_AWK_CMD "awk -F\" \" '{ printf $3}' | awk -F\"-\" '{ printf $1}'")
else()
  message(FATAL_ERROR "RCCL can be built only with hipcc or amdclang++")
endif()

# Set CMAKE flags
#==================================================================================================
# CACHE without FORCE: an install prefix already supplied by the user is kept.
set(CMAKE_INSTALL_PREFIX "${ROCM_PATH}" CACHE PATH "")
set(CMAKE_CXX_STANDARD 17)    # We use C++17 features, this will add compile option: -std=c++17
set(CMAKE_CXX_EXTENSIONS OFF) # Without this line, it will add -std=gnu++17 instead, which has some issues.
# Add the ROCm installation to CMake's package search path.
if(ROCM_PATH)
  list(APPEND CMAKE_PREFIX_PATH # Add ROCM_PATH to CMake search paths (for finding HIP / HSA
    ${ROCM_PATH}
    ${ROCM_PATH}/hip
    ${ROCM_PATH}/llvm)
endif()

# Check for required dependencies
#==================================================================================================
## Check for Threads
set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED)

## Check for HIP
find_package(hip REQUIRED)
message(STATUS "HIP compiler: ${HIP_COMPILER}")
message(STATUS "HIP runtime: ${HIP_RUNTIME}")
if(NOT "${HIP_COMPILER}" MATCHES "clang")
  message(FATAL_ERROR "RCCL requires clang-based compiler (amdclang++ or hipcc)")
endif()

## Check for compiler version
# Runs "<compiler> --version", keeps the line containing COMPILER_GREP_STRING and
# extracts the version field with COMPILER_AWK_CMD (both chosen during compiler detection).
find_program(compiler_executable ${COMPILER_EXE_NAME})
message(STATUS "${COMPILER_EXE_NAME} executable: ${compiler_executable}")
execute_process(
  COMMAND bash "-c" "${compiler_executable} --version | grep \"${COMPILER_GREP_STRING}\" | ${COMPILER_AWK_CMD}"
  OUTPUT_VARIABLE compiler_version_string)
message(STATUS "${COMPILER_EXE_NAME} version: ${compiler_version_string}")

## Check for HIP version
# hip_version_string is everything before the first '-' in the output of "hipconfig -v".
find_program(hipconfig_executable hipconfig)
message(STATUS "hipconfig executable: ${hipconfig_executable}")
execute_process(
  COMMAND bash "-c" "${hipconfig_executable} -v | awk -F\"-\" '{ printf $1 }'"
  OUTPUT_VARIABLE hip_version_string)
message(STATUS "${COMPILER_EXE_NAME} HIP version: ${hip_version_string}")

## Check for ROCm version
# Precedence: EXPLICIT_ROCM_VERSION, then <ROCMCORE_PATH>/.info/version.
set(EXPLICIT_ROCM_VERSION "" CACHE STRING "Explicit ROCM version to compile to (auto detect if empty)")
if(NOT DEFINED ROCMCORE_PATH)
  set(ROCMCORE_PATH "${ROCM_PATH}" CACHE PATH "Path to ROCm core")
endif()
if(EXPLICIT_ROCM_VERSION)
  set(rocm_version_string "${EXPLICIT_ROCM_VERSION}")
elseif(ROCMCORE_PATH)
  message(STATUS "Reading ROCM version from ${ROCMCORE_PATH}/.info/version")
  file(READ "${ROCMCORE_PATH}/.info/version" rocm_version_string)
else()
  message(FATAL_ERROR "Could not determine ROCM version (set EXPLICIT_ROCM_VERSION or set ROCM_PATH to a valid installation)")
endif()

# The input is quoted so that an empty version string is passed as an empty argument
# (and reaches the warning below) rather than dropping the argument and aborting.
string(REGEX MATCH "([0-9]+)\\.([0-9]+)\\.([0-9]+)" rocm_version_matches "${rocm_version_string}")
if (rocm_version_matches)
  set(ROCM_MAJOR_VERSION ${CMAKE_MATCH_1})
  set(ROCM_MINOR_VERSION ${CMAKE_MATCH_2})
  set(ROCM_PATCH_VERSION ${CMAKE_MATCH_3})
  message(STATUS "ROCm version: ${ROCM_MAJOR_VERSION}.${ROCM_MINOR_VERSION}.${ROCM_PATCH_VERSION}")

  # Convert the version components to int for comparison
  # e.g. 6.2.0 -> 60200; this is the form the ROCM_VERSION comparisons below expect.
  math(EXPR ROCM_VERSION "(10000 * ${ROCM_MAJOR_VERSION}) + (100 * ${ROCM_MINOR_VERSION}) + ${ROCM_PATCH_VERSION}")
  add_definitions("-DROCM_VERSION=${ROCM_VERSION}")
else()
  # ROCM_VERSION stays undefined in this case.
  message(WARNING "Failed to extract ROCm version.")
endif()

### Required for checking HIP device symbols when building with amdclang++
set(CMAKE_REQUIRED_LIBRARIES hip::device)

### Check for hipDeviceMallocUncached support
check_symbol_exists("hipDeviceMallocUncached" "hip/hip_runtime_api.h" HIP_UNCACHED_MEMORY)

### Check for hipHostMallocUncached support
check_symbol_exists("hipHostMallocUncached" "hip/hip_runtime_api.h" HIP_HOST_UNCACHED_MEMORY)

### Check for hipDeviceMallocContiguous support
check_symbol_exists("hipDeviceMallocContiguous" "hip/hip_runtime_api.h" HIP_CONTIGUOUS_MEMORY)
unset(CMAKE_REQUIRED_LIBRARIES)

### Check for indirect function call support
# IFC_ENABLED is ON only when requested via ENABLE_IFC and the HIP version is new enough.
if(ENABLE_IFC)
  if("${hip_version_string}" VERSION_GREATER_EQUAL "5.5.30201")
    set(IFC_ENABLED ON)
    message(STATUS "Indirect function call enabled")
  else()
    set(IFC_ENABLED OFF)
    message(WARNING "Indirect function call disabled - requires HIP version >= 5.5.30201")
  endif()
else()
  set(IFC_ENABLED OFF)
endif()

## Check for LL128 support
# LL128_ENABLED is only set (to ON) when the HIP version qualifies; otherwise it is left unset.
if("${hip_version_string}" VERSION_GREATER_EQUAL "6.1.33591")
  set(LL128_ENABLED ON)
  message(STATUS "RCCL LL128 protocol enabled")
else()
  message(STATUS "RCCL LL128 protocol disabled - requires HIP version >= 6.1.33591")
endif()

## Check for hsa-runtime64
find_package(hsa-runtime64 REQUIRED)
get_target_property(HSA_INCLUDE_PATH hsa-runtime64::hsa-runtime64 INTERFACE_INCLUDE_DIRECTORIES)
message(STATUS "HSA runtime: ${HSA_INCLUDE_PATH}")

## Check for amd-smi if ROCm 7.11.0 or newer
# On success this publishes SMI_INCLUDE_DIR / SMI_LIB_DIR / SMI_LIB_NAME / USE_AMDSMI as
# INTERNAL cache entries and sets SMI_LIBRARIES.
if(ROCM_VERSION VERSION_GREATER_EQUAL "71100")
  find_package(amd_smi PATHS ${ROCM_PATH}/lib/cmake/amd_smi)
  if(amd_smi_FOUND)
    message(STATUS "amd_smi_INCLUDE_DIR: ${amd_smi_INCLUDE_DIR}")
    message(STATUS "amd_smi_LIB_DIR: ${amd_smi_LIB_DIR}")
    set(SMI_INCLUDE_DIR "${amd_smi_INCLUDE_DIR}" CACHE INTERNAL "amd-smi include directory")
    set(SMI_LIB_DIR "${amd_smi_LIB_DIR}" CACHE INTERNAL "amd-smi library directory")
    set(SMI_LIB_NAME "amd-smi-lib" CACHE INTERNAL "amd-smi-lib for packaging")
    if(NOT EXISTS "${SMI_INCLUDE_DIR}" OR NOT EXISTS "${SMI_LIB_DIR}")
      message(FATAL_ERROR "amd_smi not found in ${SMI_INCLUDE_DIR}")
    endif()
    message(STATUS "Found amd_smi at ${SMI_INCLUDE_DIR}")
    set(SMI_LIBRARIES amd_smi)
    set(USE_AMDSMI ON CACHE INTERNAL "Use amd-smi instead of rocm-smi")
  endif()
endif()

if(NOT USE_AMDSMI)
  ## Fallback to rocm-smi if amd-smi not found or ROCm < 7.11.0
  message(WARNING "Could not find amd_smi. Falling back to rocm_smi.")
  find_package(rocm_smi PATHS ${ROCM_PATH}/lib/cmake/rocm_smi)
  if(rocm_smi_FOUND)
    set(SMI_INCLUDE_DIR "${rocm_smi_INCLUDE_DIR}" CACHE INTERNAL "rocm-smi include directory")
    set(SMI_LIB_DIR "${rocm_smi_LIB_DIR}" CACHE INTERNAL "rocm-smi library directory")
  else()
    message(WARNING "CMake could not find rocm-smi. Checking old include directory structure for rocm_smi")
    set(SMI_INCLUDE_DIR "${ROCM_PATH}/rocm_smi/include")
    set(SMI_LIB_DIR "${ROCM_PATH}/rocm_smi/lib")
  endif()
  if(NOT EXISTS "${SMI_INCLUDE_DIR}" OR NOT EXISTS "${SMI_LIB_DIR}")
    message(FATAL_ERROR "rocm_smi not found in ${SMI_INCLUDE_DIR}")
  endif()
  message(STATUS "Found rocm_smi at ${SMI_INCLUDE_DIR}")
  set(SMI_LIB_NAME "rocm-smi-lib" CACHE INTERNAL "rocm-smi-lib for packaging")
  set(SMI_LIBRARIES rocm_smi64)
  check_include_file_cxx("${SMI_INCLUDE_DIR}/rocm_smi/rocm_smi64Config.h" HAVE_ROCM_SMI64CONFIG)

  ### Check for RSMI_INIT_FLAG_THRAD_ONLY_MUTEX support
  # Plain text search of the header. NOTE(review): "THRAD" is the literal being searched
  # for; confirm against rocm_smi.h before "correcting" the spelling.
  file(READ "${SMI_INCLUDE_DIR}/rocm_smi/rocm_smi.h" rocm_smi_incl)
  string(FIND "${rocm_smi_incl}" "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX" matchres)
  if(${matchres} EQUAL -1)
    message(STATUS "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX not supported")
  else()
    message(STATUS "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX supported")
    set(HAVE_ROCM_SMI_THREAD_ONLY_MUTEX True)
  endif ()
endif()

## Check for BFD library if custom backtrace is requested
if(BUILD_BFD)
  enable_language(C)
  check_include_files(bfd.h HAVE_BFD)
  if (HAVE_BFD)
    message(STATUS "-- Found BFD support")

    ### Required for checking HIP device symbols when building with amdclang++
    set(CMAKE_REQUIRED_LIBRARIES hip::device)

    # Check for specific BFD feature support
    CHECK_SYMBOL_EXISTS(bfd_get_section_flags "bfd.h" HAVE_DECL_BFD_GET_SECTION_FLAGS)
    CHECK_SYMBOL_EXISTS(bfd_get_section_vma "bfd.h" HAVE_DECL_BFD_GET_SECTION_VMA)
    # Probe whether bfd_section_size() accepts two arguments (bfd*, asection*).
    # The #include needs its header name and its own line, otherwise the probe can
    # never compile and the result is always false.
    CHECK_CXX_SOURCE_COMPILES(
      "#include <bfd.h>

      int main (int argc, char **argv){
        bfd_size_type size;
        bfd abfd;
        asection sec;
        size = bfd_section_size(&abfd, &sec);
        return (int)(size);
      }"
      HAVE_TWO_ARG_BFD_SECTION_SIZE)
    unset(CMAKE_REQUIRED_LIBRARIES)

    # Check for iberty support
    find_library(HAVE_IBERTY iberty PATHS /usr/lib64 /usr/lib/ PATH_SUFFIXES x86_64-linux-gnu)
    if(HAVE_IBERTY)
      message(STATUS "iberty found @ ${HAVE_IBERTY}")
    endif()

    # Check for demangle support
    find_path(DEMANGLE_DIR demangle.h PATHS /usr/include PATH_SUFFIXES libiberty)
    if(NOT DEMANGLE_DIR)
      message(WARNING "Could not find demangle.h ${DEMANGLE_DIR}")
    else()
      message(STATUS "Found demangle.h in ${DEMANGLE_DIR}")
    endif()
  else()
    message(WARNING "bfd.h header not found - Disabling custom backtrace")
  endif()
endif()

# Check for --amdgpu-kernarg-preload-count
check_cxx_compiler_flag("-mllvm --amdgpu-kernarg-preload-count=16" HAVE_KERNARG_PRELOAD)
if (HAVE_KERNARG_PRELOAD)
  message(STATUS "Kernarg preloading to SGPR enabled")
endif()

check_cxx_compiler_flag("-parallel-jobs=12" HAVE_PARALLEL_JOBS)
if (HAVE_PARALLEL_JOBS)
  message(STATUS "Parallel jobs enabled")
endif()

## Disable building MSCCL++ if the build environment is invalid
## Currently MSCCL++ is supported only on gfx942 and gfx950, and only on Ubuntu and CentOS
set(MSCCLPP_SUPPORTED_ARCHS
  "gfx942" "gfx942:xnack-" "gfx942:xnack+"
  "gfx950" "gfx950:xnack-" "gfx950:xnack+")

# Check if any of the supported architectures are in GPU_TARGETS
# MSCCLPP_GPU_TARGETS collects the subset of GPU_TARGETS that MSCCL++ can be built for.
set(ARCH_MATCH_FOUND OFF)
set(MSCCLPP_GPU_TARGETS "")
foreach(ARCH IN LISTS GPU_TARGETS)
  if(ARCH IN_LIST MSCCLPP_SUPPORTED_ARCHS)
    set(ARCH_MATCH_FOUND ON)
    list(APPEND MSCCLPP_GPU_TARGETS "${ARCH}")
  endif()
endforeach()
set(MSCCLPP_GPU_TARGETS "${MSCCLPP_GPU_TARGETS}" CACHE STRING "GPU Targets supported by MSCCL++" FORCE)

if (ENABLE_MSCCLPP AND NOT ARCH_MATCH_FOUND)
  set(ENABLE_MSCCLPP OFF)
  message(WARNING "Can only build MSCCL++ for supported GPU_TARGETS: ${MSCCLPP_SUPPORTED_ARCHS}; current GPU_TARGETS: ${GPU_TARGETS}; so disabling MSCCL++ build")
endif()

# MSCCL++ is only supported on ROCm 6.2.0 or newer
if (ENABLE_MSCCLPP AND ROCM_VERSION VERSION_LESS "60200")
  set(ENABLE_MSCCLPP OFF)
  message(WARNING "MSCCL++ integration only supported on ROCm 6.2.0 or greater; disabling MSCCL++ build")
endif()

## Disable WARP_SPEED if the build environment is invalid
# ARCH_MATCH_FOUND is reset and reused here for the WARP_SPEED architecture list.
set(WARP_SPEED_SUPPORTED_ARCHS
  "gfx942" "gfx942:xnack-" "gfx942:xnack+"
  "gfx950" "gfx950:xnack-" "gfx950:xnack+")
set(ARCH_MATCH_FOUND OFF)
foreach(ARCH IN LISTS GPU_TARGETS)
  if(ARCH IN_LIST WARP_SPEED_SUPPORTED_ARCHS)
    set(ARCH_MATCH_FOUND ON)
  endif()
endforeach()
if (NOT ARCH_MATCH_FOUND)
  set(ENABLE_WARP_SPEED OFF)
  message(WARNING "Can only build WARP_SPEED for supported GPU_TARGETS: ${WARP_SPEED_SUPPORTED_ARCHS}; current GPU_TARGETS: ${GPU_TARGETS}; so disabling WARP_SPEED build")
endif()

# Detect the host distribution from the ID= / ID_LIKE= entries of /etc/os-release.
# Both results are empty when the file or the entry is missing.
# cmake_host_system_information(RESULT HOST_OS_ID QUERY DISTRIB_ID) ## Requires cmake 3.22
execute_process(
  COMMAND bash -c "grep '^ID=' /etc/os-release | cut -d'=' -f2 | cut -d'\"' -f2"
  OUTPUT_VARIABLE HOST_OS_ID
  OUTPUT_STRIP_TRAILING_WHITESPACE
)
execute_process(
  COMMAND bash -c "grep '^ID_LIKE=' /etc/os-release | cut -d'=' -f2 | cut -d'\"' -f2"
  OUTPUT_VARIABLE HOST_OS_FAMILY
  OUTPUT_STRIP_TRAILING_WHITESPACE
)

# HOST_OS_ID is quoted so that an empty value still forms a valid comparison and the
# detected ID is compared as a string instead of being re-evaluated as a variable name.
if (ENABLE_MSCCLPP AND NOT ("${HOST_OS_ID}" STREQUAL "ubuntu" OR "${HOST_OS_ID}" STREQUAL "centos"))
  set(ENABLE_MSCCLPP OFF)
  message(WARNING "MSCCL++ integration not supported on this OS (${HOST_OS_ID}); disabling MSCCL++ build")
endif()

# Check for ROCTX
# ROCTX_ENABLE is set only when both the roctx64 library and roctracer/roctx.h are found.
if(ROCTX)
  find_library(ROCTX_LIB NAMES roctx64)
  find_path(ROCTRACER_INCLUDE_DIR "roctracer/roctx.h")
  if(ROCTX_LIB AND ROCTRACER_INCLUDE_DIR)
    set(ROCTX_ENABLE ON)
    message(STATUS "ROCTX include directory found: ${ROCTRACER_INCLUDE_DIR}")
    message(STATUS "ROCTX library found: ${ROCTX_LIB}")
  else()
    message(WARNING "ROCTX library not found. Skipping ROCTX linking.")
  endif()
endif()

# Determine version from makefiles/version.mk and fill in templates
#==================================================================================================
## parse version from Makefile NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH must exist
## NCCL_SUFFIX is optional
## NCCL_VERSION formatting is ((X) * 1000 + (Y) * 100 + (Z)) so we must first detect one or two digits first
file(READ makefiles/version.mk version_mk_text)
if("${version_mk_text}" MATCHES "NCCL_MAJOR *:= *([0-9]*)")
  set(NCCL_MAJOR ${CMAKE_MATCH_1})
else()
  message(FATAL_ERROR "Failed to parse NCCL_MAJOR")
endif()
if("${version_mk_text}" MATCHES "NCCL_MINOR *:= *([0-9]*)")
  set(NCCL_MINOR ${CMAKE_MATCH_1})
else()
  message(FATAL_ERROR "Failed to parse NCCL_MINOR")
endif()
if("${version_mk_text}" MATCHES "NCCL_PATCH *:= *([0-9]*)")
  set(NCCL_PATCH ${CMAKE_MATCH_1})
else()
  message(FATAL_ERROR "Failed to parse NCCL_PATCH")
endif()
if("${version_mk_text}" MATCHES "NCCL_SUFFIX *:= *([0-9]*)")
  set(NCCL_SUFFIX ${CMAKE_MATCH_1})
else()
  set(NCCL_SUFFIX)
endif()
if("${version_mk_text}" MATCHES "PKG_REVISION *:= *([0-9]*)")
  set(PKG_REVISION ${CMAKE_MATCH_1})
else()
  message(FATAL_ERROR "Failed to parse PKG_REVISION")
endif()

# NCCL_VERSION is built by string concatenation; a single-digit patch is zero-padded
# to two digits (e.g. 2.27.7 -> "22707"). MAJOR and MINOR are concatenated unpadded.
if("${NCCL_PATCH}" MATCHES "[0-9][0-9]")
  set(NCCL_VERSION "${NCCL_MAJOR}${NCCL_MINOR}${NCCL_PATCH}")
else()
  set(NCCL_VERSION "${NCCL_MAJOR}${NCCL_MINOR}0${NCCL_PATCH}")
endif()

## Setup VERSION
set(VERSION_STRING "${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}")
rocm_setup_version(VERSION ${VERSION_STRING})

## Fill in version information for main header file
configure_file(src/nccl.h.in ${PROJECT_BINARY_DIR}/include/rccl/rccl.h) # For external linking
configure_file(src/nccl.h.in ${PROJECT_BINARY_DIR}/include/nccl.h)      # Used by some internal files

# Collect list of all source files
#==================================================================================================
# E.g: find src -type f \( -name "*.cc" -o -name "*.h" -o -name "*.hpp" \) |
sort set(SRC_FILES src/allocator.cc src/bootstrap.cc src/ce_coll.cc src/channel.cc src/collectives.cc src/commDump.cc src/debug.cc src/dev_runtime.cc src/enqueue.cc src/group.cc src/init.cc src/init_nvtx.cc src/mnnvl.cc src/msccl.cc src/proxy.cc src/rccl_wrap.cc src/sym_kernels.cc src/transport.cc src/device/all_gather.h src/device/all_reduce.h src/device/alltoall_pivot.h src/device/alltoall_gda.h src/device/broadcast.h src/device/common.h src/device/common_kernel.h src/device/op128.h src/device/primitives.h src/device/prims_ll128.h src/device/prims_ll.h src/device/prims_simple.h src/device/reduce.h src/device/reduce_kernel.h src/device/reduce_scatter.h src/device/rccl_metadata.h src/device/rccl_ptr.h src/device/sendrecv.h src/device/common.cu src/device/onerank.cu src/device/network/unpack/unpack_defs.h src/device/network/unpack/unpack.h src/device/symmetric/all_gather.cuh src/device/symmetric/all_reduce.cuh src/device/symmetric/kernel.cuh src/device/symmetric/primitives.cuh src/device/symmetric/reduce_scatter.cuh src/graph/connect.cc src/graph/paths.cc src/graph/rings.cc src/graph/rings.h src/graph/rome_models.cc src/graph/rome_models.h src/graph/search.cc src/graph/topo.cc src/graph/topo.h src/graph/trees.cc src/graph/tuning.cc src/graph/xml.cc src/graph/xml.h src/include/alloc.h src/include/allocator.h src/include/alt_rsmi.h src/include/archinfo.h src/include/api_trace.h src/include/argcheck.h src/include/BfdBacktrace.hpp src/include/bitops.h src/include/bootstrap.h src/include/ce_coll.h src/include/channel.h src/include/checks.h src/include/collectives.h src/include/coll_net.h src/include/comm.h src/include/core.h src/include/cpuset.h # src/include/cudawrap.h src/include/debug.h src/include/dev_runtime.h src/include/device.h src/include/enqueue.h src/include/gdrwrap.h src/include/git_version.h src/include/graph.h src/include/group.h src/include/hip_rocm_version_info.h src/include/ibvcore.h src/include/ibvsymbols.h src/include/ibvwrap.h src/include/info.h 
src/include/ipcsocket.h src/include/mnnvl.h src/include/nccl_common.h src/include/nccl_device.h src/include/net_device.h src/include/net.h src/include/nvmlwrap.h src/include/nvtx.h src/include/nvtx_payload_schemas.h src/include/nvtx_stub.h src/include/p2p.h src/include/param.h src/include/profiler.h src/include/proxy.h src/include/ras.h src/include/rccl_common.h src/include/rccl_vars.h src/include/register.h src/include/register_inline.h src/include/rccl_float8.h src/include/rocmwrap.h src/include/roctx.h src/include/recorder.h src/include/scheduler.h src/include/shm.h src/include/shmutils.h src/include/signals.h src/include/socket.h src/include/strongstream.h src/include/sym_kernels.h src/include/timer.h src/include/transport.h src/include/trees.h src/include/tuner.h src/include/utils.h src/include/mlx5/mlx5dvcore.h src/include/mlx5/mlx5dvsymbols.h src/include/mlx5/mlx5dvwrap.h src/include/ionic/ionicdvcore.h src/include/ionic/ionicdvsymbols.h src/include/ionic/ionicdvwrap.h src/include/msccl/msccl_lifecycle.h src/include/msccl/msccl_parser.h src/include/msccl/msccl_scheduler.h src/include/msccl/msccl_setup.h src/include/msccl/msccl_status.h src/include/msccl/msccl_struct.h src/include/nccl_device/comm.h src/include/nccl_device/coop.h src/include/nccl_device/core.h src/include/nccl_device/ll_a2a.h src/include/nccl_device/mem_barrier.h src/include/nccl_device/ptr.h src/include/nccl_device/utility.h src/include/nccl_device/impl/comm__funcs.h src/include/nccl_device/impl/comm__types.h src/include/nccl_device/impl/core__funcs.h src/include/nccl_device/impl/core__types.h src/include/nccl_device/impl/ll_a2a__funcs.h src/include/nccl_device/impl/ll_a2a__types.h src/include/nccl_device/impl/mem_barrier__funcs.h src/include/nccl_device/impl/mem_barrier__types.h src/include/nccl_device/impl/ptr__funcs.h src/include/nccl_device/impl/ptr__types.h src/include/npkit/npkit.h src/include/npkit/npkit_event.h src/include/npkit/npkit_struct.h src/include/nvtx3/nvToolsExt.h 
src/include/nvtx3/nvToolsExtCounters.h src/include/nvtx3/nvToolsExtCuda.h src/include/nvtx3/nvToolsExtCudaRt.h src/include/nvtx3/nvToolsExtMem.h src/include/nvtx3/nvToolsExtMemCudaRt.h src/include/nvtx3/nvToolsExtOpenCL.h src/include/nvtx3/nvToolsExtPayload.h src/include/nvtx3/nvToolsExtPayloadHelper.h src/include/nvtx3/nvToolsExtSemanticsCounters.h src/include/nvtx3/nvToolsExtSemanticsScope.h src/include/nvtx3/nvToolsExtSync.h src/include/nvtx3/nvtx3.hpp src/include/nvtx3/nvtxDetail/nvtxExtHelperMacros.h src/include/nvtx3/nvtxDetail/nvtxExtImpl.h src/include/nvtx3/nvtxDetail/nvtxExtImplCounters_v1.h src/include/nvtx3/nvtxDetail/nvtxExtImplMem_v1.h src/include/nvtx3/nvtxDetail/nvtxExtImplMemCudaRt_v1.h src/include/nvtx3/nvtxDetail/nvtxExtImplPayload_v1.h src/include/nvtx3/nvtxDetail/nvtxExtInit.h src/include/nvtx3/nvtxDetail/nvtxExtPayloadHelperInternal.h src/include/nvtx3/nvtxDetail/nvtxExtPayloadTypeInfo.h src/include/nvtx3/nvtxDetail/nvtxExtTypes.h src/include/nvtx3/nvtxDetail/nvtxImpl.h src/include/nvtx3/nvtxDetail/nvtxImplCore.h src/include/nvtx3/nvtxDetail/nvtxImplCuda_v3.h src/include/nvtx3/nvtxDetail/nvtxImplCudaRt_v3.h src/include/nvtx3/nvtxDetail/nvtxImplOpenCL_v3.h src/include/nvtx3/nvtxDetail/nvtxImplSync_v3.h src/include/nvtx3/nvtxDetail/nvtxInit.h src/include/nvtx3/nvtxDetail/nvtxInitDecls.h src/include/nvtx3/nvtxDetail/nvtxInitDefs.h src/include/nvtx3/nvtxDetail/nvtxLinkOnce.h src/include/nvtx3/nvtxDetail/nvtxTypes.h src/include/proxy_trace/proxy_trace.h src/include/plugin/nccl_net.h src/include/plugin/nccl_profiler.h src/include/plugin/nccl_tuner.h src/include/plugin/plugin.h src/include/plugin/net/net_v6.h src/include/plugin/net/net_v7.h src/include/plugin/net/net_v8.h src/include/plugin/net/net_v9.h src/include/plugin/net/net_v10.h src/include/plugin/net/net_v11.h src/include/plugin/profiler/net_ib_v1.h src/include/plugin/profiler/net_ib.h src/include/plugin/profiler/net_socket_v1.h src/include/plugin/profiler/net_socket.h 
src/include/plugin/profiler/profiler_v1.h src/include/plugin/profiler/profiler_v2.h src/include/plugin/profiler/profiler_v3.h src/include/plugin/profiler/profiler_v4.h src/include/plugin/profiler/profiler_v5.h src/include/plugin/tuner/tuner_v2.h src/include/plugin/tuner/tuner_v3.h src/include/plugin/tuner/tuner_v4.h src/include/plugin/tuner/tuner_v5.h src/misc/alt_rsmi.cc src/misc/archinfo.cc src/misc/argcheck.cc src/misc/api_trace.c src/misc/api_trace.cc # src/misc/cudawrap.cc # src/misc/gdrwrap.cc src/misc/ibvsymbols.cc src/misc/ibvwrap.cc src/misc/ipcsocket.cc src/misc/mlx5dvsymbols.cc src/misc/mlx5dvwrap.cc src/misc/ionicdvsymbols.cc src/misc/ionicdvwrap.cc src/misc/npkit.cc # src/misc/nvmlwrap.cc src/misc/nvmlwrap_stub.cc src/misc/param.cc src/misc/rocmwrap.cc src/misc/roctx.cc src/misc/recorder.cc src/misc/shmutils.cc src/misc/signals.cc src/misc/socket.cc src/misc/strongstream.cc src/misc/utils.cc src/misc/msccl/msccl_lifecycle.cc src/misc/msccl/msccl_parser.cc src/misc/msccl/msccl_setup.cc src/misc/msccl/msccl_status.cc src/misc/proxy_trace/proxy_trace.cc src/nccl_device/core.cc src/nccl_device/ll_a2a.cc src/nccl_device/mem_barrier.cc src/plugin/net.cc src/plugin/plugin_open.cc src/plugin/profiler.cc src/plugin/tuner.cc src/plugin/net/net_v6.cc src/plugin/net/net_v7.cc src/plugin/net/net_v8.cc src/plugin/net/net_v9.cc src/plugin/net/net_v10.cc src/plugin/net/net_v11.cc src/plugin/profiler/profiler_v1.cc src/plugin/profiler/profiler_v2.cc src/plugin/profiler/profiler_v3.cc src/plugin/profiler/profiler_v4.cc src/plugin/profiler/profiler_v5.cc src/plugin/tuner/tuner_v2.cc src/plugin/tuner/tuner_v3.cc src/plugin/tuner/tuner_v4.cc src/plugin/tuner/tuner_v5.cc src/ras/client.cc src/ras/client_support.cc src/ras/collectives.cc src/ras/peers.cc src/ras/ras.cc src/ras/ras_internal.h src/ras/rasnet.cc src/register/coll_reg.cc src/register/register.cc src/register/sendrecv_reg.cc src/scheduler/symmetric_sched.cc src/transport/coll_net.cc src/transport/generic.cc 
src/transport/net.cc
  src/transport/net_ib.cc
  src/transport/net_ib_rocm.cc
  src/transport/net_socket.cc
  src/transport/nvls.cc
  src/transport/p2p.cc
  src/transport/profiler.cc
  src/transport/shm.cc
  src/include/latency_profiler/CollTrace.h
  src/include/latency_profiler/CollTraceEvent.h
  src/include/latency_profiler/CollTraceFunc.h
  src/include/latency_profiler/CollTraceUtils.h
  src/include/latency_profiler/EventQueue.h
  src/misc/latency_profiler/CollTrace.cc
  src/misc/latency_profiler/CollTraceEvent.cc
  src/misc/latency_profiler/CollTraceFunc.cc
  src/misc/latency_profiler/CollTraceUtils.cc
)

## SMI wrapper sources: ROCm SMI by default, AMD SMI when USE_AMDSMI is set
if(NOT USE_AMDSMI)
  set(SMI_SOURCES
    src/include/rocm_smi_wrap.h
    src/misc/rocm_smi_wrap.cc)
else()
  set(SMI_SOURCES
    src/include/amdsmi_wrap.h
    src/misc/amdsmi_wrap.cc)
endif()
list(APPEND SRC_FILES ${SMI_SOURCES})

## Optional MSCCL kernel sources
if (ENABLE_MSCCL_KERNEL)
  set(MSCCL_KERNEL_SOURCES
    src/device/msccl_kernel_impl.h
    src/include/msccl/msccl_kernel.h)
  list(APPEND SRC_FILES ${MSCCL_KERNEL_SOURCES})
endif()

## Optional MSCCL++ sources
if (ENABLE_MSCCLPP)
  set(MSCCLPP_SOURCES
    src/include/mscclpp/mscclpp_nccl.h
    src/misc/mscclpp/mscclpp_nccl.cc)
  list(APPEND SRC_FILES ${MSCCLPP_SOURCES})
endif()

# Hipify source files (copy of source generated into hipify directory)
#==================================================================================================
find_program(hipify-perl_executable hipify-perl)
if(NOT hipify-perl_executable)
  message(FATAL_ERROR "hipify-perl not found")
endif()

set(HIPIFY_DIR "${CMAKE_CURRENT_BINARY_DIR}/hipify")

## One custom command per listed source: hipify it into ${HIPIFY_DIR}, then post-process it
foreach(SRC_FILE ${SRC_FILES})
  # Fail early on a stale entry in SRC_FILES
  if (NOT EXISTS ${CMAKE_SOURCE_DIR}/${SRC_FILE})
    message(FATAL_ERROR "Unable to find file listed in CMakeLists.txt: ${CMAKE_SOURCE_DIR}/${SRC_FILE}")
  endif()

  # Destination of the hipified copy (same relative path under HIPIFY_DIR)
  set(HIP_FILE "${HIPIFY_DIR}/${SRC_FILE}")
  get_filename_component(HIP_FILE_DIR ${HIP_FILE} DIRECTORY)

  # Make sure the file name is unique and there is no duplicate
  add_file_unique(HIP_SOURCES ${HIP_FILE})

  # Rename .cuh -> .h and .cu -> .cu.cpp so that the outputs get processed properly
  string(REPLACE "\.cuh" "\.h" HIP_FILE ${HIP_FILE})
  string(REPLACE "\.cu" "\.cu.cpp" HIP_FILE ${HIP_FILE})
  list(APPEND HIP_SOURCES ${HIP_FILE})

  # Extra shell step that is chained onto the command only for fault-injection builds
  set(_hipify_fault_step)
  if (FAULT_INJECTION)
    set(_hipify_fault_step
      && ${CMAKE_COMMAND} -E env bash ${CMAKE_CURRENT_SOURCE_DIR}/cmake/scripts/add_faults.sh ${HIP_FILE})
  endif()

  # hipify-perl -> add_unroll.sh [-> add_faults.sh], re-run whenever the original source changes
  add_custom_command(
    OUTPUT ${HIP_FILE}
    COMMAND mkdir -p ${HIP_FILE_DIR}
            && ${hipify-perl_executable} -quiet-warnings ${CMAKE_SOURCE_DIR}/${SRC_FILE} -o ${HIP_FILE}
            && ${CMAKE_COMMAND} -E env bash ${CMAKE_CURRENT_SOURCE_DIR}/cmake/scripts/add_unroll.sh ${HIP_FILE}
            ${_hipify_fault_step}
    MAIN_DEPENDENCY ${SRC_FILE}
    COMMENT "Hipifying ${SRC_FILE} -> ${HIP_FILE}"
  )
endforeach()

# Adding custom target to hipify all the source files
# This is required to make sure that all the hipified source files are
# available before compiling the unit tests executable(s)
add_custom_target(hipify_all DEPENDS ${HIP_SOURCES})

# Generate device/host tables and all the collective functions that are going to be in librccl.so
#==================================================================================================
find_package(Python3 COMPONENTS Interpreter REQUIRED)
if (NOT Python3_FOUND)
  message(FATAL_ERROR "RCCL requires Python3 for generating host/device tables")
endif()

set(GEN_DIR "${HIPIFY_DIR}/gensrc")
set(GEN_SYM_DIR "${GEN_DIR}/symmetric")

if(ONLY_FUNCS)
  message(WARNING "Using ONLY_FUNCS = ${ONLY_FUNCS}. Not meant for release builds.")
endif()

# Run the collective-function generator at configure time; any failure is fatal
execute_process(
  COMMAND ${Python3_EXECUTABLE} ${CMAKE_SOURCE_DIR}/src/device/generate.py
          ${GEN_DIR} ${IFC_ENABLED} ${COLLTRACE} ${ENABLE_MSCCL_KERNEL} ${BUILD_LOCAL_GPU_TARGET_ONLY} ${ONLY_FUNCS}
  WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
  RESULT_VARIABLE gen_py_result
  ERROR_VARIABLE gen_py_error
)
if (gen_py_result)
  message(SEND_ERROR "Error: ${gen_py_error}")
  message(FATAL_ERROR "${CMAKE_SOURCE_DIR}/src/device/generate.py failed")
endif()

if (GENERATE_SYM_KERNELS)
  # Run the symmetric memory kernel generator at configure time; any failure is fatal
  execute_process(
    COMMAND ${Python3_EXECUTABLE} ${CMAKE_SOURCE_DIR}/src/device/symmetric/generate.py ${GEN_SYM_DIR}
    WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
    RESULT_VARIABLE gen_sym_py_result
    ERROR_VARIABLE gen_sym_py_error
  )
  if (gen_sym_py_result)
    message(SEND_ERROR "Error: ${gen_sym_py_error}")
    message(FATAL_ERROR "${CMAKE_SOURCE_DIR}/src/device/symmetric/generate.py failed")
  endif()
endif()

# Everything found under GEN_DIR becomes part of the library sources
file(GLOB_RECURSE GENERATED_FILES "${GEN_DIR}/*")
list(APPEND HIP_SOURCES ${GENERATED_FILES})

# Create an initial git_version.cpp file (that will be updated with latest git version)
#==================================================================================================
# Placeholder written at configure time so the file exists before the first build
file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/git_version.cpp "")

# Target that runs on every build and regenerates git_version.cpp via git_version.cmake
add_custom_target(update_git_version ALL
  COMMAND ${CMAKE_COMMAND}
          -DRCCL_SOURCE_DIR=${CMAKE_CURRENT_SOURCE_DIR}
          -DRCCL_BINARY_DIR=${CMAKE_CURRENT_BINARY_DIR}
          -P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/scripts/git_version.cmake
  BYPRODUCTS ${CMAKE_CURRENT_BINARY_DIR}/git_version.cpp
  COMMENT "Updating git version information"
  VERBATIM
)
list(APPEND HIP_SOURCES
${CMAKE_CURRENT_BINARY_DIR}/git_version.cpp)

# Set up RCCL library
#==================================================================================================
## Library target built from the hipified, generated and git-version sources
add_library(rccl ${HIP_SOURCES})

## Dependencies
## git_version.cpp must be refreshed before rccl is compiled
add_dependencies(rccl update_git_version)

## Private include directories (order preserved)
target_include_directories(rccl PRIVATE
  ${PROJECT_BINARY_DIR}/include              # generated rccl.h header
  ${HIPIFY_DIR}/src                          # hipified headers
  ${HIPIFY_DIR}/src/device
  ${HIPIFY_DIR}/src/device/network/unpack
  ${HIPIFY_DIR}/src/include
  ${HIPIFY_DIR}/src/include/mlx5
  ${HIPIFY_DIR}/src/include/nccl_device
  ${HIPIFY_DIR}/src/include/ionic
  ${HIPIFY_DIR}/src/include/plugin
  ${HIPIFY_DIR}/gensrc
  ${HSA_INCLUDE_PATH}
  ${ROCM_SMI_INCLUDE_DIR}
  ${ROCMCORE_PATH}/include
)
if(DEMANGLE_DIR)
  target_include_directories(rccl PRIVATE ${DEMANGLE_DIR})
endif()
if(ROCTX_ENABLE)
  target_include_directories(rccl PRIVATE ${ROCTRACER_INCLUDE_DIR})
endif()

## Feature-dependent compile definitions
if(COLLTRACE)
  target_compile_definitions(rccl PRIVATE ENABLE_COLLTRACE)
endif()
if(ENABLE_MSCCL_KERNEL)
  message(WARNING "MSCCL is deprecated and will be removed in a future version of RCCL.")
  target_compile_definitions(rccl PRIVATE COMPILE_MSCCL_KERNEL)
endif()
if(ENABLE_MSCCLPP)
  target_compile_definitions(rccl PRIVATE ENABLE_MSCCLPP)
endif()
if(USE_AMDSMI)
  target_compile_definitions(rccl PRIVATE USE_AMDSMI)
else()
  # ROCm SMI specific capability macros
  if(HAVE_ROCM_SMI64CONFIG)
    target_compile_definitions(rccl PRIVATE USE_ROCM_SMI64CONFIG)
  endif()
  if(HAVE_ROCM_SMI_THREAD_ONLY_MUTEX)
    target_compile_definitions(rccl PRIVATE USE_ROCM_SMI_THREAD_ONLY_MUTEX)
  endif()
endif()
if(ENABLE_WARP_SPEED)
  target_compile_definitions(rccl PRIVATE ENABLE_WARP_SPEED)
endif()

# ==== rocSHMEM integration (optional) ====
if(ENABLE_ROCSHMEM)
  target_compile_definitions(rccl PRIVATE ENABLE_ROCSHMEM)

  add_rocshmem_targets()
  # Ensure rocSHMEM is fully built/installed before compiling rccl
  if (TARGET rocshmem_ext)
    add_dependencies(rccl rocshmem_ext)
  endif()
  if (ROCSHMEM_INCLUDE_DIR)
    target_include_directories(rccl PRIVATE ${ROCSHMEM_INCLUDE_DIR})
  endif()
  # Moved to where MSCCL target_links
  ## target_link_libraries(rccl PRIVATE ${ROCSHMEM_LIBRARY})
  target_link_libraries(rccl PRIVATE ${IBVERBS})
endif()

# NPKit flags
## May be better to move these to a separate file
if(ENABLE_NPKIT)
  message(WARNING "NPKit is deprecated and will be removed in a future version of RCCL. Please consider using alternative profiling tools.")

  # Master switch and time-sync events
  target_compile_definitions(rccl PRIVATE
    ENABLE_NPKIT
    ENABLE_NPKIT_EVENT_TIME_SYNC_GPU
    ENABLE_NPKIT_EVENT_TIME_SYNC_CPU
  )

  # Every event below contributes ENABLE_NPKIT_EVENT_<name>_ENTRY and ENABLE_NPKIT_EVENT_<name>_EXIT
  foreach(npkit_event IN ITEMS
      ALL_REDUCE_RING
      ALL_REDUCE_TREE_UPDOWN
      ALL_REDUCE_TREE_SPLIT
      COPY_SEND
      DIRECT_COPY_SEND
      DIRECT_RECV
      DIRECT_RECV_COPY_SEND
      DIRECT_RECV_REDUCE_COPY_SEND
      DIRECT_SEND
      DIRECT_SEND_FROM_OUTPUT
      RECV
      RECV_COPY_SEND
      RECV_REDUCE_COPY
      RECV_REDUCE_COPY_SEND
      RECV_REDUCE_SEND
      SEND
      SEND_FROM_OUTPUT
      PRIM_SIMPLE_WAIT_PEER
      PRIM_SIMPLE_REDUCE_OR_COPY_MULTI
      PRIM_LL_WAIT_SEND
      PRIM_LL_DATA_PROCESS
      PRIM_LL128_WAIT_SEND
      PRIM_LL128_DATA_PROCESS
      NET_SEND
      NET_TEST
      NET_RECV
      ALL_REDUCE_RING_SEND
      ALL_REDUCE_RING_RECV_REDUCE_SEND
      ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND
      ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND
      ALL_REDUCE_RING_DIRECT_RECV
      ALL_REDUCE_TREE_UPDOWN_REDUCE
      ALL_REDUCE_TREE_UPDOWN_BROADCAST
      ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST
      ALL_REDUCE_TREE_SPLIT_REDUCE
      ALL_REDUCE_TREE_SPLIT_BROADCAST
      SEND_RECV_LOCAL_COPY
      SEND_RECV_SEND
      SEND_RECV_RECV
      ALL_GATHER_RING
      ALL_GATHER_RING_SEND
      ALL_GATHER_RING_RECV_COPY_SEND
      ALL_GATHER_RING_DIRECT_RECV
      MSCCL_GENERIC_OP
      MSCCL_REDUCE
      MSCCL_SEND
      MSCCL_RECV
      MSCCL_RUN
      MSCCL_RECV_REDUCE_COPY
      MSCCL_INIT
      BROADCAST_RING
      REDUCE_SCATTER_RING
      REDUCE_SCATTER_RING_SEND
      REDUCE_SCATTER_RING_RECV_REDUCE_SEND
      REDUCE_SCATTER_RING_RECV_REDUCE_COPY)
    target_compile_definitions(rccl PRIVATE
      ENABLE_NPKIT_EVENT_${npkit_event}_ENTRY
      ENABLE_NPKIT_EVENT_${npkit_event}_EXIT)
  endforeach()

  target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)
endif()

if(PROFILE)
  target_compile_definitions(rccl PRIVATE ENABLE_PROFILING)
endif()
if(ROCTX_ENABLE) target_compile_definitions(rccl PRIVATE ROCTX_ENABLE) else() target_compile_definitions(rccl PRIVATE NVTX_NO_IMPL) target_compile_definitions(rccl PRIVATE NVTX_DISABLE) endif() if(TRACE) target_compile_definitions(rccl PRIVATE ENABLE_TRACE) endif() if(${HIP_CONTIGUOUS_MEMORY}) target_compile_definitions(rccl PRIVATE HIP_CONTIGUOUS_MEMORY) message(STATUS "HIP_CONTIGUOUS_MEMORY enabled") else() message(STATUS "HIP_CONTIGUOUS_MEMORY disabled") endif() if("${hip_version_string}" VERSION_GREATER_EQUAL "5.7.31920") target_compile_definitions(rccl PRIVATE HIP_UNCACHED_MEMORY) message(STATUS "HIP_UNCACHED_MEMORY enabled") else() message(STATUS "HIP_UNCACHED_MEMORY disabled - requires HIP version >= 5.7.31920") # keep --hipcc-func-supp on older HIP and compiler if(NOT IFC_ENABLED) target_compile_options(rccl PRIVATE --hipcc-func-supp) message(STATUS "--hipcc-func-supp enabled") else() message(STATUS "--hipcc-func-supp disabled") endif() endif() if (HIP_HOST_UNCACHED_MEMORY) target_compile_definitions(rccl PRIVATE HIP_HOST_UNCACHED_MEMORY) message(STATUS "HIP_HOST_UNCACHED_MEMORY enabled") else() message(STATUS "HIP_HOST_UNCACHED_MEMORY disabled") endif() if (BUILD_BFD) if (HAVE_BFD) target_compile_definitions(rccl PRIVATE HAVE_BFD) endif() if (HAVE_DECL_BFD_GET_SECTION_FLAGS) target_compile_definitions(rccl PRIVATE HAVE_DECL_BFD_GET_SECTION_FLAGS) endif() if (HAVE_DECL_BFD_GET_SECTION_VMA) target_compile_definitions(rccl PRIVATE HAVE_DECL_BFD_GET_SECTION_VMA) endif() if (HAVE_TWO_ARG_BFD_SECTION_SIZE) target_compile_definitions(rccl PRIVATE HAVE_TWO_ARG_BFD_SECTION_SIZE) endif() endif() if (IFC_ENABLED) target_compile_definitions(rccl PRIVATE USE_INDIRECT_FUNCTION_CALL) endif() if(DEMANGLE_DIR) target_compile_definitions(rccl PRIVATE "HAVE_CPLUS_DEMANGLE=1") target_compile_definitions(rccl PRIVATE "HAVE_DECL_BASENAME=1") endif() if(LL128_ENABLED) target_compile_definitions(rccl PRIVATE ENABLE_LL128) endif() ## Set RCCL compile options if 
(HAVE_PARALLEL_JOBS) target_compile_options(rccl PRIVATE -parallel-jobs=12) endif() if (ROCM_VERSION VERSION_GREATER_EQUAL "60200") target_compile_options(rccl PRIVATE --offload-compress) # Compress GPU code at compile time. target_link_libraries(rccl PRIVATE --offload-compress) # Compress GPU code at link time. message(STATUS "--offload-compress enabled - ROCm version >= 6.2.0") else() message(STATUS "--offload-compress disabled - ROCm version < 6.2.0") endif() target_compile_options(rccl PRIVATE -Werror=uninitialized) target_compile_options(rccl PRIVATE -Werror=sometimes-uninitialized) target_compile_options(rccl PRIVATE -Wall) target_compile_options(rccl PRIVATE -Werror=deprecated-copy-with-user-provided-copy) target_compile_options(rccl PRIVATE -Wno-format-nonliteral) target_compile_options(rccl PRIVATE -Wno-unused-function) target_compile_options(rccl PRIVATE -fgpu-rdc) if(QUIET_WARNINGS) target_compile_options(rccl PRIVATE -Wno-invalid-offsetof) target_compile_options(rccl PRIVATE -Wno-unused-result) target_compile_options(rccl PRIVATE -Wno-macro-redefined) target_compile_options(rccl PRIVATE -Wno-unused-label) target_compile_options(rccl PRIVATE -Wno-unused-variable) target_compile_options(rccl PRIVATE -Wno-unused-private-field) target_compile_options(rccl PRIVATE -Wno-null-conversion) target_compile_options(rccl PRIVATE -Wno-missing-braces) endif() ## Set RCCL compile and linker options for unit tests and code coverage if(ENABLE_CODE_COVERAGE) if(NOT CMAKE_BUILD_TYPE MATCHES "Debug") message(FATAL_ERROR "Code coverage is enabled, but the build type is '${CMAKE_BUILD_TYPE}'. " "Code coverage requires 'Debug' build types to expose internal symbols. 
" "Please set CMAKE_BUILD_TYPE to 'Debug' and reconfigure.") endif() message(STATUS "Code coverage is enabled with build type '${CMAKE_BUILD_TYPE}'.") target_compile_options(rccl PRIVATE -fvisibility=default -Xarch_host -fprofile-instr-generate -Xarch_host -fcoverage-mapping) set(COVERAGE_SHARED_LINKER_FLAGS -fprofile-generate -Wl,--enable-new-dtags,--build-id=sha1,--rpath,$ORIGIN ) set(COVERAGE_EXE_LINKER_FLAGS -fprofile-generate -Wl,--enable-new-dtags,--build-id=sha1,--rpath,$ORIGIN/../lib ) target_link_options(rccl PRIVATE ${COVERAGE_SHARED_LINKER_FLAGS}) target_link_options(rccl PRIVATE ${COVERAGE_EXE_LINKER_FLAGS}) elseif(BUILD_TESTS) # Enable default/hidden visibility based on build type and ROCM_VERSION if (ROCM_VERSION VERSION_GREATER_EQUAL "60400" AND CMAKE_BUILD_TYPE MATCHES "Debug") target_compile_options(rccl PRIVATE -fvisibility=default) else() target_compile_options(rccl PRIVATE -fvisibility=hidden) endif() else() # Enable hidden visibility for library without tests/code coverage enabled target_compile_options(rccl PRIVATE -fvisibility=hidden) endif() if (HAVE_KERNARG_PRELOAD) target_compile_options(rccl PRIVATE -mllvm --amdgpu-kernarg-preload-count=16) endif() if (REPORT_KERNEL_RESOURCE_USE) target_link_options(rccl PRIVATE -Rpass-analysis=kernel-resource-usage) endif() if (DUMP_ASM) # Save temporary files from kernel compilation message(STATUS "Disassembling librccl.so to asm") # Maintain symbols but without changing code. Keep additional data in dwarf section of binary. 
# (Continuation of the `if (DUMP_ASM)` block opened above.)
  target_compile_options(rccl PRIVATE -gline-tables-only)
  set(OBJ_DUMP ${ROCM_PATH}/llvm/bin/llvm-objdump)
  # First post-build step: run llvm-objdump with --offload-fatbin on the freshly linked library
  add_custom_command(TARGET rccl POST_BUILD
    COMMENT "Disassembling RCCL library"
    COMMAND /bin/bash -c "${OBJ_DUMP} --offload-fatbin librccl.so"
    VERBATIM
  )
  # Then one disassembly step per GPU architecture, written to librccl.<arch>.s
  foreach(GPUARCH ${GPU_TARGETS})
    add_custom_command(TARGET rccl POST_BUILD
      COMMENT "Disassembling RCCL library to dump assembly for ${GPUARCH}"
      COMMAND /bin/bash -c "${OBJ_DUMP} -d -l --source --symbolize-operands librccl.so.0.hipv4-amdgcn-amd-amdhsa--${GPUARCH} > librccl.${GPUARCH}.s"
      VERBATIM
    )
  endforeach()
endif()

## NOTE: This is currently being handled by rocm-cmake, however may need to be re-enabled in the future
#foreach(target ${GPU_TARGETS})
#  target_compile_options(rccl PRIVATE --offload-arch=${target})
#endforeach()

if(BUILD_ADDRESS_SANITIZER)
  target_compile_options(rccl PRIVATE -fsanitize=address -shared-libasan)
endif()
if(TIMETRACE)
  target_compile_options(rccl PRIVATE -ftime-trace)
endif()
if (FAULT_INJECTION)
  target_compile_definitions(rccl PRIVATE ENABLE_FAULT_INJECTION)
  message(STATUS "Fault injection enabled")
endif()

## Set RCCL linked library directories
target_link_directories(rccl PRIVATE ${SMI_LIB_DIR})

## rocprofiler-register support: offered (default ON) when ROCM_VERSION >= 60100,
## otherwise forced OFF in the cache.
if (ROCM_VERSION VERSION_GREATER_EQUAL "60100")
  option(RCCL_ROCPROFILER_REGISTER "Enable rocprofiler-register support" ON)
else()
  if(RCCL_ROCPROFILER_REGISTER)
    # The version quoted here must match the guard above (60100 == ROCm 6.1).
    # NOTE(review): the message previously said "< 6.2"; confirm 6.1 is the intended minimum.
    message(AUTHOR_WARNING "RCCL_ROCPROFILER_REGISTER is not valid option for ROCm < 6.1. Current ROCm version: ${ROCM_VERSION}")
  endif()
  set(RCCL_ROCPROFILER_REGISTER OFF CACHE BOOL "" FORCE)
endif()

if(RCCL_ROCPROFILER_REGISTER)
  find_package(rocprofiler-register REQUIRED)
  target_compile_definitions(rccl PRIVATE RCCL_ROCPROFILER_REGISTER=1)
  target_link_libraries(rccl PRIVATE rocprofiler-register::rocprofiler-register)
endif()

## Set RCCL linked libraries
if (HAVE_BFD)
  target_link_libraries(rccl PRIVATE bfd)
  if(HAVE_IBERTY)
    target_link_libraries(rccl PRIVATE iberty z)
  endif()
endif()
if (ROCTX_ENABLE)
  target_link_libraries(rccl PRIVATE ${ROCTX_LIB})
endif()
target_link_libraries(rccl PRIVATE -fgpu-rdc)      # Required when linking relocatable device code
target_link_libraries(rccl PRIVATE Threads::Threads)
target_link_libraries(rccl INTERFACE hip::host)
target_link_libraries(rccl PRIVATE hip::device)
target_link_libraries(rccl PRIVATE dl)
target_link_libraries(rccl PRIVATE ${SMI_LIBRARIES})
target_link_libraries(rccl PRIVATE fmt::fmt-header-only)
if(ENABLE_MSCCLPP)
  target_link_libraries(rccl PRIVATE mscclpp_nccl)
endif()
if(ENABLE_ROCSHMEM)
  target_link_libraries(rccl PRIVATE ${ROCSHMEM_LIBRARY})
  target_link_libraries(rccl PRIVATE ${IBVERBS})
endif()

## Set RCCL link options
## Find out available memory (result: memory_in_gb). Three sources are tried in order:
##   1. /sys/fs/cgroup/memory.max (bytes; non-numeric or missing content falls through)
##   2. the first number printed by `free` (KB)
##   3. CMake's AVAILABLE_PHYSICAL_MEMORY host query
## The probe output is stripped and the variable is quoted in if(): an unquoted empty
## expansion would leave if() with a dangling MATCHES operator.
execute_process(
  COMMAND bash "-c" "cat /sys/fs/cgroup/memory.max"
  OUTPUT_VARIABLE memory_max_string
  OUTPUT_STRIP_TRAILING_WHITESPACE
  ERROR_QUIET)
if ("${memory_max_string}" MATCHES "^[0-9]+")
  math(EXPR memory_in_gb "${memory_max_string} / (1024 * 1024 * 1024)")
else()
  execute_process(
    COMMAND bash "-c" "free | grep -o '[[:digit:]]*' | head -1"
    OUTPUT_VARIABLE memory_max_string
    OUTPUT_STRIP_TRAILING_WHITESPACE)
  ## memory_max_string holds the first number printed by `free`, in KB
  ## (presumably the `total` column of the `Mem:` row - TODO confirm against the procps layout in use)
  if ("${memory_max_string}" MATCHES "^[0-9]+")
    math(EXPR memory_in_gb "${memory_max_string} / (1024 * 1024)") ## KB to GB conversion
  else()
    cmake_host_system_information(RESULT memory_max_string QUERY AVAILABLE_PHYSICAL_MEMORY)
    math(EXPR memory_in_gb "${memory_max_string} / 1024")
  endif()
endif()

## Reserve 16GB for each linker job.
## Limit max number of linker jobs to 16
if (HAVE_PARALLEL_JOBS)
  # ceil(memory_in_gb / 16), clamped to the range [1, 16]
  math(EXPR num_linker_jobs "(${memory_in_gb} + 15) / 16")
  if (${num_linker_jobs} GREATER_EQUAL "16")
    set(num_linker_jobs "16")
  elseif (${num_linker_jobs} LESS "1")
    # memory_in_gb == 0 (less than 1GB detected) would otherwise request zero jobs
    set(num_linker_jobs "1")
  endif()
  message(STATUS "Use ${num_linker_jobs} jobs for linking")
  target_link_options(rccl PRIVATE -parallel-jobs=${num_linker_jobs})    # Use multiple threads to link
endif()

if(BUILD_ADDRESS_SANITIZER)
  target_link_options(rccl PRIVATE -fuse-ld=lld)
endif()
if(TIMETRACE)
  target_link_options(rccl PRIVATE -ftime-trace)
endif()
if(NOT BUILD_SHARED_LIBS)
  message(STATUS "Building static RCCL library")
else()
  message(STATUS "Building shared RCCL library")
endif()
if (HAVE_KERNARG_PRELOAD)
  target_link_options(rccl PRIVATE "SHELL:-Xoffload-linker -mllvm=-amdgpu-kernarg-preload-count=16")
endif()
if(ENABLE_MSCCLPP)
  include(cmake/MSCCLPP.cmake)
endif()

## Track linking time
set_property(TARGET rccl PROPERTY RULE_LAUNCH_LINK "${CMAKE_COMMAND} -E time")

## Setup librccl.so version
rocm_set_soversion(rccl "1.0")

if(NOT BUILD_SHARED_LIBS)
  # To create a static lib with `-fgpu-rdc`, you need `--emit-static-lib` and `--hip-link`.
  # You also need to invoke amdclang++ again to trigger GPU code generation.
  set(static_link_flags
    ${CXXFLAGS}
    --hip-link
    -fgpu-rdc
    --emit-static-lib
  )
  # Find all the libraries we need to link at link time to include them in the clang link
  # command line.
  get_target_property(rccl_libs rccl LINK_LIBRARIES)
  foreach(target ${rccl_libs})
    if(TARGET ${target})
      get_target_property(location ${target} LOCATION)
      if(location)
        LIST(APPEND static_link_flags -l${location})
      endif()
    endif()
  endforeach()
  foreach(target ${GPU_TARGETS})
    list(APPEND static_link_flags --offload-arch=${target})
  endforeach()
  list(JOIN static_link_flags " " flags_str)
  # Invoking amdclang++ this way will produce a static archive, so just override ARCHIVE_CREATE.
  # The rule needs the <CMAKE_CXX_COMPILER>, <TARGET> and <OBJECTS> placeholders, which CMake
  # substitutes when it expands the archive rule; without them the rule has no program to run,
  # no output name and no inputs.
  set(CMAKE_CXX_ARCHIVE_CREATE "<CMAKE_CXX_COMPILER> ${flags_str} -o <TARGET> <OBJECTS>")
endif()

# Install settings
#==================================================================================================
## Specify install targets
rocm_install_targets(TARGETS rccl)
rocm_install(FILES ${PROJECT_BINARY_DIR}/include/rccl/rccl.h src/include/plugin/nccl_net.h
             DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rccl)
rocm_install(FILES src/include/api_trace.h
             DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rccl/amd_detail)

# Algorithm files are first copied into the build tree, then installed from there
file(COPY tools/msccl-algorithms DESTINATION ${PROJECT_BINARY_DIR})
file(COPY tools/msccl-unit-test-algorithms DESTINATION ${PROJECT_BINARY_DIR})
## Install Algorithm files under share folder
rocm_install(DIRECTORY ${PROJECT_BINARY_DIR}/msccl-algorithms DESTINATION ${CMAKE_INSTALL_DATADIR}/rccl)
rocm_install(DIRECTORY ${PROJECT_BINARY_DIR}/msccl-unit-test-algorithms DESTINATION ${CMAKE_INSTALL_DATADIR}/rccl)

rocm_export_targets(
  NAMESPACE roc::
  TARGETS rccl
  DEPENDS hip)

## Set package dependencies
if(BUILD_ADDRESS_SANITIZER)
  set(DEPENDS_HIP_RUNTIME "hip-runtime-amd-asan" )
else()
  set(DEPENDS_HIP_RUNTIME "hip-runtime-amd" )
endif()
rocm_package_add_dependencies(DEPENDS "${DEPENDS_HIP_RUNTIME} >= 4.5.0" "${SMI_LIB_NAME}")

set(CPACK_DEB_COMPONENT_INSTALL ON)
set(CPACK_DEBIAN_PACKAGE_SHLIBDEPS ON)
set(CPACK_RPM_COMPONENT_INSTALL ON)
set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt" "${ROCM_PATH}")

# DEBIAN is set when /etc contains debian_version or debconf.conf
find_file (DEBIAN debian_version debconf.conf PATHS /etc)
if(DEBIAN)
  # Write copyright file
  file(WRITE "${CMAKE_BINARY_DIR}/copyright" "Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ Upstream-Name: rccl Source: https://github.com/ROCm/rccl Files: * Copyright: (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. Modifications Copyright (c) 2020-2023 Advanced Micro Devices, Inc. All rights reserved. Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
License: See LICENSE.txt for license information\n")
  rocm_install(FILES "${CMAKE_BINARY_DIR}/copyright" DESTINATION ${CMAKE_INSTALL_DATADIR}/rccl)

  # Write changelog file (timestamp comes from `date -R`)
  find_program( date_executable date )
  execute_process(COMMAND ${date_executable} -R OUTPUT_VARIABLE TIMESTAMP)
  file(WRITE "${CMAKE_BINARY_DIR}/changelog" "rccl (${VERSION_STRING}-1) unstable; urgency=medium * Initial release. -- RCCL Maintainer ${TIMESTAMP}\n")

  # Compress the changelog into changelog.Debian.gz and install it next to the copyright file
  find_program( gzip_executable gzip )
  execute_process(COMMAND bash "-c" "${gzip_executable} -9 -c -n ${CMAKE_BINARY_DIR}/changelog"
                  WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
                  OUTPUT_FILE "${CMAKE_BINARY_DIR}/changelog.Debian.gz")
  rocm_install(FILES "${CMAKE_BINARY_DIR}/changelog.Debian.gz" DESTINATION ${CMAKE_INSTALL_DATADIR}/rccl)

  set(CPACK_DEBIAN_PACKAGE_DESCRIPTION "ROCm Communication Collectives Library Optimized primitives for collective multi-GPU communication")
endif()

## Building RCCL RAS
include(cmake/rcclRAS.cmake)

if(BUILD_TESTS)
  rocm_package_setup_component(clients)
  rocm_package_setup_client_component(tests PACKAGE_NAME unittests)
  add_subdirectory(test)
  if(BUILD_SHARED_LIBS)
    # Post-build hook that runs cmake/scripts/extract_metadata.cmake in script mode.
    # (A duplicated COMMAND keyword was removed here; exactly one command is registered.)
    add_custom_command(TARGET rccl POST_BUILD
      COMMENT "Extracting metadata from librccl.so"
      COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/scripts/extract_metadata.cmake
      VERBATIM
    )
  endif()
endif()

rocm_create_package(
  NAME rccl
  DESCRIPTION "ROCm Communication Collectives Library"
  MAINTAINER "RCCL Maintainer "
  LDCONFIG)
================================================ FILE: CppCheckSuppressions.txt ================================================ arrayIndexThenCheck:src/bootstrap.cc:304 arrayIndexThenCheck:src/debug.cc:88 arrayIndexThenCheck:src/graph/search.cc:844 arrayIndexThenCheck:src/graph/search.cc:916 arrayIndexThenCheck:src/graph/search.cc:927 clarifyCalculation:src/graph/topo.cc:702 clarifyCalculation:src/graph/topo.cc:720 clarifyCondition:src/enqueue.cc:416 funcArgNamesDifferent:src/graph/topo.cc:135
funcArgNamesDifferent:src/graph/topo.h:144 nullPointerRedundantCheck:src/misc/utils.cc:102 nullPointerRedundantCheck:src/misc/utils.cc:109 nullPointerRedundantCheck:src/proxy.cc:143 nullPointerRedundantCheck:src/proxy.cc:144 nullPointerRedundantCheck:src/proxy.cc:147 nullPointerRedundantCheck:src/proxy.cc:148 nullPointerRedundantCheck:src/proxy.cc:149 nullPointerRedundantCheck:src/proxy.cc:150 nullPointerRedundantCheck:src/proxy.cc:151 nullPointerRedundantCheck:src/proxy.cc:155 nullPointerRedundantCheck:src/proxy.cc:159 nullPointerRedundantCheck:src/proxy.cc:160 nullPointerRedundantCheck:src/proxy.cc:161 nullPointerRedundantCheck:src/proxy.cc:163 nullPointerRedundantCheck:src/proxy.cc:165 nullPointerRedundantCheck:src/proxy.cc:167 nullPointerRedundantCheck:src/proxy.cc:168 nullPointerRedundantCheck:src/proxy.cc:340 nullPointerRedundantCheck:src/proxy.cc:342 nullPointerRedundantCheck:src/proxy.cc:93 nullPointerRedundantCheck:src/proxy.cc:94 redundantAssignment:src/proxy.cc:161 redundantAssignment:src/proxy.cc:163 redundantCopy:src/graph/rings.cc:16 redundantCopy:src/graph/rings.cc:17 terminateStrncpy:src/misc/utils.cc:99 terminateStrncpy:src/transport/net_socket.cc:245 unreachableCode:src/transport/net.cc:555 unreadVariable:src/graph/tuning.cc:109 unreadVariable:src/graph/tuning.cc:110 unreadVariable:src/graph/tuning.cc:113 unusedFunction:src/graph/topo.cc:37 unusedFunction:src/graph/topo.cc:836 unusedFunction:src/misc/gdrwrap.cc:109 unusedFunction:src/misc/gdrwrap.cc:117 unusedFunction:src/misc/gdrwrap.cc:130 unusedFunction:src/misc/gdrwrap.cc:144 unusedFunction:src/misc/gdrwrap.cc:158 unusedFunction:src/misc/gdrwrap.cc:172 unusedFunction:src/misc/gdrwrap.cc:186 unusedFunction:src/misc/gdrwrap.cc:200 unusedFunction:src/misc/gdrwrap.cc:209 unusedFunction:src/misc/gdrwrap.cc:218 unusedFunction:src/misc/gdrwrap.cc:232 unusedFunction:src/misc/gdrwrap.cc:52 unusedFunction:src/misc/ibvwrap.cc:203 unusedFunction:src/misc/ibvwrap.cc:239 
unusedFunction:src/misc/ibvwrap.cc:255 unusedFunction:src/misc/nvmlwrap.cc:112 unusedFunction:src/misc/nvmlwrap_stub.cc:31 unusedFunction:src/misc/nvmlwrap_stub.cc:35 unusedFunction:src/transport.cc:71 unusedLabel:src/bootstrap.cc:349 unusedLabel:src/clique/ShmObject.h:112 unusedLabel:src/clique/ShmObject.h:204 unusedLabel:src/enqueue.cc:108 unusedLabel:src/enqueue.cc:1093 unusedLabel:src/enqueue.cc:989 unusedLabel:src/init.cc:1189 unusedLabel:src/init.cc:1240 unusedLabel:src/init.cc:1267 unusedLabel:src/transport.cc:238 unusedStructMember:src/graph/xml.cc:410 unusedStructMember:src/graph/xml.cc:411 unusedStructMember:src/graph/xml.cc:412 unusedStructMember:src/graph/xml.cc:428 unusedStructMember:src/graph/xml.cc:431 unusedStructMember:src/graph/xml.cc:432 unusedStructMember:src/graph/xml.cc:435 unusedStructMember:src/graph/xml.cc:437 variableScope:src/graph/search.cc:494 variableScope:src/init.cc:240 variableScope:src/transport/net_ib.cc:117 variableScope:src/transport/net_socket.cc:431 ================================================ FILE: LICENSE.txt ================================================ Attributions Contains contributions from NVIDIA. Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. Modifications Copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved. Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National Laboratory, the U.S. 
Department of Energy, nor the names of their contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. The U.S. Department of Energy funded the development of this software under subcontract 7078610 with Lawrence Berkeley National Laboratory. This code also includes files from the NVIDIA Tools Extension SDK project. See: https://github.com/NVIDIA/NVTX for more information and license details. ================================================ FILE: Makefile ================================================ # # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
#
# See LICENSE.txt for license information
#
.PHONY : all clean

# `default` is the first real target, so a bare `make` builds src
default : src.build
install : src.install

# Build output directory; overridable from the command line or environment
BUILDDIR ?= $(abspath ./build)
ABSBUILDDIR := $(abspath $(BUILDDIR))
TARGETS := src pkg

# clean fans out to <target>.clean for every entry in TARGETS
clean: $(addsuffix .clean,$(TARGETS))

test.build: src.build

# License files are copied into BUILDDIR by the pattern rule below
LICENSE_FILES := LICENSE.txt
LICENSE_TARGETS := $(addprefix $(BUILDDIR)/,$(LICENSE_FILES))
lic: $(LICENSE_TARGETS)

$(BUILDDIR)/%.txt: %.txt
	@printf "Copying %-35s > %s\n" $< $@
	mkdir -p $(BUILDDIR)
	cp $< $@

# src.<goal> / pkg.<goal> forward <goal> to the sub-make in src/ or pkg/
src.%:
	$(MAKE) -C src $* BUILDDIR=$(ABSBUILDDIR)

pkg.%:
	$(MAKE) -C pkg $* BUILDDIR=$(ABSBUILDDIR)

# Packaging preparation needs the license files in place first
pkg.debian.prep: lic
pkg.txz.prep: lic
================================================ FILE: NOTICES.txt ================================================ Notices and Licenses file _______________________________________________________________ Dependencies on nvidia-nccl v2.27.3-1 (BSD3) Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved. Modifications Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved. Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National Laboratory, the U.S. Department of Energy, nor the names of their contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. The U.S. Department of Energy funded the development of this software under subcontract 7078610 with Lawrence Berkeley National Laboratory. This code also includes files from the NVIDIA Tools Extension SDK project. See: https://github.com/NVIDIA/NVTX for more information and license details. _______________________________________________________________ Dependencies on NPKit (MIT License) Copyright (c) Microsoft Corporation. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE _______________________________________________________________ Dependencies on MSCCL++ (MIT License) Copyright (c) Microsoft Corporation. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE See: https://github.com/microsoft/mscclpp for more information and license details. _______________________________________________________________ Dependencies on Latency Profiler (MIT License) Copyright (c) Meta Platforms, Inc. and affiliates. 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. See: src/include/latency_profiler src/misc/latency_profiler ================================================ FILE: README.md ================================================ # RCCL > [!CAUTION] > The rccl repository is retired, please use the [ROCm/rocm-systems](https://github.com/ROCm/rocm-systems) repository ROCm Communication Collectives Library [![RCCL](https://dev.azure.com/ROCm-CI/ROCm-CI/_apis/build/status%2Frccl?repoName=ROCm%2Frccl&branchName=develop)](https://dev.azure.com/ROCm-CI/ROCm-CI/_build/latest?definitionId=107&repoName=ROCm%2Frccl&branchName=develop) [![TheRock CI](https://github.com/ROCm/rccl/actions/workflows/therock-ci.yml/badge.svg?branch=develop&event=push)](https://github.com/ROCm/rccl/actions/workflows/therock-ci.yml) > **Note:** The published documentation is available at [RCCL](https://rocm.docs.amd.com/projects/rccl/en/latest/index.html) in an organized easy-to-read format that includes a table of contents and search functionality. 
The documentation source files reside in the [rccl/docs](https://github.com/ROCm/rccl/tree/develop/docs) folder in this repository. As with all ROCm projects, the documentation is open source. For more information, see [Contribute to ROCm documentation](https://rocm.docs.amd.com/en/latest/contribute/contributing.html). ## Introduction RCCL (pronounced "Rickle") is a stand-alone library of standard collective communication routines for GPUs, implementing all-reduce, all-gather, reduce, broadcast, reduce-scatter, gather, scatter, and all-to-all. There is also initial support for direct GPU-to-GPU send and receive operations. It has been optimized to achieve high bandwidth on platforms using PCIe, xGMI as well as networking using InfiniBand Verbs or TCP/IP sockets. RCCL supports an arbitrary number of GPUs installed in a single node or multiple nodes, and can be used in either single- or multi-process (e.g., MPI) applications. The collective operations are implemented using ring and tree algorithms and have been optimized for throughput and latency. For best performance, small operations can be either batched into larger operations or aggregated through the API. ## Requirements 1. ROCm supported GPUs 2. ROCm stack installed on the system (HIP runtime & HIP-Clang) ## Quickstart RCCL Build RCCL directly depends on HIP runtime plus the HIP-Clang compiler, which are part of the ROCm software stack. For ROCm installation instructions, see https://github.com/ROCm/ROCm. The root of this repository has a helper script `install.sh` to build and install RCCL with a single command. It hard-codes configurations that can be specified through invoking cmake directly, but it's a great way to get started quickly and can serve as an example of how to build/install RCCL. 
### To build the library using the install script: ```shell ./install.sh ``` For more info on build options/flags when using the install script, use `./install.sh --help` ```shell ./install.sh --help RCCL build & installation helper script Options: --address-sanitizer Build with address sanitizer enabled -c|--enable-code-coverage Enable code coverage -d|--dependencies Install RCCL dependencies --debug Build debug library --enable_backtrace Build with custom backtrace support --disable-colltrace Build without collective trace --enable-msccl-kernel Build with MSCCL kernels --enable-mscclpp Build with MSCCL++ support --enable-mscclpp-clip Build MSCCL++ with clip wrapper on bfloat16 and half addition routines --disable-roctx Build without ROCTX logging -f|--fast Quick-build RCCL (local gpu arch only, no backtrace, and collective trace support) -h|--help Prints this help message -i|--install Install RCCL library (see --prefix argument below) -j|--jobs Specify how many parallel compilation jobs to run ($nproc by default) -l|--local_gpu_only Only compile for local GPU architecture --amdgpu_targets Only compile for specified GPU architecture(s). For multiple targets, separate by ';' (builds for all supported GPU architectures by default) --no_clean Don't delete files if they already exist --npkit-enable Compile with npkit enabled --log-trace Build with log trace enabled (i.e. 
NCCL_DEBUG=TRACE) --openmp-test-enable Enable OpenMP in rccl unit tests -p|--package_build Build RCCL package --prefix Specify custom directory to install RCCL to (default: `/opt/rocm`) --run_tests_all Run all rccl unit tests (must be built already) -r|--run_tests_quick Run small subset of rccl unit tests (must be built already) --static Build RCCL as a static library instead of shared library -t|--tests_build Build rccl unit tests, but do not run --time-trace Plot the build time of RCCL (requires `ninja-build` package installed on the system) --verbose Show compile commands ``` By default, RCCL builds for all GPU targets defined in `DEFAULT_GPUS` in `CMakeLists.txt`. To target specific GPU(s), and potentially reduce build time, use `--amdgpu_targets` as a `;` separated string listing GPU(s) to target. ## Manual build ### To build the library using CMake: ```shell $ git clone --recursive https://github.com/ROCm/rccl.git $ cd rccl $ mkdir build $ cd build $ cmake .. $ make -j 16 # Or some other suitable number of parallel jobs ``` If you have already cloned, you can checkout the external submodules manually. ```shell $ git submodule update --init --recursive --depth=1 ``` You may substitute an installation path of your own choosing by passing `CMAKE_INSTALL_PREFIX`. For example: ```shell $ cmake -DCMAKE_INSTALL_PREFIX=$PWD/rccl-install -DCMAKE_BUILD_TYPE=Release .. ``` Note: ensure rocm-cmake is installed, `apt install rocm-cmake`. ### To build the RCCL package and install package : Assuming you have already cloned this repository and built the library as shown in the previous section: ```shell $ cd rccl/build $ make package $ sudo dpkg -i *.deb ``` RCCL package install requires sudo/root access because it installs under `/opt/rocm/`. This is an optional step as RCCL can instead be used directly by including the path containing `librccl.so`. 
## Docker build Refer to [docker/README.md](docker/README.md "docker/README.md") ## Tests There are rccl unit tests implemented with the Googletest framework in RCCL. The rccl unit tests require Googletest 1.11 or higher to build and execute properly (installed with the -d option to install.sh). To invoke the rccl unit tests, go to the build folder, then the test subfolder, and execute the appropriate rccl unit test executable(s). rccl unit test names are now of the format: CollectiveCall.[Type of test] Filtering of rccl unit tests can be done with environment variables and by passing the `--gtest_filter` command line flag, for example: ```shell UT_DATATYPES=ncclBfloat16 UT_REDOPS=prod ./rccl-UnitTests --gtest_filter="AllReduce.C*" ``` will run only AllReduce correctness tests with the bfloat16 datatype and the prod reduction operation. A list of available filtering environment variables appears at the top of every run. See "Running a Subset of the Tests" at https://google.github.io/googletest/advanced.html#running-a-subset-of-the-tests for more information on how to form more advanced filters. There are also other performance and error-checking tests for RCCL. These are maintained separately at https://github.com/ROCm/rccl-tests. See the rccl-tests README for more information on how to build and run those tests. ## Library and API Documentation Please refer to the [RCCL Documentation Site](https://rocm.docs.amd.com/projects/rccl/en/latest/) for current documentation. ### How to build documentation Run the steps below to build documentation locally. ```shell cd docs pip3 install -r sphinx/requirements.txt python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html ``` ## Copyright All source code and accompanying documentation is copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved. All modifications are copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved.
================================================ FILE: cmake/CheckSymbolExistsNoWarn.cmake ================================================ # MIT License # # Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # These overrides are due to CMake CHECK_SYMBOL_EXISTS modifying CMAKE_CXX_FLAGS to do a test compile, # while ROCMChecks gives a warning if this variable is modified manually without a target. # We now choose to disable ROCMChecks for this one case. 
# Switch consulted by the rocm_check_toolchain_var override below; checks are active by default.
set(DISABLE_ROCM_CHECK OFF)

# Override of rocm_check_toolchain_var: does nothing while DISABLE_ROCM_CHECK is ON,
# otherwise forwards all four arguments unchanged to _rocm_check_toolchain_var
# (presumably the ROCMChecks implementation, reachable under the underscore-prefixed
# name once it has been overridden - verify against the CMake version in use).
function(rocm_check_toolchain_var var access value list_file)
  if(DISABLE_ROCM_CHECK)
    return()
  endif()
  _rocm_check_toolchain_var("${var}" "${access}" "${value}" "${list_file}")
endfunction()

# Override of CHECK_SYMBOL_EXISTS: turns the toolchain-variable check off for the
# duration of the wrapped _check_symbol_exists call, then turns it back on.
macro(CHECK_SYMBOL_EXISTS)
  set(DISABLE_ROCM_CHECK ON)
  _check_symbol_exists(${ARGN})
  set(DISABLE_ROCM_CHECK OFF)
endmacro()
================================================ FILE: cmake/Dependencies.cmake ================================================
# MIT License
#
# Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# Dependencies

# HIP dependency is handled earlier in the project cmake file
# when VerifyCompiler.cmake is included.
# GIT

# Test dependencies

# For downloading, building, and installing required dependencies
include(cmake/DownloadProject.cmake)
include(FetchContent)

# ---------------------------------------------------------------------------
# GoogleTest / GoogleMock
# ---------------------------------------------------------------------------
# Use an installed GTest (>= 1.11) unless INSTALL_DEPENDENCIES asks for a
# private build.
if(NOT INSTALL_DEPENDENCIES)
  find_package(GTest 1.11)
endif()

# The parentheses only spell out the grouping CMake already applies
# (NOT first, then AND/OR left to right).
if((NOT GTest_FOUND AND BUILD_TESTS) OR INSTALL_DEPENDENCIES)
  if(CMAKE_CXX_COMPILER MATCHES ".*/hipcc$")
    # hip-clang cannot compile googlebenchmark for some reason
    set(COMPILER_OVERRIDE "-DCMAKE_CXX_COMPILER=g++")
  endif()
  # unset(GTEST_INCLUDE_DIR CACHE)
  # unset(GTEST_INCLUDE_DIRS CACHE)

  message(STATUS "GTest not found. Downloading and building GTest.")
  # Download, build and install googletest library
  set(GTEST_ROOT ${CMAKE_CURRENT_BINARY_DIR}/gtest CACHE PATH "")
  # The install prefix has to be the same directory as INSTALL_DIR, otherwise
  # the lib/lib64 probing below can never find the installed archives.
  download_project(PROJ                googletest
                   GIT_REPOSITORY      https://github.com/google/googletest.git
                   GIT_TAG             release-1.12.0
                   INSTALL_DIR         ${GTEST_ROOT}
                   CMAKE_ARGS          -DBUILD_GTEST=ON -DCMAKE_INSTALL_PREFIX=${GTEST_ROOT} ${COMPILER_OVERRIDE} -DBUILD_SHARED_LIBS=OFF
                   LOG_DOWNLOAD        TRUE
                   LOG_CONFIGURE       TRUE
                   LOG_BUILD           TRUE
                   LOG_INSTALL         TRUE
                   UPDATE_DISCONNECTED TRUE
  )

  set(GTEST_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/gtest/include CACHE PATH "")
  # googletest installs the gmock headers under the same prefix as gtest, so
  # both include variables point at the gtest install tree.
  set(GMOCK_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/gtest/include CACHE PATH "")

  # The archives land in lib/ or lib64/ depending on the platform.
  if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/gtest/lib)
    set(GTEST_BOTH_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/gtest/lib/libgtest.a;${CMAKE_CURRENT_BINARY_DIR}/gtest/lib/libgtest_main.a CACHE PATH "")
    set(GMOCK_BOTH_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/gtest/lib/libgmock.a;${CMAKE_CURRENT_BINARY_DIR}/gtest/lib/libgmock_main.a CACHE PATH "")
  elseif(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/gtest/lib64)
    set(GTEST_BOTH_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/gtest/lib64/libgtest.a;${CMAKE_CURRENT_BINARY_DIR}/gtest/lib64/libgtest_main.a CACHE PATH "")
    set(GMOCK_BOTH_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/gtest/lib64/libgmock.a;${CMAKE_CURRENT_BINARY_DIR}/gtest/lib64/libgmock_main.a CACHE PATH "")
  else()
    # FATAL_ERROR stops processing, so the find_package(GTest/GMock) calls that
    # used to follow this message could never run and have been dropped.
    message(FATAL_ERROR "Cannot find gtest library installation path.")
  endif()
elseif(GTest_FOUND AND BUILD_TESTS)
  set(GTEST_BOTH_LIBRARIES "GTest::gtest;GTest::gtest_main")
  set(GMOCK_BOTH_LIBRARIES "GTest::gmock;GTest::gmock_main")
endif()

# ---------------------------------------------------------------------------
# Find or download/install rocm-cmake project
# ---------------------------------------------------------------------------
set( PROJECT_EXTERN_DIR ${CMAKE_CURRENT_BINARY_DIR}/extern )
find_package(ROCM 0.7.3 QUIET CONFIG PATHS /opt/rocm)
if(NOT ROCM_FOUND)
  set(rocm_cmake_tag "master" CACHE STRING "rocm-cmake tag to download")
  file(
    DOWNLOAD https://github.com/ROCm/rocm-cmake/archive/${rocm_cmake_tag}.zip
    ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}.zip
    STATUS rocm_cmake_download_status LOG rocm_cmake_download_log
  )
  list(GET rocm_cmake_download_status 0 rocm_cmake_download_error_code)
  if(rocm_cmake_download_error_code)
    message(FATAL_ERROR "Error: downloading "
      "https://github.com/ROCm/rocm-cmake/archive/${rocm_cmake_tag}.zip failed "
      "error_code: ${rocm_cmake_download_error_code} "
      "log: ${rocm_cmake_download_log} "
    )
  endif()

  # Unpack, and stop right away on failure instead of configuring and building
  # a tree that may not exist.
  execute_process(
    COMMAND ${CMAKE_COMMAND} -E tar xzf ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}.zip
    WORKING_DIRECTORY ${PROJECT_EXTERN_DIR}
    RESULT_VARIABLE rocm_cmake_unpack_error_code
  )
  if(rocm_cmake_unpack_error_code)
    # The archive lives in PROJECT_EXTERN_DIR, so that is the path to report.
    message(FATAL_ERROR "Error: unpacking ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}.zip failed")
  endif()

  # Configure the unpacked tree with a private install prefix.
  execute_process(
    COMMAND ${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=${PROJECT_EXTERN_DIR}/rocm-cmake .
    WORKING_DIRECTORY ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}
    RESULT_VARIABLE rocm_cmake_configure_error_code
  )
  if(rocm_cmake_configure_error_code)
    message(FATAL_ERROR "Error: configuring rocm-cmake in ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag} failed: ${rocm_cmake_configure_error_code}")
  endif()

  # Build and install into ${PROJECT_EXTERN_DIR}/rocm-cmake.
  execute_process(
    COMMAND ${CMAKE_COMMAND} --build rocm-cmake-${rocm_cmake_tag} --target install
    WORKING_DIRECTORY ${PROJECT_EXTERN_DIR}
    RESULT_VARIABLE rocm_cmake_install_error_code
  )
  if(rocm_cmake_install_error_code)
    message(FATAL_ERROR "Error: installing rocm-cmake into ${PROJECT_EXTERN_DIR}/rocm-cmake failed: ${rocm_cmake_install_error_code}")
  endif()

  find_package( ROCM 0.7.3 REQUIRED CONFIG PATHS ${PROJECT_EXTERN_DIR}/rocm-cmake )
endif()

set(CMAKE_INSTALL_LIBDIR lib CACHE STRING "Define install directory for libraries" FORCE)

# ---------------------------------------------------------------------------
# Find or download/install fmt
# ---------------------------------------------------------------------------
find_package(fmt QUIET)
if(NOT fmt_FOUND)
  set(FMT_INSTALL OFF)
  message(STATUS "fmt not found, fetching from source...")
  FetchContent_Declare(
    fmt
    GIT_REPOSITORY https://github.com/fmtlib/fmt
    GIT_TAG        e69e5f977d458f2650bb346dadf2ad30c5320281 # 10.2.1
  )
  FetchContent_MakeAvailable(fmt)
else()
  message(STATUS "Using system fmt")
  # get_target_property() is a hard error on a missing target; this is only a
  # diagnostic, so skip it when the header-only target is not exported.
  if(TARGET fmt::fmt-header-only)
    get_target_property(FMT_INCLUDE_DIRS fmt::fmt-header-only INTERFACE_INCLUDE_DIRECTORIES)
    message(STATUS "fmt include directories: ${FMT_INCLUDE_DIRS}")
  endif()
endif()

# Find available local ROCM targets
# NOTE: This will eventually be part of ROCm-CMake and should be removed at that time
#
# rocm_local_targets(VARIABLE)
#   VARIABLE - name of the caller's variable that receives the result.
# Runs rocm_agent_enumerator (looked up with hints /opt/rocm/bin and
# $ENV{ROCM_PATH}) and stores the de-duplicated list of reported agents,
# excluding "gfx000", in VARIABLE. VARIABLE is left as "NOTFOUND" when the tool
# is missing, exits non-zero, or reports nothing else.
function(rocm_local_targets VARIABLE)
  set(${VARIABLE} "NOTFOUND" PARENT_SCOPE)
  find_program(_rocm_agent_enumerator rocm_agent_enumerator HINTS /opt/rocm/bin ENV ROCM_PATH)
  if(NOT _rocm_agent_enumerator STREQUAL "_rocm_agent_enumerator-NOTFOUND")
    execute_process(
      COMMAND "${_rocm_agent_enumerator}"
      RESULT_VARIABLE _found_agents
      OUTPUT_VARIABLE _rocm_agents
      ERROR_QUIET
    )
    if (_found_agents EQUAL 0)
      # One agent per output line -> CMake list
      string(REPLACE "\n" ";" _rocm_agents "${_rocm_agents}")
      unset(result)
      foreach (agent IN LISTS _rocm_agents)
        if (NOT agent STREQUAL "gfx000")
          list(APPEND result "${agent}")
        endif()
      endforeach()
      if(result)
        list(REMOVE_DUPLICATES result)
        set(${VARIABLE} "${result}" PARENT_SCOPE)
      endif()
    endif()
  endif()
endfunction()

# Iterate over the "source" list and check if there is a duplicate file name
# NOTE: This is due to compiler
# bug '--save-temps' and can be removed when fix available
#
# add_file_unique(FILE_LIST FILE)
#   FILE_LIST - NAME of the list variable holding the files collected so far
#   FILE      - path of the candidate file
# When a file in the list already has the same base name as FILE, HIP_FILE is
# set in the caller's scope to FILE's path with "_tmp" inserted before the
# extension. HIP_FILE is not touched when no duplicate exists, and the list
# itself is never modified here.
function(add_file_unique FILE_LIST FILE)
  get_filename_component(FILE_NAME "${FILE}" NAME)

  # Iterate over whatever is in the list so far
  foreach(curr_file IN LISTS ${FILE_LIST})
    get_filename_component(curr_file_name "${curr_file}" NAME)

    # Check if duplicate. Both operands are quoted so that a file name can never
    # be re-interpreted as a variable name or vanish when it expands to nothing.
    if("${FILE_NAME}" STREQUAL "${curr_file_name}")
      get_filename_component(DIR_PATH "${FILE}" DIRECTORY)
      get_filename_component(FILE_NAME_WE "${FILE}" NAME_WE)
      get_filename_component(FILE_EXT "${FILE}" EXT)

      # Construct a new file name by adding _tmp
      set(HIP_FILE "${DIR_PATH}/${FILE_NAME_WE}_tmp${FILE_EXT}" PARENT_SCOPE)

      # Every further match would compute the same value; stop at the first.
      break()
    endif()
  endforeach()
endfunction()

include(ROCMSetupVersion)
include(ROCMCreatePackage)
include(ROCMInstallTargets)
include(ROCMPackageConfigHelpers)
include(ROCMInstallSymlinks)
include(ROCMCheckTargetIds)
include(ROCMClients)
include(ROCMHeaderWrapper)

================================================
FILE: cmake/DownloadProject.CMakeLists.cmake.in
================================================
# Distributed under the OSI-approved MIT License. See accompanying
# file LICENSE or https://github.com/Crascit/DownloadProject for details.

cmake_minimum_required(VERSION 2.8.2)

project(${DL_ARGS_PROJ}-download NONE)

include(ExternalProject)
ExternalProject_Add(${DL_ARGS_PROJ}-download
                    ${DL_ARGS_UNPARSED_ARGUMENTS}
                    SOURCE_DIR          "${DL_ARGS_SOURCE_DIR}"
                    BUILD_IN_SOURCE     TRUE
                    TEST_COMMAND        ""
)

================================================
FILE: cmake/DownloadProject.cmake
================================================
# Distributed under the OSI-approved MIT License. See accompanying
# file LICENSE or https://github.com/Crascit/DownloadProject for details.
#
# MODULE: DownloadProject
#
# PROVIDES:
# download_project( PROJ projectName
#                   [PREFIX prefixDir]
#                   [DOWNLOAD_DIR downloadDir]
#                   [SOURCE_DIR srcDir]
#                   [BINARY_DIR binDir]
#                   [QUIET]
#                   ...
# )
#
# Provides the ability to download and unpack a tarball, zip file, git repository,
# etc. at configure time (i.e.
when the cmake command is run). How the downloaded # and unpacked contents are used is up to the caller, but the motivating case is # to download source code which can then be included directly in the build with # add_subdirectory() after the call to download_project(). Source and build # directories are set up with this in mind. # # The PROJ argument is required. The projectName value will be used to construct # the following variables upon exit (obviously replace projectName with its actual # value): # # projectName_SOURCE_DIR # projectName_BINARY_DIR # # The SOURCE_DIR and BINARY_DIR arguments are optional and would not typically # need to be provided. They can be specified if you want the downloaded source # and build directories to be located in a specific place. The contents of # projectName_SOURCE_DIR and projectName_BINARY_DIR will be populated with the # locations used whether you provide SOURCE_DIR/BINARY_DIR or not. # # The DOWNLOAD_DIR argument does not normally need to be set. It controls the # location of the temporary CMake build used to perform the download. # # The PREFIX argument can be provided to change the base location of the default # values of DOWNLOAD_DIR, SOURCE_DIR and BINARY_DIR. If all of those three arguments # are provided, then PREFIX will have no effect. The default value for PREFIX is # CMAKE_BINARY_DIR. # # The QUIET option can be given if you do not want to show the output associated # with downloading the specified project. # # In addition to the above, any other options are passed through unmodified to # ExternalProject_Add() to perform the actual download, patch and update steps. # # Only those ExternalProject_Add() arguments which relate to downloading, patching # and updating of the project sources are intended to be used. Also note that at # least one set of download-related arguments are required. 
#
# If using CMake 3.2 or later, the UPDATE_DISCONNECTED option can be used to
# prevent a check at the remote end for changes every time CMake is run
# after the first successful download. See the documentation of the ExternalProject
# module for more information. It is likely you will want to use this option if it
# is available to you. Note, however, that the ExternalProject implementation contains
# bugs which result in incorrect handling of the UPDATE_DISCONNECTED option when
# using the URL download method or when specifying a SOURCE_DIR with no download
# method. Fixes for these have been created, the last of which is scheduled for
# inclusion in CMake 3.8.0. Details can be found here:
#
# https://gitlab.kitware.com/cmake/cmake/commit/bdca68388bd57f8302d3c1d83d691034b7ffa70c
# https://gitlab.kitware.com/cmake/cmake/issues/16428
#
# If you experience build errors related to the update step, consider avoiding
# the use of UPDATE_DISCONNECTED.
#
# EXAMPLE USAGE:
#
# include(DownloadProject)
# download_project(PROJ googletest
#                  GIT_REPOSITORY https://github.com/google/googletest.git
#                  GIT_TAG master
#                  UPDATE_DISCONNECTED 1
#                  QUIET
# )
#
# add_subdirectory(${googletest_SOURCE_DIR} ${googletest_BINARY_DIR})
#
#========================================================================================

# Remember where this module lives so the .in template next to it can be found
# from inside download_project().
set(_DownloadProjectDir "${CMAKE_CURRENT_LIST_DIR}")

include(CMakeParseArguments)

# download_project(PROJ name [PREFIX dir] [DOWNLOAD_DIR dir] [SOURCE_DIR dir]
#                  [BINARY_DIR dir] [QUIET] ...)
# Generates a throw-away CMake project in DOWNLOAD_DIR from
# DownloadProject.CMakeLists.cmake.in, then configures and builds it at
# configure time. All arguments not named above are handed to
# ExternalProject_Add() by that template. On return the caller's scope holds
# <name>_SOURCE_DIR and <name>_BINARY_DIR. Either step failing is a FATAL_ERROR.
function(download_project)

    # Keyword tables for cmake_parse_arguments(); the DL_ARGS_ prefix must stay
    # as is because the .in template reads DL_ARGS_* directly.
    set(_dl_flag_keys   QUIET)
    set(_dl_single_keys PROJ
                        PREFIX
                        DOWNLOAD_DIR
                        SOURCE_DIR
                        BINARY_DIR)
    set(_dl_multi_keys  "")

    cmake_parse_arguments(DL_ARGS "${_dl_flag_keys}" "${_dl_single_keys}" "${_dl_multi_keys}" ${ARGN})

    # Expands to OUTPUT_QUIET for the execute_process() calls below when the
    # caller asked for silence, and to nothing otherwise.
    unset(_dl_quiet_arg)
    if (DL_ARGS_QUIET)
        set(_dl_quiet_arg "OUTPUT_QUIET")
    else()
        message(STATUS "Downloading/updating ${DL_ARGS_PROJ}")
    endif()

    # Base directory for the temporary CMakeLists.txt and for the default
    # source/binary directories. It is always made absolute.
    if (DL_ARGS_PREFIX)
        get_filename_component(DL_ARGS_PREFIX "${DL_ARGS_PREFIX}" ABSOLUTE
                               BASE_DIR "${CMAKE_CURRENT_BINARY_DIR}")
    else()
        set(DL_ARGS_PREFIX "${CMAKE_BINARY_DIR}")
    endif()

    # Fill in whichever of the three directories the caller left out.
    if (NOT DL_ARGS_DOWNLOAD_DIR)
        set(DL_ARGS_DOWNLOAD_DIR "${DL_ARGS_PREFIX}/${DL_ARGS_PROJ}-download")
    endif()
    if (NOT DL_ARGS_SOURCE_DIR)
        set(DL_ARGS_SOURCE_DIR "${DL_ARGS_PREFIX}/${DL_ARGS_PROJ}-src")
    endif()
    if (NOT DL_ARGS_BINARY_DIR)
        set(DL_ARGS_BINARY_DIR "${DL_ARGS_PREFIX}/${DL_ARGS_PROJ}-build")
    endif()

    # Tell the caller where the sources and the build tree ended up.
    set(${DL_ARGS_PROJ}_SOURCE_DIR "${DL_ARGS_SOURCE_DIR}" PARENT_SCOPE)
    set(${DL_ARGS_PROJ}_BINARY_DIR "${DL_ARGS_BINARY_DIR}" PARENT_SCOPE)

    # CLion copies CMakeCache.txt between its configurations because it does not
    # expect a project inside a project; the hard-coded paths in the copied
    # cache then break the nested build. Deleting the cache before configuring
    # avoids that. It is regenerated anyway, and since this only happens at
    # configure time it does not trigger extra builds or downloads.
    file(REMOVE "${DL_ARGS_DOWNLOAD_DIR}/CMakeCache.txt")

    # Write the helper project, then configure it. Repeating these steps when
    # nothing changed updates nothing, so no extra rebuilds follow.
    # CMAKE_MAKE_PROGRAM is forwarded in case the main project uses a build tool
    # that is not on the PATH.
    configure_file("${_DownloadProjectDir}/DownloadProject.CMakeLists.cmake.in"
                   "${DL_ARGS_DOWNLOAD_DIR}/CMakeLists.txt")
    execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}"
                        -D "CMAKE_MAKE_PROGRAM:FILE=${CMAKE_MAKE_PROGRAM}"
                        .
                    RESULT_VARIABLE _dl_status
                    ${_dl_quiet_arg}
                    WORKING_DIRECTORY "${DL_ARGS_DOWNLOAD_DIR}"
    )
    if(_dl_status)
        message(FATAL_ERROR "CMake step for ${DL_ARGS_PROJ} failed: ${_dl_status}")
    endif()

    # Build the helper project; this is what actually performs the download.
    # The job count is fixed at 16.
    execute_process(COMMAND ${CMAKE_COMMAND} --build . -j16
                    RESULT_VARIABLE _dl_status
                    ${_dl_quiet_arg}
                    WORKING_DIRECTORY "${DL_ARGS_DOWNLOAD_DIR}"
    )
    if(_dl_status)
        message(FATAL_ERROR "Build step for ${DL_ARGS_PROJ} failed: ${_dl_status}")
    endif()

endfunction()

================================================
FILE: cmake/FindIBVerbs.cmake
================================================
# MIT License
#
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Find module for libibverbs.
#
# Optional hints: IBVERBS_INCLUDE_DIR, IBVERBS_LIB_DIR, IBVERBS_ROOT_DIR
# Results (cache): IBVERBS_INCLUDE_DIRS - directory containing infiniband/verbs.h
#                  IBVERBS_LIBRARIES    - full path of the ibverbs library
# The found flag is set by find_package_handle_standard_args() once both
# results are available.

find_path(IBVERBS_INCLUDE_DIRS
  NAMES infiniband/verbs.h
  HINTS
  ${IBVERBS_INCLUDE_DIR}
  ${IBVERBS_ROOT_DIR}
  ${IBVERBS_ROOT_DIR}/include)

find_library(IBVERBS_LIBRARIES
  NAMES ibverbs
  HINTS
  ${IBVERBS_LIB_DIR}
  ${IBVERBS_ROOT_DIR}
  ${IBVERBS_ROOT_DIR}/lib)

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(IBVerbs DEFAULT_MSG IBVERBS_INCLUDE_DIRS IBVERBS_LIBRARIES)
# Mark the cache entries created above as advanced. The find_path() result is
# IBVERBS_INCLUDE_DIRS; IBVERBS_INCLUDE_DIR is only an input hint and is not a
# cache entry this module creates.
mark_as_advanced(IBVERBS_INCLUDE_DIRS IBVERBS_LIBRARIES)

================================================
FILE: cmake/Findmscclpp_nccl.cmake
================================================
# MIT License
#
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Find module for the MSCCL++ NCCL shim.
#
# Hint:            MSCCLPP_ROOT         - install prefix to search
# Results (cache): MSCCLPP_INCLUDE_DIRS - directory containing mscclpp/gpu.hpp
#                  MSCCLPP_LIBRARIES    - full path of the mscclpp_nccl library

find_path(MSCCLPP_INCLUDE_DIRS
  NAMES mscclpp/gpu.hpp
  HINTS
  ${MSCCLPP_ROOT}/include)

find_library(MSCCLPP_LIBRARIES
  NAMES mscclpp_nccl
  HINTS
  ${MSCCLPP_ROOT}/lib)

include (FindPackageHandleStandardArgs)
find_package_handle_standard_args(mscclpp_nccl DEFAULT_MSG MSCCLPP_INCLUDE_DIRS MSCCLPP_LIBRARIES)
mark_as_advanced(MSCCLPP_INCLUDE_DIRS MSCCLPP_LIBRARIES)

================================================
FILE: cmake/Findrocshmem_static.cmake
================================================
# MIT License
#
# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# Find module for a pre-built rocSHMEM.
#
# Hint:            ROCSHMEM_INSTALL_DIR - install prefix to search
# Results (cache): ROCSHMEM_INCLUDE_DIR - directory containing rocshmem/rocshmem.hpp
#                                         or rocshmem/rocshmem.h
#                  ROCSHMEM_LIBRARY     - full path of the rocshmem library
#                  IBVERBS              - full path of libibverbs (not part of the
#                                         found/not-found decision)

find_path(ROCSHMEM_INCLUDE_DIR
  NAMES rocshmem/rocshmem.hpp rocshmem/rocshmem.h
  HINTS
  ${ROCSHMEM_INSTALL_DIR}/include/)

find_library(ROCSHMEM_LIBRARY
  NAMES rocshmem
  HINTS
  ${ROCSHMEM_INSTALL_DIR}/lib)

## -- todo --- what to do with verbs? add to handle args call below? -- ##
find_library(IBVERBS ibverbs)

# find_package_handle_standard_args() is defined by this module; include it
# here rather than relying on some other find module having loaded it first.
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(rocshmem_static DEFAULT_MSG ROCSHMEM_INCLUDE_DIR ROCSHMEM_LIBRARY)
## mark_as_advanced(MSCCLPP_INCLUDE_DIRS MSCCLPP_NCCL_STATIC_LIB) add this for Rocshmem?

================================================
FILE: cmake/MSCCLPP.cmake
================================================
# MIT License
#
# Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# Dependencies

# HIP dependency is handled earlier in the project cmake file
# when VerifyCompiler.cmake is included.
# GIT

# Test dependencies

# For downloading, building, and installing required dependencies
include(cmake/DownloadProject.cmake)

# When ENABLE_MSCCLPP is set: patch the MSCCL++ submodule, build and install it
# into ext/mscclpp at configure time, revert the patches, and expose the
# symbol-renamed static archive as the imported target `mscclpp_nccl`.
if(ENABLE_MSCCLPP)
    # Try to find the mscclpp install
    set(MSCCLPP_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/ext/mscclpp CACHE PATH "")
    # Same effect as `mkdir -p`, without depending on an external tool.
    file(MAKE_DIRECTORY "${MSCCLPP_ROOT}")
    list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")

    find_package(mscclpp_nccl)

    #if(NOT mscclpp_nccl_FOUND)
        # Ensure the source code is checked out
        set(MSCCLPP_SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/mscclpp CACHE PATH "")
        set(JSON_SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/json CACHE PATH "")

        if((NOT EXISTS ${MSCCLPP_SOURCE}/CMakeLists.txt) OR (NOT EXISTS ${JSON_SOURCE}/CMakeLists.txt))
            message(STATUS "Checking out external code")
            execute_process(
                COMMAND git submodule update --init --recursive
                WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
            )
        endif()

        # Patches from ext-src/, in the order they are applied. They are
        # reverted further down in the opposite order.
        set(_mscclpp_patches
            cpx.patch
            read-allred.patch
            mscclpp_ibv_access_relaxed_ordering.patch
            mem-reg.patch
            non-multiple-128-fix.patch
            bf16-tuning.patch
            reg-fix.patch
            no-cache.patch
            device-flag.patch
            remove-clip.patch
            disable-executor.patch
            disable-format-checks.patch
        )

        foreach(_mscclpp_patch IN LISTS _mscclpp_patches)
            execute_process(
                COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/${_mscclpp_patch}
                WORKING_DIRECTORY ${MSCCLPP_SOURCE}
                RESULT_VARIABLE _mscclpp_patch_status
            )
            # A failure used to go unnoticed. Report it but carry on as before,
            # e.g. a patch left in place by an interrupted earlier configure is
            # still reverted correctly below.
            if(_mscclpp_patch_status)
                message(WARNING "git apply ${_mscclpp_patch} failed in ${MSCCLPP_SOURCE}: ${_mscclpp_patch_status}")
            endif()
        endforeach()

        # Forward selected settings of this build to the MSCCL++ build. List
        # values are re-joined with '%' (see LIST_SEPARATOR below).
        set(CMAKE_INHERITED_ARGS "")
        set(CMAKE_ARGS_LIST "CMAKE_PREFIX_PATH;CMAKE_INSTALL_RPATH_USE_LINK_PATH;HIP_COMPILER")
        foreach(arg IN LISTS CMAKE_ARGS_LIST)
            if(DEFINED ${arg})
                string(REPLACE ";" "%" ARG_VALUE "${${arg}}") # Replace ; with new list separator symbol % to avoid CMake errors
                string(STRIP "${ARG_VALUE}" ARG_VALUE) # Eliminate whitespace, reducing to empty string if necessary

                # Only add a cmake argument if it has a value
                if("${ARG_VALUE}" STREQUAL "")
                    continue()
                endif()
                string(APPEND CMAKE_INHERITED_ARGS "-D${arg}=\"${ARG_VALUE}\" ")
            endif()
        endforeach()

        if(NOT DEFINED CACHE{MSCCLPP_GPU_TARGETS})
            message(STATUS "Building MSCCL++ only for supported variants: gfx942;gfx950")
            set(MSCCLPP_GPU_TARGETS "gfx942;gfx950")
            if(BUILD_ADDRESS_SANITIZER)
                set(MSCCLPP_GPU_TARGETS "gfx942:xnack+;gfx950:xnack+")
            endif()
        else()
            message(STATUS "Building MSCCL++ for ${MSCCLPP_GPU_TARGETS}")
        endif()
        string(REPLACE ";" "%" MSCCLPP_GPU_TARGETS "${MSCCLPP_GPU_TARGETS}")

        # The install prefix must be MSCCLPP_ROOT: that is where
        # Findmscclpp_nccl and the objcopy step below look for the results.
        download_project(PROJ                mscclpp_nccl
                         #GIT_REPOSITORY      https://github.com/microsoft/mscclpp.git
                         #GIT_TAG             4ee15b7ad085daaf74349d4c49c9b8480d28f0dc
                         INSTALL_DIR         ${MSCCLPP_ROOT}
                         LIST_SEPARATOR      %
                         CMAKE_ARGS          "-DGPU_TARGETS=${MSCCLPP_GPU_TARGETS}" -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DMSCCLPP_BUILD_APPS_NCCL=ON -DMSCCLPP_BUILD_PYTHON_BINDINGS=OFF -DMSCCLPP_BUILD_TESTS=OFF -DMSCCLPP_CLIP_ENABLED=${ENABLE_MSCCLPP_CLIP} -DMSCCLPP_ENABLE_EXECUTOR=${ENABLE_MSCCLPP_EXECUTOR} -DMSCCLPP_ENABLE_FORMAT_CHECKS=${ENABLE_MSCCLPP_FORMAT_CHECKS} -DCMAKE_INSTALL_PREFIX=${MSCCLPP_ROOT} -DCMAKE_VERBOSE_MAKEFILE=1 "${CMAKE_INHERITED_ARGS}" -DFETCHCONTENT_SOURCE_DIR_JSON=${JSON_SOURCE}
                         LOG_DOWNLOAD        FALSE
                         LOG_CONFIGURE       FALSE
                         LOG_BUILD           FALSE
                         LOG_INSTALL         FALSE
                         UPDATE_DISCONNECTED TRUE
                         SOURCE_DIR          ${MSCCLPP_SOURCE}
        )

        # Put the submodule back into its pristine state. This runs before the
        # REQUIRED lookup below so that a failed lookup no longer leaves the
        # checkout patched.
        set(_mscclpp_patches_reversed ${_mscclpp_patches})
        list(REVERSE _mscclpp_patches_reversed)
        foreach(_mscclpp_patch IN LISTS _mscclpp_patches_reversed)
            execute_process(
                COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/${_mscclpp_patch}
                WORKING_DIRECTORY ${MSCCLPP_SOURCE}
                RESULT_VARIABLE _mscclpp_patch_status
            )
            if(_mscclpp_patch_status)
                message(WARNING "git apply --reverse ${_mscclpp_patch} failed in ${MSCCLPP_SOURCE}: ${_mscclpp_patch_status}")
            endif()
        endforeach()

        find_package(mscclpp_nccl REQUIRED)
    #endif()

    # Rename symbols according to mscclpp_nccl_syms.txt and write the result
    # into the build tree; the imported target below points at that copy.
    execute_process(
        COMMAND objcopy
                --redefine-syms=${CMAKE_CURRENT_SOURCE_DIR}/src/misc/mscclpp/mscclpp_nccl_syms.txt
                "${MSCCLPP_ROOT}/lib/libmscclpp_nccl_static.a"
                "${PROJECT_BINARY_DIR}/libmscclpp_nccl.a"
        RESULT_VARIABLE _mscclpp_objcopy_status
    )
    if(_mscclpp_objcopy_status)
        # Without this archive the imported target refers to a missing file.
        message(FATAL_ERROR "objcopy could not create ${PROJECT_BINARY_DIR}/libmscclpp_nccl.a: ${_mscclpp_objcopy_status}")
    endif()
    add_library(mscclpp_nccl STATIC IMPORTED)
    set_target_properties(mscclpp_nccl PROPERTIES IMPORTED_LOCATION ${PROJECT_BINARY_DIR}/libmscclpp_nccl.a)
endif()

================================================
FILE: cmake/ROCSHMEM.cmake
================================================
# MIT License
#
# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
include(ExternalProject)

# add_rocshmem_targets()
# Makes rocSHMEM available to the caller. Takes no arguments.
#   - If ROCSHMEM_INSTALL_DIR is set and find_package(rocshmem_static) succeeds
#     there, the pre-built installation is used.
#   - Otherwise the ext-src/rocSHMEM submodule is built at build time through
#     ExternalProject and installed into ext/rocshmem; the custom target
#     `rocshmem_static` depends on that build.
# Sets in the caller's scope: ROCSHMEM_INCLUDE_DIR, ROCSHMEM_LIBRARY, IBVERBS.
# A missing libibverbs is a FATAL_ERROR in both cases.
function(add_rocshmem_targets)
  # Check for an existing installation via the user-provided prefix ROCSHMEM_INSTALL_DIR
  if(ROCSHMEM_INSTALL_DIR)
    list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
    find_package(rocshmem_static)
    if(NOT IBVERBS)
      find_library(IBVERBS ibverbs)
      if(IBVERBS)
        set(IBVERBS ${IBVERBS} PARENT_SCOPE)
      endif()
    endif()
  endif()

  # If no pre-existing installation, build from submodule into ext/rocshmem
  if(NOT rocshmem_static_FOUND)
    set(_rccl_root "${CMAKE_SOURCE_DIR}")
    set(ROCSHMEM_SOURCE "${_rccl_root}/ext-src/rocSHMEM")
    set(ROCSHMEM_INSTALL_DIR "${_rccl_root}/ext/rocshmem")

    # Make sure submodule exists (same style as MSCCL++: custom rule + target)
    add_custom_command(
      OUTPUT "${ROCSHMEM_SOURCE}/CMakeLists.txt"
      COMMAND git submodule update --init --recursive ext-src/rocSHMEM
      WORKING_DIRECTORY "${_rccl_root}"
      COMMENT "Checking out submodule: ext-src/rocSHMEM"
      VERBATIM
    )
    add_custom_target(rocshmem_checkout_submodule
      DEPENDS "${ROCSHMEM_SOURCE}/CMakeLists.txt")

    # Where our patch files live (like MSCCL++)
    set(EXT_SOURCE "${_rccl_root}/ext-src")

    # Build and install rocSHMEM. We run `../scripts/build_configs/gda_bnxt`
    # from a 'build' dir just like the README shows.
    ExternalProject_Add(rocshmem_ext
      SOURCE_DIR          "${ROCSHMEM_SOURCE}"
      INSTALL_DIR         "${ROCSHMEM_INSTALL_DIR}"
      UPDATE_DISCONNECTED TRUE
      LOG_DOWNLOAD        FALSE
      LOG_CONFIGURE       FALSE
      LOG_BUILD           FALSE
      LOG_INSTALL         FALSE
      BUILD_IN_SOURCE     TRUE
      DOWNLOAD_COMMAND    ""   # using the submodule checkout above
      TEST_COMMAND        ""
      DEPENDS             rocshmem_checkout_submodule
      # Rocshmem submodule commit hash -> commit b28a56bd54ccc581d05a439ffa466c3dacb3385
      # The project has its own scripts; we replicate the README sequence:
      CONFIGURE_COMMAND   ""
      # The install prefix must be ROCSHMEM_INSTALL_DIR, because that is where
      # ROCSHMEM_INCLUDE_DIR / ROCSHMEM_LIBRARY below expect the results.
      # The compile and install steps go through `cmake --build` so that they
      # use whatever generator the inner configure picked, instead of assuming
      # it matches this project's CMAKE_MAKE_PROGRAM.
      BUILD_COMMAND
        ${CMAKE_COMMAND} -E make_directory build &&
        ${CMAKE_COMMAND} -E chdir build bash -lc "../scripts/build_configs/gda_bnxt -DUSE_EXTERNAL_MPI=OFF -DUSE_IPC=ON -DBUILD_EXAMPLES=OFF " &&
        ${CMAKE_COMMAND} -E chdir build ${CMAKE_COMMAND} -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=${ROCSHMEM_INSTALL_DIR} -DBUILD_EXAMPLES=OFF .. &&
        ${CMAKE_COMMAND} -E chdir build ${CMAKE_COMMAND} --build . --parallel
      INSTALL_COMMAND
        ${CMAKE_COMMAND} -E chdir build ${CMAKE_COMMAND} --build . --target install
    )

    # After build, define the variables RCCL expects
    set(ROCSHMEM_INCLUDE_DIR "${ROCSHMEM_INSTALL_DIR}/include" PARENT_SCOPE)
    set(ROCSHMEM_LIBRARY "${ROCSHMEM_INSTALL_DIR}/lib/librocshmem.a" PARENT_SCOPE)

    find_library(_IBVERBS ibverbs)
    if(NOT _IBVERBS)
      message(FATAL_ERROR "libibverbs not found (install rdma-core/libibverbs-dev)")
    endif()
    set(IBVERBS ${_IBVERBS} PARENT_SCOPE)

    # Provide a dummy target other code can depend on
    add_custom_target(rocshmem_static ALL DEPENDS rocshmem_ext)
  else()
    # We found a prebuilt rocSHMEM; export variables upward as-is
    set(ROCSHMEM_INCLUDE_DIR "${ROCSHMEM_INCLUDE_DIR}" PARENT_SCOPE)
    set(ROCSHMEM_LIBRARY "${ROCSHMEM_LIBRARY}" PARENT_SCOPE)

    find_library(_IBVERBS ibverbs)
    if(NOT _IBVERBS)
      message(FATAL_ERROR "libibverbs not found")
    endif()
    set(IBVERBS ${_IBVERBS} PARENT_SCOPE)
  endif()
endfunction()

================================================
FILE: cmake/rcclRAS.cmake
================================================
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
cmake_minimum_required(VERSION 3.16)

message("Building rccl RAS client executable")

# The RAS client is built from a single source file taken from the hipify
# directory of the build tree.
add_executable(rcclras "${PROJECT_BINARY_DIR}/hipify/src/ras/client.cc")

# Header search path: the build tree's include/ first, then the hipified
# sources and their include/ directory.
target_include_directories(rcclras PRIVATE
  ${PROJECT_BINARY_DIR}/include
  ${HIPIFY_DIR}/src
  ${HIPIFY_DIR}/src/include)

# Needed for both the shared and the static RCCL build.
target_link_libraries(rcclras PRIVATE hip::host dl)

if(NOT BUILD_SHARED_LIBS)
  # Static RCCL: make sure rccl is built first, then link it and the HIP
  # runtime through explicit linker flags.
  add_dependencies(rcclras rccl)
  target_link_libraries(rcclras PRIVATE dl rt -lrccl -L${CMAKE_BINARY_DIR} -lamdhip64 -L${ROCM_PATH}/lib)
else()
  # Shared RCCL: link against the rccl target directly.
  target_link_libraries(rcclras PRIVATE rccl hip::device)
endif()

rocm_install(TARGETS rcclras)

================================================
FILE: cmake/rocmIb.cmake
================================================
# MIT License
#
# Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Dependencies

# HIP dependency is handled earlier in the project cmake file
# when VerifyCompiler.cmake is included.

# GIT

# Test dependencies

# For downloading, building, and installing required dependencies
include(cmake/DownloadProject.cmake)

message(STATUS "Generating ROCM NetIB... ")

# -------------------------
# Configurable paths
# -------------------------
# Path to RCCL source tree (local clone)
set(RCCL_SRC_DIR "${CMAKE_SOURCE_DIR}" CACHE PATH "Path to RCCL source directory")

# Path to patch file
set(ROCM_NETIB_PATCH_FILE "${CMAKE_SOURCE_DIR}/ext-src/rocm_netib.patch" CACHE FILEPATH "ROCM NETIB Patch file to apply to RCCL")
set(ROCM_NETIB_FILE "${CMAKE_SOURCE_DIR}/src/transport/net_ib_rocm.cc" CACHE FILEPATH "Generated ROCM NETIB file")

# -------------------------
# Find tools
# -------------------------
find_program(PATCH_EXECUTABLE patch)
find_program(SED_EXECUTABLE sed)

# Both tools are required to generate ${ROCM_NETIB_FILE}; stop with a clear
# message instead of silently leaving the file missing or half-generated.
if(NOT PATCH_EXECUTABLE)
  message(FATAL_ERROR "ROCM NetIB: 'patch' was not found; cannot generate ${ROCM_NETIB_FILE}")
endif()
if(NOT SED_EXECUTABLE)
  message(FATAL_ERROR "ROCM NetIB: 'sed' was not found; cannot generate ${ROCM_NETIB_FILE}")
endif()

# -------------------------
# Step 1: apply the patch, writing the result to ${ROCM_NETIB_FILE}
# -------------------------
# The progress message is printed with message() rather than as an extra
# COMMAND: multiple COMMANDs in one execute_process() form a pipeline, which
# would feed the text into patch's stdin instead of showing it.
message(STATUS "Applying RCCL ROCM NetIB patch... to ${CMAKE_SOURCE_DIR}")
execute_process(
  COMMAND "${PATCH_EXECUTABLE}" -p1 -i "${ROCM_NETIB_PATCH_FILE}" -o "${ROCM_NETIB_FILE}"
  WORKING_DIRECTORY ${RCCL_SRC_DIR}
  RESULT_VARIABLE _rocm_netib_patch_result
)
if(NOT _rocm_netib_patch_result EQUAL 0)
  message(FATAL_ERROR
    "ROCM NetIB: applying ${ROCM_NETIB_PATCH_FILE} failed (result: ${_rocm_netib_patch_result})")
endif()

# -------------------------
# Step 2: rename symbols in the generated file
# -------------------------
# The expressions are applied in the order listed, exactly as the previous
# sequence of individual `sed -i` invocations did (sed runs every -e
# expression, in order, on each line).
set(_rocm_netib_sed_exprs
  "s/NCCL_PARAM(Ib/NCCL_PARAM(RocmIb/g"
  "s/RCCL_PARAM(Ib/RCCL_PARAM(RocmIb/g"
  "s/ncclParamIb/ncclParamRocmIb/g"
  "s/rcclParamIb/rcclParamRocmIb/g"
  "s/ncclIbMergedDevs/rocmIbMergedDevs/g"
  "s/ncclIbDevs/rocmIbDevs/g"
  "s/ncclIbLock/rocmIbLock/g"
  "s/ibProviderName/rocmIbProviderName/g"
  "s/ncclIbAsyncThread/rocmIbAsyncThread/g"
  "s/ncclIbGdrSupport/rocmIbGdrSupport/g"
  "s/ncclIbDmaBufSupport/rocmIbDmaBufSupport/g"
  "s/ncclIbInitCommDevBase/rocmIbInitCommDevBase/g"
  "s/ncclIbDestroyBase/rocmIbDestroyBase/g"
  "s/ncclIbRtrQp/rocmIbRtrQp/g"
  "s/ncclIbRtsQp/rocmIbRtsQp/g"
  "s/ForceEnableGdrdma/RocmForceEnableGdrdma/g"
  "s/ncclIbCheckVProps/rocmIbCheckVProps/g"
  "s/ncclIbGetRequest/rocmIbGetRequest/g"
  "s/ncclIbFreeRequest/rocmIbFreeRequest/g"
  "s/ncclIbRegMrDmaBufInternal/rocmIbRegMrDmaBufInternal/g"
  "s/ncclIbGetNetCommDevBase/rocmIbGetNetCommDevBase/g"
  "s/ncclIbDeregMrInternal/rocmIbDeregMrInternal/g"
  "s/ncclIbPostFifo/rocmIbPostFifo/g"
  "s/reqTypeStr/rocmIbReqTypeStr/g"
  "s/rcclNetP2pPolicy/rcclRocmNetP2pPolicy/g"
  "s/ncclIbMakeVDeviceInternal/rocmIbMakeVDeviceInternal/g"
  "s/ncclIbMakeVDevice/rocmIbMakeVDevice/g"
  "s/ncclIbInit/rocmIbInit/g"
  "s/ncclIbDevices/rocmIbDevices/g"
  "s/ncclIbGetPhysProperties/rocmIbGetPhysProperties/g"
  "s/ncclIbGetProperties/rocmIbGetProperties/g"
  "s/ncclIbListen(/rocmIbListen(/g"
  "s/ncclIbListen,/rocmIbListen,/g"
  "s/ncclIbConnect(/rocmIbConnect(/g"
  "s/ncclIbConnect /rocmIbConnect /g"
  "s/ncclIbConnect,/rocmIbConnect,/g"
  "s/ncclIbAccept/rocmIbAccept/g"
  "s/ncclIbTest/rocmIbTest/g"
  "s/ncclIbRegMrDmaBuf/rocmIbRegMrDmaBuf/g"
  "s/ncclIbRegMr/rocmIbRegMr/g"
  "s/ncclIbDeregMr/rocmIbDeregMr/g"
  "s/ncclIbIsend/rocmIbIsend/g"
  "s/ncclIbIrecv/rocmIbIrecv/g"
  "s/ncclIbIflush/rocmIbIflush/g"
  "s/ncclIbCloseSend/rocmIbCloseSend/g"
  "s/ncclIbCloseRecv/rocmIbCloseRecv/g"
  "s/ncclIbCloseListen/rocmIbCloseListen/g"
  "s/ncclNetIb/rocmNetIb/g"
  "s/ncclIbFinalize/rocmNetIbFinalize/g"
  "s/ncclIbSetNetAttr/rocmNetIbSetNetAttr/g"
)

# Turn the expression list into `-e <expr>` argument pairs for one sed call.
set(_rocm_netib_sed_args "")
foreach(_rocm_netib_expr IN LISTS _rocm_netib_sed_exprs)
  list(APPEND _rocm_netib_sed_args -e "${_rocm_netib_expr}")
endforeach()

execute_process(
  COMMAND "${SED_EXECUTABLE}" -i ${_rocm_netib_sed_args} "${ROCM_NETIB_FILE}"
  WORKING_DIRECTORY ${RCCL_SRC_DIR}
  RESULT_VARIABLE _rocm_netib_sed_result
)
if(NOT _rocm_netib_sed_result EQUAL 0)
  message(FATAL_ERROR
    "ROCM NetIB: renaming symbols in ${ROCM_NETIB_FILE} failed (result: ${_rocm_netib_sed_result})")
endif()

list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")

================================================
FILE: cmake/scripts/add_faults.sh
================================================
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

HIP_FILE=$1

if [[ "$HIP_FILE" =~ .*/src/device/.*\.h ]]; then
  sed -i "s/__syncthreads()/__syncthreads(); insert_random_delay_per_warp()/" "$HIP_FILE"
  echo "Added fault injection to $HIP_FILE"
fi

================================================
FILE: cmake/scripts/add_unroll.sh
================================================
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
# Rewrites the file given as the first argument in place, appending extra
# template parameters/arguments (USE_ACC, COLL_UNROLL, Pipeline, or a literal 0)
# to the constructs matched below.
#
# NOTE(review): several patterns in this copy contain an unclosed "(" group or
# reference \2 without a second capture group (e.g. 's/(template/...\2>/g').
# They look truncated - presumably angle-bracket content was lost when this
# text was extracted. Confirm against the repository before relying on them.
HIP_FILE=$1

# Only paths matching /src/device/ followed by ".h" are rewritten.
if [[ "$HIP_FILE" =~ .*/src/device/.*\.h ]]; then
  # `perl -pi -e` applies the substitution to every line and edits the file in place.
  # The template-declaration rewrite is issued twice.
  perl -pi -e 's/(template/\1, int USE_ACC, int COLL_UNROLL, int Pipeline\2>/g' "$HIP_FILE"
  perl -pi -e 's/(template/\1, int USE_ACC, int COLL_UNROLL, int Pipeline\2>/g' "$HIP_FILE"
  # Append USE_ACC, COLL_UNROLL to two-argument ProtoSimple<...> instantiations.
  perl -pi -e 's/(ProtoSimple<[^,]*?,[^,]+?)>/\1, USE_ACC, COLL_UNROLL>/g' "$HIP_FILE"
  # Append USE_ACC, COLL_UNROLL at runRing / runTreeUpDown / runTreeSplit call sites.
  perl -pi -e 's/(runRing\()/\1, USE_ACC, COLL_UNROLL\2/g' "$HIP_FILE"
  perl -pi -e 's/(runTreeUpDown\(/\1, USE_ACC, COLL_UNROLL>(/' "$HIP_FILE"
  perl -pi -e 's/(runTreeSplit\(/\1, USE_ACC, COLL_UNROLL>(/' "$HIP_FILE"
  # Append a literal 0 argument (no /g flag: first match per line only).
  perl -pi -e 's/(runTreeSplit/\1, 0>/' "$HIP_FILE"
  perl -pi -e 's/(runTreeUpDown/\1, 0>/' "$HIP_FILE"
  perl -pi -e 's/(runRing/\1, 0>/' "$HIP_FILE"
  perl -pi -e 's/(runRing/\1, 0>/' "$HIP_FILE"
  # Append the Pipeline argument (no /g flag: first match per line only).
  perl -pi -e 's/(runRing/\1, Pipeline>/' "$HIP_FILE"
  perl -pi -e 's/(runRing/\1, Pipeline>/' "$HIP_FILE"
  perl -pi -e 's/(runTreeSplit/\1, Pipeline>/' "$HIP_FILE"
  perl -pi -e 's/(runTreeUpDown/\1, Pipeline>/' "$HIP_FILE"
  # Extend RunWorkBatch / RunWorkColl template argument lists with
  # USE_ACC, COLL_UNROLL, Pipeline.
  sed -i "s/\\(struct RunWorkBatch]*\\)>*/\\1, USE_ACC, COLL_UNROLL, Pipeline>/" "$HIP_FILE"
  sed -i "s/\\(RunWorkColl<[^,]*,[^,]*,[^,]*,[^,]*,[^>]*\\)>/\\1, USE_ACC, COLL_UNROLL, Pipeline>/" "$HIP_FILE"
fi

================================================
FILE: cmake/scripts/extract_metadata.cmake
================================================
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# Lists the code objects embedded in librccl.so with roc-obj-ls and extracts
# those belonging to the selected gfx architectures with roc-obj-extract.
# Every failure is reported as a WARNING so the calling build keeps going.

set(EXTRACT_TIMEOUT 5 CACHE STRING "Timeout in seconds for roc-obj-* calls")

# When a TIMEOUT expires, execute_process() stores a descriptive string that
# mentions the timeout in RESULT_VARIABLE (not the literal "TIMEOUT"), so
# timeouts are detected with a pattern match. "TIMEOUT" is still accepted.
set(_timeout_result_regex "timeout|TIMEOUT")

## List the objects for each gfx architecture
execute_process(
  COMMAND roc-obj-ls librccl.so
  RESULT_VARIABLE list_result
  OUTPUT_VARIABLE cmd_output
  ERROR_VARIABLE cmd_error
  OUTPUT_STRIP_TRAILING_WHITESPACE
  ERROR_STRIP_TRAILING_WHITESPACE
  TIMEOUT ${EXTRACT_TIMEOUT}
)

if(list_result EQUAL 0)
  ## Convert cmd output to list of lines
  string(REGEX REPLACE "\n$" "" cmd_output "${cmd_output}")
  string(REPLACE "\n" ";" cmd_output "${cmd_output}")

  ## Extract file paths for the selected gfx archs
  set(file_paths "")
  foreach(line ${cmd_output})
    if(line MATCHES "(gfx90a|gfx942|gfx950)")
      # Keep everything from "file://" to the end of the line.
      string(REGEX MATCH "\\file://(.*)" file_match "${line}")
      if(file_match)
        list(APPEND file_paths "${file_match}")
      endif()
    endif()
  endforeach()

  ## Extract objects from files
  foreach(file ${file_paths})
    execute_process(
      COMMAND roc-obj-extract ${file}
      RESULT_VARIABLE extraction_result
      ERROR_VARIABLE extraction_error
      OUTPUT_STRIP_TRAILING_WHITESPACE
      ERROR_STRIP_TRAILING_WHITESPACE
      TIMEOUT ${EXTRACT_TIMEOUT}
    )
    if(extraction_result MATCHES "${_timeout_result_regex}")
      message(
        WARNING
        "[Timeout] Extraction of '${file}' did not finish within ${EXTRACT_TIMEOUT}s. stderr: ${extraction_error}. Timeouts have been known to happen as a result of mismatched ROCm versions/executables/etc."
      )
    elseif(NOT extraction_result EQUAL 0)
      message(
        WARNING
        "[Error ${extraction_result}] Could not extract objects from '${file}'. stderr: ${extraction_error}"
      )
    endif()
  endforeach()
elseif(list_result MATCHES "${_timeout_result_regex}")
  message(
    WARNING
    "[Timeout] roc-obj-ls did not finish within ${EXTRACT_TIMEOUT}s. stderr: ${cmd_error}. Timeouts have been known to happen as a result of mismatched ROCm versions/executables/etc"
  )
else()
  ## We don't want to stop building unit-tests if this command fails.
  message(WARNING "[Error ${list_result}] roc-obj-ls failed. stderr: ${cmd_error}")
endif()

================================================
FILE: cmake/scripts/git_version.cmake
================================================
# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Attempt to collect the latest git hash and write it, together with the branch
# name and a '+' marker for uncommitted changes, to git_version.cpp. The file is
# rewritten only when its content would change.

# Use RCCL_SOURCE_DIR if passed, otherwise fallback to CMAKE_CURRENT_SOURCE_DIR
if(NOT DEFINED RCCL_SOURCE_DIR)
  set(RCCL_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
endif()

if(NOT DEFINED RCCL_BINARY_DIR)
  set(RCCL_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
endif()

# execute_process() does not go through a shell, so the format string must not
# be wrapped in quotes: they would be passed to git literally and end up in the
# output.
execute_process(
  COMMAND git log --pretty=format:%h -n 1
  WORKING_DIRECTORY ${RCCL_SOURCE_DIR}
  OUTPUT_VARIABLE GIT_REV
  ERROR_QUIET)
string(STRIP "${GIT_REV}" GIT_REV)

# Check if git information was found
if("${GIT_REV}" STREQUAL "")
  set(CURR_GIT_VERSION "const char *rcclGitHash =\"Unknown \";")
else()
  # Check for changes (denote with a '+') after hash. A non-zero result from
  # `git diff --quiet --exit-code` marks the tree as modified.
  execute_process(
    COMMAND git diff --quiet --exit-code
    WORKING_DIRECTORY ${RCCL_SOURCE_DIR}
    RESULT_VARIABLE GIT_DIFF_RESULT)
  if(GIT_DIFF_RESULT EQUAL 0)
    set(GIT_DIFF "")
  else()
    set(GIT_DIFF "+")
  endif()

  # Collect branch information
  execute_process(
    COMMAND git rev-parse --abbrev-ref HEAD
    WORKING_DIRECTORY ${RCCL_SOURCE_DIR}
    OUTPUT_VARIABLE GIT_BRANCH)

  # Keep at most the first 7 characters of the abbreviated hash.
  string(SUBSTRING "${GIT_REV}" 0 7 GIT_REV)
  string(STRIP "${GIT_BRANCH}" GIT_BRANCH)

  set(CURR_GIT_VERSION "const char *rcclGitHash =\"${GIT_BRANCH}:${GIT_REV}${GIT_DIFF}\";")
endif()

# Compare file with older git version file (git_version.cpp)
if(EXISTS ${RCCL_BINARY_DIR}/git_version.cpp)
  #MESSAGE(STATUS "Found ${RCCL_BINARY_DIR}/git_version.cpp")
  file(READ ${RCCL_BINARY_DIR}/git_version.cpp PREV_GIT_VERSION)
  #message(STATUS "CURR GIT version: ${CURR_GIT_VERSION}")
  #message(STATUS "PREV GIT version: ${PREV_GIT_VERSION}")
  if(NOT "${CURR_GIT_VERSION}" STREQUAL "${PREV_GIT_VERSION}")
    message(STATUS "Updating git_version.cpp")
    file(WRITE ${RCCL_BINARY_DIR}/git_version.cpp "${CURR_GIT_VERSION}")
  else()
    message(STATUS "No changes to git_version.cpp required")
  endif()
else()
  # Create git_version.cpp if it doesn't exist yet
  file(WRITE ${RCCL_BINARY_DIR}/git_version.cpp "${CURR_GIT_VERSION}")
endif()

================================================
FILE: docker/Dockerfile.ubuntu
================================================
## base docker image
ARG ROCM_IMAGE_NAME=rocm/dev-ubuntu-22.04
ARG ROCM_IMAGE_TAG=latest
FROM "${ROCM_IMAGE_NAME}:${ROCM_IMAGE_TAG}"

## rccl repo
ARG RCCL_REPO=https://github.com/ROCm/rccl
ARG RCCL_BRANCH=develop

## rccl-tests repo
ARG RCCL_TESTS_REPO=https://github.com/ROCm/rccl-tests
ARG RCCL_TESTS_BRANCH=develop

## AMD GPU Targets
ARG GPU_TARGETS=gfx942

## creating scratch space
## (key=value form; the space-separated ENV form is a legacy syntax)
ENV WORKDIR=/workspace
RUN mkdir -p ${WORKDIR}
WORKDIR ${WORKDIR}

## install dependencies
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
    ca-certificates \
    git \
    make \
    rocm-cmake \
    ninja-build \
    gfortran \
    build-essential \
    libomp5 \
    libomp-dev \
    libbfd-dev \
    libboost-all-dev \
    libnuma1 \
    libnuma-dev \
    libpthread-stubs0-dev \
    libzstd-dev \
    lcov \
    zip \
    zlib1g-dev \
    wget \
    pkg-config \
    unzip \
    chrpath \
    doxygen \
    lshw \
    libssl-dev \
    curl \
    libncursesw5-dev \
    xz-utils \
    liblzma-dev \
    python3-pip \
    python3-setuptools \
    python3-venv \
    python3-dev \
    python3-tk \
    python3-yaml \
    vim \
    less \
    && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

## Install CMake 3.28.0 from the upstream release installer
RUN wget https://github.com/Kitware/CMake/releases/download/v3.28.0/cmake-3.28.0-linux-x86_64.sh \
    && chmod +x cmake-3.28.0-linux-x86_64.sh \
    && bash ./cmake-3.28.0-linux-x86_64.sh --prefix=/usr --exclude-subdir --skip-license \
    && rm cmake-3.28.0-linux-x86_64.sh

## Set ROCm path
ENV ROCM_PATH=/opt/rocm

## Install UCX
ENV UCX_INSTALL_PREFIX=/opt/ucx
RUN wget https://github.com/openucx/ucx/releases/download/v1.16.0/ucx-1.16.0.tar.gz \
    && mkdir -p ucx \
    && tar -zxf ucx-1.16.0.tar.gz -C ucx --strip-components=1 \
    && cd ucx \
    && mkdir build \
    && cd build \
    && ../configure --prefix=${UCX_INSTALL_PREFIX} --with-rocm=${ROCM_PATH} \
    && make -j16 install \
    && cd ../.. \
    && rm -rf ucx ucx-1.16.0.tar.gz

## Install OpenMPI
ENV MPI_INSTALL_PREFIX=/opt/ompi
RUN wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.6.tar.gz \
    && mkdir -p ompi4 \
    && tar -zxf openmpi-4.1.6.tar.gz -C ompi4 --strip-components=1 \
    && cd ompi4 \
    && mkdir build \
    && cd build \
    && ../configure --prefix=${MPI_INSTALL_PREFIX} --with-ucx=${UCX_INSTALL_PREFIX} --disable-oshmem --disable-mpi-fortran --enable-orterun-prefix-by-default \
    && make -j16 install \
    && cd ../.. \
    && rm -rf ompi4 openmpi-4.1.6.tar.gz

## building RCCL
ENV RCCL_INSTALL_PREFIX=${WORKDIR}/rccl/install
RUN git clone --recurse-submodules -b "${RCCL_BRANCH}" "${RCCL_REPO}" \
    && cd ./rccl \
    && ./install.sh --amdgpu_targets=${GPU_TARGETS} --prefix=${RCCL_INSTALL_PREFIX}

## building RCCL-Tests
RUN git clone -b "${RCCL_TESTS_BRANCH}" "${RCCL_TESTS_REPO}" ./rccl-tests \
    && cd ./rccl-tests \
    && mkdir build \
    && cd build \
    && cmake -DCMAKE_BUILD_TYPE=Release -DUSE_MPI=ON -DCMAKE_PREFIX_PATH="${RCCL_INSTALL_PREFIX};${MPI_INSTALL_PREFIX}" -DGPU_TARGETS=${GPU_TARGETS} .. \
    && make -j16

## set environment variables
ENV PATH="${MPI_INSTALL_PREFIX}/bin:${ROCM_PATH}/bin:${PATH}"
ENV LD_LIBRARY_PATH="${RCCL_INSTALL_PREFIX}:${MPI_INSTALL_PREFIX}/lib:${ROCM_PATH}/lib:${LD_LIBRARY_PATH}"
ENV UCX_WARN_UNUSED_ENV_VARS=n
ENV OMPI_ALLOW_RUN_AS_ROOT=1
ENV OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1
ENV NCCL_DEBUG=VERSION

================================================
FILE: docker/README.md
================================================
# Using RCCL/RCCL-Tests in a docker environment

## Docker build

Assuming you have docker installed on your system:

### To build the docker image :

By default, the given Dockerfile uses `docker.io/rocm/dev-ubuntu-22.04:latest` as the base docker image, and then installs RCCL (develop branch) and RCCL-Tests (develop branch), targeting `gfx942` GPUs.

```shell
$ docker build -t rccl-tests -f Dockerfile.ubuntu --pull .
``` The base docker image, rccl repo, rccl-tests repo, and GPU targets can be modified using `--build-arg` in the `docker build` command above. E.g., to use a different base docker image for the MI250 GPU: ```shell $ docker build -t rccl-tests -f Dockerfile.ubuntu --build-arg="ROCM_IMAGE_NAME=rocm/dev-ubuntu-20.04" --build-arg="ROCM_IMAGE_TAG=6.2" --build-arg="GPU_TARGETS=gfx90a" --pull . ``` ### To start an interactive docker container on a system with AMD GPUs : ```shell $ docker run --rm --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --network=host --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -it rccl-tests /bin/bash ``` ### To run rccl-tests (all\_reduce\_perf) on 8 AMD GPUs (inside the docker container) : If using ROCm 6.3.x or earlier ```shell $ mpirun --allow-run-as-root -np 8 --mca pml ucx --mca btl ^openib -x NCCL_DEBUG=VERSION -x HSA_NO_SCRATCH_RECLAIM=1 /workspace/rccl-tests/build/all_reduce_perf -b 1 -e 16G -f 2 -g 1 ``` If using ROCm 6.4.0 or later ```shell $ mpirun --allow-run-as-root -np 8 --mca pml ucx --mca btl ^openib -x NCCL_DEBUG=VERSION /workspace/rccl-tests/build/all_reduce_perf -b 1 -e 16G -f 2 -g 1 ``` For more information on rccl-tests options, refer to the [Usage](https://github.com/ROCm/rccl-tests#usage) section of rccl-tests. ## Copyright All modifications are copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved. ================================================ FILE: docs/.gitignore ================================================ _build/ _doxygen/ doxygen/html doxygen/xml sphinx/_toc.yml ================================================ FILE: docs/api-reference/api-library.rst ================================================ .. meta:: :description: RCCL is a stand-alone library that provides multi-GPU and multi-node collective communication primitives optimized for AMD GPUs :keywords: RCCL, ROCm, library, API .. 
_api-library: *********** API library *********** RCCL (pronounced "Rickle") is a stand-alone library of standard collective communication routines for GPUs, implementing all-reduce, all-gather, reduce, broadcast, reduce-scatter, gather, scatter, and all-to-all. There is also initial support for direct GPU-to-GPU send and receive operations. It has been optimized to achieve high bandwidth on platforms using PCIe, xGMI as well as networking using InfiniBand Verbs or TCP/IP sockets. RCCL supports an arbitrary number of GPUs installed in a single node or multiple nodes, and can be used in either single- or multi-process (e.g., MPI) applications. The collective operations are implemented using ring and tree algorithms and have been optimized for throughput and latency. For best performance, small operations can be either batched into larger operations or aggregated through the API. Operations ========== .. doxygenindex:: ================================================ FILE: docs/api-reference/env-variables.rst ================================================ .. meta:: :description: RCCL is a stand-alone library that provides multi-GPU and multi-node collective communication primitives optimized for AMD GPUs :keywords: RCCL, ROCm, library, API, reference, environment variable, environment .. _env-variables: ******************************************************************** RCCL environment variables ******************************************************************** This section describes the most important RCCL environment variables, which are grouped by functionality. Configuration and setup ======================== The configuration and setup environment variables for RCCL are collected in the following table. .. list-table:: :header-rows: 1 :widths: 70,30 * - **Environment variable** - **Value** * - | ``NCCL_CONF_FILE`` | Specifies the path to the RCCL configuration file. 
- | String path to configuration file | Default: ``~/.rccl.conf`` or ``/etc/rccl.conf`` * - | ``NCCL_HOSTID`` | Sets the host identifier for multi-node communication. - | String value for host identification | Used for host hash generation Logging and debugging ===================== The logging and debugging environment variables for RCCL are collected in the following table. .. list-table:: :header-rows: 1 :widths: 70,30 * - **Environment variable** - **Value** * - | ``RCCL_LOG_LEVEL`` | Controls RCCL logging verbosity. - | Integer value (default: ``1``) | Higher values increase logging detail * - | ``NCCL_DEBUG_SUBSYS`` | Controls which subsystems generate debug output. - | Comma-separated list of subsystems (e.g., ``INIT,COLL``) | Prefix with ``^`` to invert selection Algorithm and protocol control ============================== The algorithm and protocol control environment variables for RCCL are collected in the following table. .. list-table:: :header-rows: 1 :widths: 70,30 * - **Environment variable** - **Value** * - | ``NCCL_ALGO`` | Forces specific algorithm selection for collectives. - | Algorithm name string | Used to override automatic algorithm selection * - | ``NCCL_PROTO`` | Forces specific protocol selection for communication. - | Protocol name string | Used to override automatic protocol selection Network and topology ==================== The network and topology environment variables for RCCL are collected in the following table. .. list-table:: :header-rows: 1 :widths: 70,30 * - **Environment variable** - **Value** * - | ``NCCL_IB_HCA`` | Specifies InfiniBand device:port to use. - | Device specification string | Prefix with ``^`` for exclusion, ``=`` for exact match * - | ``NCCL_IB_GID_INDEX`` | Defines the Global ID index used in RoCE mode. - | Integer value (default: ``-1``) | See InfiniBand ``show_gids`` command for valid values * - | ``NCCL_SOCKET_IFNAME`` | Specifies which IP interfaces to use for communication. 
- | Interface prefix string or list | Multiple prefixes separated by ``,`` | Prefix with ``^`` for exclusion, ``=`` for exact match | Example: ``eth`` (all eth interfaces), ``=eth0`` (exact match) * - | ``NCCL_SOCKET_FAMILY`` | Forces IPv4/IPv6 interface selection. - | ``AF_INET``: Force IPv4 | ``AF_INET6``: Force IPv6 | Unset: Use first available * - | ``NCCL_NET_MERGE_LEVEL`` | Controls network device merging behavior. - | Integer value specifying merge level | Default: ``PATH_PORT`` * - | ``NCCL_NET_FORCE_MERGE`` | Forces merging of network devices. - | String specifying forced merge configuration * - | ``NCCL_RINGS`` | Defines custom ring topology. - | Ring topology specification string | Overrides automatic topology detection * - | ``RCCL_TREES`` | Defines custom tree topology. - | Tree topology specification string | Alternative to ring topology * - | ``NCCL_RINGS_REMAP`` | Controls ring remapping for specific topologies. - | Remapping specification string | Used with Rome 4P2H topology Development and testing (advanced) ================================== The development and testing environment variables for RCCL are collected in the following table. These variables are primarily intended for debugging and development purposes. .. list-table:: :header-rows: 1 :widths: 70,30 * - **Environment variable** - **Value** * - | ``CUDA_LAUNCH_BLOCKING`` | Controls CUDA kernel launch blocking behavior. - | ``0``: Non-blocking launches | ``1`` or non-zero: Blocking launches * - | ``NCCL_COMM_ID`` | Enables multi-process mode in test applications. - | Any non-empty value enables multi-process mode | Used with test executables for distributed testing ================================================ FILE: docs/api-reference/library-specification.rst ================================================ .. 
meta:: :description: RCCL is a stand-alone library that provides multi-GPU and multi-node collective communication primitives optimized for AMD GPUs :keywords: RCCL, ROCm, library, API .. _library-specification: ============================ RCCL library specification ============================ This document provides details of the API library. Communicator functions ---------------------- .. doxygenfunction:: ncclGetUniqueId .. doxygenfunction:: ncclCommInitRank .. doxygenfunction:: ncclCommInitAll .. doxygenfunction:: ncclCommDestroy .. doxygenfunction:: ncclCommAbort .. doxygenfunction:: ncclCommCount .. doxygenfunction:: ncclCommCuDevice .. doxygenfunction:: ncclCommUserRank Collective communication operations ----------------------------------- Collective communication operations must be called separately for each communicator in a communicator clique. They return when operations have been enqueued on the HIP stream. Since they may perform inter-CPU synchronization, each call has to be done from a different thread or process, or must use group semantics (see below). .. doxygenfunction:: ncclReduce .. doxygenfunction:: ncclBcast .. doxygenfunction:: ncclBroadcast .. doxygenfunction:: ncclAllReduce .. doxygenfunction:: ncclReduceScatter .. doxygenfunction:: ncclAllGather .. doxygenfunction:: ncclSend .. doxygenfunction:: ncclRecv .. doxygenfunction:: ncclGather .. doxygenfunction:: ncclScatter .. doxygenfunction:: ncclAllToAll Group semantics --------------- When managing multiple GPUs from a single thread, and since NCCL collective calls may perform inter-CPU synchronization, we need to "group" calls for different ranks/devices into a single call. Grouping NCCL calls as being part of the same collective operation is done using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all collective calls until the ncclGroupEnd call, which will wait for all calls to be complete.
Note that for collective communication, ncclGroupEnd only guarantees that the operations are enqueued on the streams, not that the operation is effectively done. Both collective communication and ncclCommInitRank can be used in conjunction with ncclGroupStart/ncclGroupEnd. .. doxygenfunction:: ncclGroupStart .. doxygenfunction:: ncclGroupEnd Library functions ----------------- .. doxygenfunction:: ncclGetVersion .. doxygenfunction:: ncclGetErrorString Types ----- There are a few data structures that are internal to the library. The pointer types to these structures are given below. The user would need to use these types to create handles and pass them between different library functions. .. doxygentypedef:: ncclComm_t .. doxygenstruct:: ncclUniqueId Enumerations ------------ This section provides all the enumerations used. .. doxygenenum:: ncclResult_t .. doxygenenum:: ncclRedOp_t .. _rccl-supported-data-types: .. doxygenenum:: ncclDataType_t ================================================ FILE: docs/attributions.rst ================================================ .. meta:: :description: RCCL attributions information :keywords: RCCL, ROCm, library, API, attributions .. toctree:: :maxdepth: 4 :caption: Attributions Attributions ============ Contains contributions from NVIDIA. Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National Laboratory, the U.S. 
Department of Energy, nor the names of their contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. The U.S. Department of Energy funded the development of this software under subcontract 7078610 with Lawrence Berkeley National Laboratory. This code also includes files from the NVIDIA Tools Extension SDK project. For more information and license details, see `https://github.com/NVIDIA/NVTX <https://github.com/NVIDIA/NVTX>`_ ================================================ FILE: docs/conf.py ================================================ # Configuration file for the Sphinx documentation builder. # # This file only contains a selection of the most common options. 
For a full # list see the documentation: # https://www.sphinx-doc.org/en/master/usage/configuration.html import subprocess from rocm_docs import ROCmDocs name = "RCCL" get_major = r'sed -n -e "s/^NCCL_MAJOR.*\([0-9]\+\).*/\1/p" ../makefiles/version.mk' get_minor = r'sed -n -e "s/^NCCL_MINOR.*\([0-9]\{2,\}\).*/\1/p" ../makefiles/version.mk' get_patch = r'sed -n -e "s/^NCCL_PATCH.*\([0-9]\+\).*/\1/p" ../makefiles/version.mk' major = subprocess.getoutput(get_major) minor = subprocess.getoutput(get_minor) patch = subprocess.getoutput(get_patch) version_number = f"{major}.{minor}.{patch}" # for PDF output on Read the Docs project = f"{name} Documentation" author = "Advanced Micro Devices, Inc." copyright = "Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved." version = version_number release = version_number external_toc_path = "./sphinx/_toc.yml" docs_core = ROCmDocs(f"{name} {version_number} Documentation") docs_core.run_doxygen(doxygen_root="doxygen", doxygen_path="doxygen/xml") docs_core.setup() external_projects_current_project = "rccl" for sphinx_var in ROCmDocs.SPHINX_VARS: globals()[sphinx_var] = getattr(docs_core, sphinx_var) ================================================ FILE: docs/doxygen/Doxyfile ================================================ # Doxyfile 1.8.17 # This file describes the settings to be used by the documentation system # doxygen (www.doxygen.org) for a project. # # All text after a double hash (##) is considered a comment and is placed in # front of the TAG it is preceding. # # All text after a single hash (#) is considered a comment and will be ignored. # The format is: # TAG = value [value, ...] # For lists, items can also be appended using: # TAG += value [value, ...] # Values that contain spaces should be placed between quotes (\" \"). 
#--------------------------------------------------------------------------- # Project related configuration options #--------------------------------------------------------------------------- # This tag specifies the encoding used for all characters in the configuration # file that follow. The default is UTF-8 which is also the encoding used for all # text before the first occurrence of this tag. Doxygen uses libiconv (or the # iconv built into libc) for the transcoding. See # https://www.gnu.org/software/libiconv/ for the list of possible encodings. # The default value is: UTF-8. DOXYFILE_ENCODING = UTF-8 # The PROJECT_NAME tag is a single word (or a sequence of words surrounded by # double-quotes, unless you are using Doxywizard) that should identify the # project for which the documentation is generated. This name is used in the # title of most generated pages and in a few other places. # The default value is: My Project. PROJECT_NAME = "RCCL" # The PROJECT_NUMBER tag can be used to enter a project or revision number. This # could be handy for archiving the generated documentation or if some version # control system is used. PROJECT_NUMBER = v2.18.3 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a # quick idea about the purpose of the project. Keep the description short. PROJECT_BRIEF = "ROCm Collective Communications Library" # With the PROJECT_LOGO tag one can specify a logo or an icon that is included # in the documentation. The maximum height of the logo should not exceed 55 # pixels and the maximum width should not exceed 200 pixels. Doxygen will copy # the logo to the output directory. PROJECT_LOGO = # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path # into which the generated documentation will be written. If a relative path is # entered, it will be relative to the location where doxygen was started. 
If # left blank the current directory will be used. OUTPUT_DIRECTORY = . # If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- # directories (in 2 levels) under the output directory of each output format and # will distribute the generated files over these directories. Enabling this # option can be useful when feeding doxygen a huge amount of source files, where # putting all generated files in the same directory would otherwise cause # performance problems for the file system. # The default value is: NO. CREATE_SUBDIRS = NO # If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII # characters to appear in the names of generated files. If set to NO, non-ASCII # characters will be escaped, for example _xE3_x81_x84 will be used for Unicode # U+3044. # The default value is: NO. ALLOW_UNICODE_NAMES = NO # The OUTPUT_LANGUAGE tag is used to specify the language in which all # documentation generated by doxygen is written. Doxygen will use this # information to generate all constant output in the proper language. # Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, # Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), # Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, # Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), # Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, # Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, # Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, # Ukrainian and Vietnamese. # The default value is: English. OUTPUT_LANGUAGE = English # The OUTPUT_TEXT_DIRECTION tag is used to specify the direction in which all # documentation generated by doxygen is written. Doxygen will use this # information to generate all generated output in the proper direction. # Possible values are: None, LTR, RTL and Context. # The default value is: None.
OUTPUT_TEXT_DIRECTION = None # If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member # descriptions after the members that are listed in the file and class # documentation (similar to Javadoc). Set to NO to disable this. # The default value is: YES. BRIEF_MEMBER_DESC = YES # If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief # description of a member or function before the detailed description # # Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the # brief descriptions will be completely suppressed. # The default value is: YES. REPEAT_BRIEF = YES # This tag implements a quasi-intelligent brief description abbreviator that is # used to form the text in various listings. Each string in this list, if found # as the leading text of the brief description, will be stripped from the text # and the result, after processing the whole list, is used as the annotated # text. Otherwise, the brief description is used as-is. If left blank, the # following values are used ($name is automatically replaced with the name of # the entity):The $name class, The $name widget, The $name file, is, provides, # specifies, contains, represents, a, an and the. ABBREVIATE_BRIEF = "The $name class" \ "The $name widget" \ "The $name file" \ is \ provides \ specifies \ contains \ represents \ a \ an \ the # If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then # doxygen will generate a detailed section even if there is only a brief # description. # The default value is: NO. ALWAYS_DETAILED_SEC = YES # If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all # inherited members of a class in the documentation of that class as if those # members were ordinary class members. Constructors, destructors and assignment # operators of the base classes will not be shown. # The default value is: NO. 
INLINE_INHERITED_MEMB = NO # If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path # before files name in the file list and in the header files. If set to NO the # shortest path that makes the file name unique will be used # The default value is: YES. FULL_PATH_NAMES = NO # The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. # Stripping is only done if one of the specified strings matches the left-hand # part of the path. The tag can be used to show relative paths in the file list. # If left blank the directory from which doxygen is run is used as the path to # strip. # # Note that you can specify absolute paths here, but also relative paths, which # will be relative from the directory where doxygen is started. # This tag requires that the tag FULL_PATH_NAMES is set to YES. STRIP_FROM_PATH = # The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the # path mentioned in the documentation of a class, which tells the reader which # header file to include in order to use a class. If left blank only the name of # the header file containing the class definition is used. Otherwise one should # specify the list of include paths that are normally passed to the compiler # using the -I flag. STRIP_FROM_INC_PATH = # If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but # less readable) file names. This can be useful if your file system doesn't # support long names like on DOS, Mac, or CD-ROM. # The default value is: NO. SHORT_NAMES = NO # If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the # first line (until the first dot) of a Javadoc-style comment as the brief # description. If set to NO, the Javadoc-style will behave just like regular Qt- # style comments (thus requiring an explicit @brief command for a brief # description.) # The default value is: NO.
JAVADOC_AUTOBRIEF = NO # If the JAVADOC_BANNER tag is set to YES then doxygen will interpret a line # such as # /*************** # as being the beginning of a Javadoc-style comment "banner". If set to NO, the # Javadoc-style will behave just like regular comments and it will not be # interpreted by doxygen. # The default value is: NO. JAVADOC_BANNER = NO # If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first # line (until the first dot) of a Qt-style comment as the brief description. If # set to NO, the Qt-style will behave just like regular Qt-style comments (thus # requiring an explicit \brief command for a brief description.) # The default value is: NO. QT_AUTOBRIEF = NO # The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a # multi-line C++ special comment block (i.e. a block of //! or /// comments) as # a brief description. This used to be the default behavior. The new default is # to treat a multi-line C++ comment block as a detailed description. Set this # tag to YES if you prefer the old behavior instead. # # Note that setting this tag to YES also means that rational rose comments are # not recognized any more. # The default value is: NO. MULTILINE_CPP_IS_BRIEF = NO # If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the # documentation from any documented member that it re-implements. # The default value is: YES. INHERIT_DOCS = YES # If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new # page for each member. If set to NO, the documentation of a member will be part # of the file/class/namespace that contains it. # The default value is: NO. SEPARATE_MEMBER_PAGES = NO # The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen # uses this value to replace tabs by spaces in code fragments. # Minimum value: 1, maximum value: 16, default value: 4. 
TAB_SIZE = 4 # This tag can be used to specify a number of aliases that act as commands in # the documentation. An alias has the form: # name=value # For example adding # "sideeffect=@par Side Effects:\n" # will allow you to put the command \sideeffect (or @sideeffect) in the # documentation, which will result in a user-defined paragraph with heading # "Side Effects:". You can put \n's in the value part of an alias to insert # newlines (in the resulting output). You can put ^^ in the value part of an # alias to insert a newline as if a physical newline was in the original file. # When you need a literal { or } or , in the value part of an alias you have to # escape them by means of a backslash (\), this can lead to conflicts with the # commands \{ and \} for these it is advised to use the version @{ and @} or use # a double escape (\\{ and \\}) ALIASES = # This tag can be used to specify a number of word-keyword mappings (TCL only). # A mapping has the form "name=value". For example adding "class=itcl::class" # will allow you to use the command class in the itcl::class meaning. TCL_SUBST = # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources # only. Doxygen will then generate output that is more tailored for C. For # instance, some of the names that are used will be different. The list of all # members will be omitted, etc. # The default value is: NO. OPTIMIZE_OUTPUT_FOR_C = YES # Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or # Python sources only. Doxygen will then generate output that is more tailored # for that language. For instance, namespaces will be presented as packages, # qualified scopes will look different, etc. # The default value is: NO. OPTIMIZE_OUTPUT_JAVA = NO # Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran # sources. Doxygen will then generate output that is tailored for Fortran. # The default value is: NO. 
OPTIMIZE_FOR_FORTRAN = NO # Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL # sources. Doxygen will then generate output that is tailored for VHDL. # The default value is: NO. OPTIMIZE_OUTPUT_VHDL = NO # Set the OPTIMIZE_OUTPUT_SLICE tag to YES if your project consists of Slice # sources only. Doxygen will then generate output that is more tailored for that # language. For instance, namespaces will be presented as modules, types will be # separated into more groups, etc. # The default value is: NO. OPTIMIZE_OUTPUT_SLICE = NO # Doxygen selects the parser to use depending on the extension of the files it # parses. With this tag you can assign which parser to use for a given # extension. Doxygen has a built-in mapping, but you can override or extend it # using this tag. The format is ext=language, where ext is a file extension, and # language is one of the parsers supported by doxygen: IDL, Java, JavaScript, # Csharp (C#), C, C++, D, PHP, md (Markdown), Objective-C, Python, Slice, # Fortran (fixed format Fortran: FortranFixed, free formatted Fortran: # FortranFree, unknown formatted Fortran: Fortran. In the latter case the parser # tries to guess whether the code is fixed or free formatted code, this is the # default for Fortran type files), VHDL, tcl. For instance to make doxygen treat # .inc files as Fortran files (default is PHP), and .f files as C (default is # Fortran), use: inc=Fortran f=C. # # Note: For files without extension you can use no_extension as a placeholder. # # Note that for custom extensions you also need to set FILE_PATTERNS otherwise # the files are not read by doxygen. EXTENSION_MAPPING = in=C # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments # according to the Markdown format, which allows for more readable # documentation. See https://daringfireball.net/projects/markdown/ for details.
# The output of markdown processing is further processed by doxygen, so you can # mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in # case of backward compatibilities issues. # The default value is: YES. MARKDOWN_SUPPORT = YES # When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up # to that level are automatically included in the table of contents, even if # they do not have an id attribute. # Note: This feature currently applies only to Markdown headings. # Minimum value: 0, maximum value: 99, default value: 5. # This tag requires that the tag MARKDOWN_SUPPORT is set to YES. TOC_INCLUDE_HEADINGS = 5 # When enabled doxygen tries to link words that correspond to documented # classes, or namespaces to their corresponding documentation. Such a link can # be prevented in individual cases by putting a % sign in front of the word or # globally by setting AUTOLINK_SUPPORT to NO. # The default value is: YES. AUTOLINK_SUPPORT = YES # If you use STL classes (i.e. std::string, std::vector, etc.) but do not want # to include (a tag file for) the STL sources as input, then you should set this # tag to YES in order to let doxygen match functions declarations and # definitions whose arguments contain STL classes (e.g. func(std::string); # versus func(std::string) {}). This also make the inheritance and collaboration # diagrams that involve STL classes more complete and accurate. # The default value is: NO. BUILTIN_STL_SUPPORT = NO # If you use Microsoft's C++/CLI language, you should set this option to YES to # enable parsing support. # The default value is: NO. CPP_CLI_SUPPORT = NO # Set the SIP_SUPPORT tag to YES if your project consists of sip (see: # https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen # will parse them like normal C++ but will assume all classes use public instead # of private inheritance when no explicit protection keyword is present. # The default value is: NO. 
SIP_SUPPORT = NO # For Microsoft's IDL there are propget and propput attributes to indicate # getter and setter methods for a property. Setting this option to YES will make # doxygen to replace the get and set methods by a property in the documentation. # This will only work if the methods are indeed getting or setting a simple # type. If this is not the case, or you want to show the methods anyway, you # should set this option to NO. # The default value is: YES. IDL_PROPERTY_SUPPORT = YES # If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC # tag is set to YES then doxygen will reuse the documentation of the first # member in the group (if any) for the other members of the group. By default # all members of a group must be documented explicitly. # The default value is: NO. DISTRIBUTE_GROUP_DOC = YES # If one adds a struct or class to a group and this option is enabled, then also # any nested class or struct is added to the same group. By default this option # is disabled and one has to add nested compounds explicitly via \ingroup. # The default value is: NO. GROUP_NESTED_COMPOUNDS = NO # Set the SUBGROUPING tag to YES to allow class member groups of the same type # (for instance a group of public functions) to be put as a subgroup of that # type (e.g. under the Public Functions section). Set it to NO to prevent # subgrouping. Alternatively, this can be done per class using the # \nosubgrouping command. # The default value is: YES. SUBGROUPING = YES # When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions # are shown inside the group in which they are included (e.g. using \ingroup) # instead of on a separate page (for HTML and Man pages) or section (for LaTeX # and RTF). # # Note that this feature does not work in combination with # SEPARATE_MEMBER_PAGES. # The default value is: NO. 
INLINE_GROUPED_CLASSES = NO # When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions # with only public data fields or simple typedef fields will be shown inline in # the documentation of the scope in which they are defined (i.e. file, # namespace, or group documentation), provided this scope is documented. If set # to NO, structs, classes, and unions are shown on a separate page (for HTML and # Man pages) or section (for LaTeX and RTF). # The default value is: NO. INLINE_SIMPLE_STRUCTS = NO # When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or # enum is documented as struct, union, or enum with the name of the typedef. So # typedef struct TypeS {} TypeT, will appear in the documentation as a struct # with name TypeT. When disabled the typedef will appear as a member of a file, # namespace, or class. And the struct will be named TypeS. This can typically be # useful for C code in case the coding convention dictates that all compound # types are typedef'ed and only the typedef is referenced, never the tag name. # The default value is: NO. TYPEDEF_HIDES_STRUCT = YES # The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This # cache is used to resolve symbols given their name and scope. Since this can be # an expensive process and often the same symbol appears multiple times in the # code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small # doxygen will become slower. If the cache is too large, memory is wasted. The # cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range # is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 # symbols. At the end of a run doxygen will report the cache usage and suggest # the optimal cache size from a speed point of view. # Minimum value: 0, maximum value: 9, default value: 0. 
LOOKUP_CACHE_SIZE = 0 #--------------------------------------------------------------------------- # Build related configuration options #--------------------------------------------------------------------------- # If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in # documentation are documented, even if no documentation was available. Private # class members and static file members will be hidden unless the # EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. # Note: This will also disable the warnings about undocumented members that are # normally produced when WARNINGS is set to YES. # The default value is: NO. EXTRACT_ALL = YES # If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will # be included in the documentation. # The default value is: NO. EXTRACT_PRIVATE = NO # If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual # methods of a class will be included in the documentation. # The default value is: NO. EXTRACT_PRIV_VIRTUAL = NO # If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal # scope will be included in the documentation. # The default value is: NO. EXTRACT_PACKAGE = NO # If the EXTRACT_STATIC tag is set to YES, all static members of a file will be # included in the documentation. # The default value is: NO. EXTRACT_STATIC = NO # If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined # locally in source files will be included in the documentation. If set to NO, # only classes defined in header files are included. Does not have any effect # for Java sources. # The default value is: YES. EXTRACT_LOCAL_CLASSES = YES # This flag is only useful for Objective-C code. If set to YES, local methods, # which are defined in the implementation section but not in the interface are # included in the documentation. If set to NO, only methods in the interface are # included. # The default value is: NO. 
EXTRACT_LOCAL_METHODS = NO # If this flag is set to YES, the members of anonymous namespaces will be # extracted and appear in the documentation as a namespace called # 'anonymous_namespace{file}', where file will be replaced with the base name of # the file that contains the anonymous namespace. By default anonymous namespace # are hidden. # The default value is: NO. EXTRACT_ANON_NSPACES = NO # If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all # undocumented members inside documented classes or files. If set to NO these # members will be included in the various overviews, but no documentation # section is generated. This option has no effect if EXTRACT_ALL is enabled. # The default value is: NO. HIDE_UNDOC_MEMBERS = NO # If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all # undocumented classes that are normally visible in the class hierarchy. If set # to NO, these classes will be included in the various overviews. This option # has no effect if EXTRACT_ALL is enabled. # The default value is: NO. HIDE_UNDOC_CLASSES = NO # If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend # declarations. If set to NO, these declarations will be included in the # documentation. # The default value is: NO. HIDE_FRIEND_COMPOUNDS = NO # If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any # documentation blocks found inside the body of a function. If set to NO, these # blocks will be appended to the function's detailed documentation block. # The default value is: NO. HIDE_IN_BODY_DOCS = NO # The INTERNAL_DOCS tag determines if documentation that is typed after a # \internal command is included. If the tag is set to NO then the documentation # will be excluded. Set it to YES to include the internal documentation. # The default value is: NO. INTERNAL_DOCS = NO # If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file # names in lower-case letters. 
If set to YES, upper-case letters are also # allowed. This is useful if you have classes or files whose names only differ # in case and if your file system supports case sensitive file names. Windows # (including Cygwin) and Mac users are advised to set this option to NO. # The default value is: system dependent. CASE_SENSE_NAMES = NO # If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with # their full class and namespace scopes in the documentation. If set to YES, the # scope will be hidden. # The default value is: NO. HIDE_SCOPE_NAMES = NO # If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will # append additional text to a page's title, such as Class Reference. If set to # YES the compound reference will be hidden. # The default value is: NO. HIDE_COMPOUND_REFERENCE = NO # If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of # the files that are included by a file in the documentation of that file. # The default value is: YES. SHOW_INCLUDE_FILES = YES # If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each # grouped member an include statement to the documentation, telling the reader # which file to include in order to use the member. # The default value is: NO. SHOW_GROUPED_MEMB_INC = NO # If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include # files with double quotes in the documentation rather than with sharp brackets. # The default value is: NO. FORCE_LOCAL_INCLUDES = NO # If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the # documentation for inline members. # The default value is: YES. INLINE_INFO = YES # If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the # (detailed) documentation of file and class members alphabetically by member # name. If set to NO, the members will appear in declaration order. # The default value is: YES.
SORT_MEMBER_DOCS = YES # If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief # descriptions of file, namespace and class members alphabetically by member # name. If set to NO, the members will appear in declaration order. Note that # this will also influence the order of the classes in the class list. # The default value is: NO. SORT_BRIEF_DOCS = NO # If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the # (brief and detailed) documentation of class members so that constructors and # destructors are listed first. If set to NO the constructors will appear in the # respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. # Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief # member documentation. # Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting # detailed member documentation. # The default value is: NO. SORT_MEMBERS_CTORS_1ST = NO # If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy # of group names into alphabetical order. If set to NO the group names will # appear in their defined order. # The default value is: NO. SORT_GROUP_NAMES = NO # If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by # fully-qualified names, including namespaces. If set to NO, the class list will # be sorted only by class name, not including the namespace part. # Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. # Note: This option applies only to the class list, not to the alphabetical # list. # The default value is: NO. SORT_BY_SCOPE_NAME = NO # If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper # type resolution of all parameters of a function it will reject a match between # the prototype and the implementation of a member function even if there is # only one candidate or it is obvious which candidate to choose by doing a # simple string match. 
By disabling STRICT_PROTO_MATCHING doxygen will still # accept a match between prototype and implementation in such cases. # The default value is: NO. STRICT_PROTO_MATCHING = NO # The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo # list. This list is created by putting \todo commands in the documentation. # The default value is: YES. GENERATE_TODOLIST = YES # The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test # list. This list is created by putting \test commands in the documentation. # The default value is: YES. GENERATE_TESTLIST = YES # The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug # list. This list is created by putting \bug commands in the documentation. # The default value is: YES. GENERATE_BUGLIST = YES # The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) # the deprecated list. This list is created by putting \deprecated commands in # the documentation. # The default value is: YES. GENERATE_DEPRECATEDLIST = YES # The ENABLED_SECTIONS tag can be used to enable conditional documentation # sections, marked by \if ... \endif and \cond # ... \endcond blocks. ENABLED_SECTIONS = # The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the # initial value of a variable or macro / define can have for it to appear in the # documentation. If the initializer consists of more lines than specified here # it will be hidden. Use a value of 0 to hide initializers completely. The # appearance of the value of individual variables and macros / defines can be # controlled using \showinitializer or \hideinitializer command in the # documentation regardless of this setting. # Minimum value: 0, maximum value: 10000, default value: 30. MAX_INITIALIZER_LINES = 30 # Set the SHOW_USED_FILES tag to NO to disable the list of files generated at # the bottom of the documentation of classes and structs. 
If set to YES, the # list will mention the files that were used to generate the documentation. # The default value is: YES. SHOW_USED_FILES = YES # Set the SHOW_FILES tag to NO to disable the generation of the Files page. This # will remove the Files entry from the Quick Index and from the Folder Tree View # (if specified). # The default value is: YES. SHOW_FILES = YES # Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces # page. This will remove the Namespaces entry from the Quick Index and from the # Folder Tree View (if specified). # The default value is: YES. SHOW_NAMESPACES = YES # The FILE_VERSION_FILTER tag can be used to specify a program or script that # doxygen should invoke to get the current version for each file (typically from # the version control system). Doxygen will invoke the program by executing (via # popen()) the command command input-file, where command is the value of the # FILE_VERSION_FILTER tag, and input-file is the name of an input file provided # by doxygen. Whatever the program writes to standard output is used as the file # version. For an example see the documentation. FILE_VERSION_FILTER = # The LAYOUT_FILE tag can be used to specify a layout file which will be parsed # by doxygen. The layout file controls the global structure of the generated # output files in an output format independent way. To create the layout file # that represents doxygen's defaults, run doxygen with the -l option. You can # optionally specify a file name after the option, if omitted DoxygenLayout.xml # will be used as the name of the layout file. # # Note that if you run doxygen from a directory containing a file called # DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE # tag is left empty. LAYOUT_FILE = # The CITE_BIB_FILES tag can be used to specify one or more bib files containing # the reference definitions. This must be a list of .bib files. 
The .bib # extension is automatically appended if omitted. This requires the bibtex tool # to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info. # For LaTeX the style of the bibliography can be controlled using # LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the # search path. See also \cite for info how to create references. CITE_BIB_FILES = #--------------------------------------------------------------------------- # Configuration options related to warning and progress messages #--------------------------------------------------------------------------- # The QUIET tag can be used to turn on/off the messages that are generated to # standard output by doxygen. If QUIET is set to YES this implies that the # messages are off. # The default value is: NO. QUIET = NO # The WARNINGS tag can be used to turn on/off the warning messages that are # generated to standard error (stderr) by doxygen. If WARNINGS is set to YES # this implies that the warnings are on. # # Tip: Turn warnings on while writing the documentation. # The default value is: YES. WARNINGS = YES # If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate # warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag # will automatically be disabled. # The default value is: YES. WARN_IF_UNDOCUMENTED = YES # If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for # potential errors in the documentation, such as not documenting some parameters # in a documented function, or documenting parameters that don't exist or using # markup commands wrongly. # The default value is: YES. WARN_IF_DOC_ERROR = YES # This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that # are documented, but have no documentation for their parameters or return # value. If set to NO, doxygen will only warn about wrong or incomplete # parameter documentation, but not about the absence of documentation. 
If # EXTRACT_ALL is set to YES then this flag will automatically be disabled. # The default value is: NO. WARN_NO_PARAMDOC = YES # If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when # a warning is encountered. # The default value is: NO. WARN_AS_ERROR = YES # The WARN_FORMAT tag determines the format of the warning messages that doxygen # can produce. The string should contain the $file, $line, and $text tags, which # will be replaced by the file and line number from which the warning originated # and the warning text. Optionally the format may contain $version, which will # be replaced by the version of the file (if it could be obtained via # FILE_VERSION_FILTER) # The default value is: $file:$line: $text. WARN_FORMAT = "$file:$line: $text" # The WARN_LOGFILE tag can be used to specify a file to which warning and error # messages should be written. If left blank the output is written to standard # error (stderr). WARN_LOGFILE = #--------------------------------------------------------------------------- # Configuration options related to the input files #--------------------------------------------------------------------------- # The INPUT tag is used to specify the files and/or directories that contain # documented source files. You may enter file names like myfile.cpp or # directories like /usr/src/myproject. Separate the files or directories with # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING # Note: If this tag is empty the current directory is searched. INPUT = ../../src/nccl.h.in # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses # libiconv (or the iconv built into libc) for the transcoding. See the libiconv # documentation (see: https://www.gnu.org/software/libiconv/) for the list of # possible encodings. # The default value is: UTF-8. 
INPUT_ENCODING = UTF-8 # If the value of the INPUT tag contains directories, you can use the # FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and # *.h) to filter out the source-files in the directories. # # Note that for custom extensions or not directly supported extensions you also # need to set EXTENSION_MAPPING for the extension otherwise the files are not # read by doxygen. # # If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, # *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, # *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, # *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C comment), # *.doc (to be provided as doxygen C comment), *.txt (to be provided as doxygen # C comment), *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f, *.for, *.tcl, *.vhd, # *.vhdl, *.ucf, *.qsf and *.ice. FILE_PATTERNS = *.c \ *.cc \ *.cxx \ *.cpp \ *.c++ \ *.java \ *.ii \ *.ixx \ *.ipp \ *.i++ \ *.inl \ *.idl \ *.ddl \ *.odl \ *.h \ *.hh \ *.hxx \ *.hpp \ *.h++ \ *.cs \ *.d \ *.php \ *.php4 \ *.php5 \ *.phtml \ *.inc \ *.m \ *.markdown \ *.md \ *.mm \ *.dox \ *.py \ *.f90 \ *.f \ *.for \ *.tcl \ *.vhd \ *.vhdl \ *.ucf \ *.qsf \ *.as \ *.js # The RECURSIVE tag can be used to specify whether or not subdirectories should # be searched for input files as well. # The default value is: NO. RECURSIVE = NO # The EXCLUDE tag can be used to specify files and/or directories that should be # excluded from the INPUT source files. This way you can easily exclude a # subdirectory from a directory tree whose root is specified with the INPUT tag. # # Note that relative paths are relative to the directory from which doxygen is # run. EXCLUDE = # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded # from the input. # The default value is: NO. 
EXCLUDE_SYMLINKS = NO # If the value of the INPUT tag contains directories, you can use the # EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude # certain files from those directories. # # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories for example use the pattern */test/* EXCLUDE_PATTERNS = # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names # (namespaces, classes, functions, etc.) that should be excluded from the # output. The symbol name can be a fully qualified name, a word, or if the # wildcard * is used, a substring. Examples: ANamespace, AClass, # ANamespace::AClass, ANamespace::*Test # # Note that the wildcards are matched against symbol names, not file paths; to # exclude files or directories use the EXCLUDE or EXCLUDE_PATTERNS tag instead. EXCLUDE_SYMBOLS = # The EXAMPLE_PATH tag can be used to specify one or more files or directories # that contain example code fragments that are included (see the \include # command). EXAMPLE_PATH = # If the value of the EXAMPLE_PATH tag contains directories, you can use the # EXAMPLE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and # *.h) to filter out the source-files in the directories. If left blank all # files are included. EXAMPLE_PATTERNS = * # If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be # searched for input files to be used with the \include or \dontinclude commands # irrespective of the value of the RECURSIVE tag. # The default value is: NO. EXAMPLE_RECURSIVE = NO # The IMAGE_PATH tag can be used to specify one or more files or directories # that contain images that are to be included in the documentation (see the # \image command). IMAGE_PATH = # The INPUT_FILTER tag can be used to specify a program that doxygen should # invoke to filter for each input file.
Doxygen will invoke the filter program # by executing (via popen()) the command: # # <filter> <input-file> # # where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the # name of an input file. Doxygen will then use the output that the filter # program writes to standard output. If FILTER_PATTERNS is specified, this tag # will be ignored. # # Note that the filter must not add or remove lines; it is applied before the # code is scanned, but not when the output code is generated. If lines are added # or removed, the anchors will not be placed correctly. # # Note that for custom extensions or not directly supported extensions you also # need to set EXTENSION_MAPPING for the extension otherwise the files are not # properly processed by doxygen. INPUT_FILTER = # The FILTER_PATTERNS tag can be used to specify filters on a per file pattern # basis. Doxygen will compare the file name with each pattern and apply the # filter if there is a match. The filters are a list of the form: pattern=filter # (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how # filters are used. If the FILTER_PATTERNS tag is empty or if none of the # patterns match the file name, INPUT_FILTER is applied. # # Note that for custom extensions or not directly supported extensions you also # need to set EXTENSION_MAPPING for the extension otherwise the files are not # properly processed by doxygen. FILTER_PATTERNS = # If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using # INPUT_FILTER) will also be used to filter the input files that are used for # producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES). # The default value is: NO. FILTER_SOURCE_FILES = NO # The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file # pattern. A pattern will override the setting for FILTER_PATTERNS (if any) and # it is also possible to disable source filtering for a specific pattern using # *.ext= (so without naming a filter).
# This tag requires that the tag FILTER_SOURCE_FILES is set to YES. FILTER_SOURCE_PATTERNS = # If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that # is part of the input, its contents will be placed on the main page # (index.html). This can be useful if you have a project on for instance GitHub # and want to reuse the introduction page also for the doxygen output. USE_MDFILE_AS_MAINPAGE = #--------------------------------------------------------------------------- # Configuration options related to source browsing #--------------------------------------------------------------------------- # If the SOURCE_BROWSER tag is set to YES then a list of source files will be # generated. Documented entities will be cross-referenced with these sources. # # Note: To get rid of all source code in the generated output, make sure that # also VERBATIM_HEADERS is set to NO. # The default value is: NO. SOURCE_BROWSER = NO # Setting the INLINE_SOURCES tag to YES will include the body of functions, # classes and enums directly into the documentation. # The default value is: NO. INLINE_SOURCES = NO # Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any # special comment blocks from generated source code fragments. Normal C, C++ and # Fortran comments will always remain visible. # The default value is: YES. STRIP_CODE_COMMENTS = YES # If the REFERENCED_BY_RELATION tag is set to YES then for each documented # entity all documented functions referencing it will be listed. # The default value is: NO. REFERENCED_BY_RELATION = NO # If the REFERENCES_RELATION tag is set to YES then for each documented function # all documented entities called/used by that function will be listed. # The default value is: NO. REFERENCES_RELATION = NO # If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set # to YES then the hyperlinks from functions in REFERENCES_RELATION and # REFERENCED_BY_RELATION lists will link to the source code. 
Otherwise they will # link to the documentation. # The default value is: YES. REFERENCES_LINK_SOURCE = YES # If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the # source code will show a tooltip with additional information such as prototype, # brief description and links to the definition and documentation. Since this # will make the HTML file larger and loading of large files a bit slower, you # can opt to disable this feature. # The default value is: YES. # This tag requires that the tag SOURCE_BROWSER is set to YES. SOURCE_TOOLTIPS = YES # If the USE_HTAGS tag is set to YES then the references to source code will # point to the HTML generated by the htags(1) tool instead of doxygen built-in # source browser. The htags tool is part of GNU's global source tagging system # (see https://www.gnu.org/software/global/global.html). You will need version # 4.8.6 or higher. # # To use it do the following: # - Install the latest version of global # - Enable SOURCE_BROWSER and USE_HTAGS in the configuration file # - Make sure the INPUT points to the root of the source tree # - Run doxygen as normal # # Doxygen will invoke htags (and that will in turn invoke gtags), so these # tools must be available from the command line (i.e. in the search path). # # The result: instead of the source browser generated by doxygen, the links to # source code will now point to the output of htags. # The default value is: NO. # This tag requires that the tag SOURCE_BROWSER is set to YES. USE_HTAGS = NO # If the VERBATIM_HEADERS tag is set to YES then doxygen will generate a # verbatim copy of the header file for each class for which an include is # specified. Set to NO to disable this. # See also: Section \class. # The default value is: YES. VERBATIM_HEADERS = YES # If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the # clang parser (see: http://clang.llvm.org/) for more accurate parsing at the # cost of reduced performance.
This can be particularly helpful with template # rich C++ code for which doxygen's built-in parser lacks the necessary type # information. # Note: The availability of this option depends on whether or not doxygen was # generated with the -Duse_libclang=ON option for CMake. # The default value is: NO. CLANG_ASSISTED_PARSING = NO # If clang assisted parsing is enabled you can provide the compiler with command # line options that you would normally use when invoking the compiler. Note that # the include paths will already be set by doxygen for the files and directories # specified with INPUT and INCLUDE_PATH. # This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. CLANG_OPTIONS = # If clang assisted parsing is enabled you can provide the clang parser with the # path to the compilation database (see: # http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html) used when the files # were built. This is equivalent to specifying the "-p" option to a clang tool, # such as clang-check. These options will then be passed to the parser. # Note: The availability of this option depends on whether or not doxygen was # generated with the -Duse_libclang=ON option for CMake. CLANG_DATABASE_PATH = #--------------------------------------------------------------------------- # Configuration options related to the alphabetical class index #--------------------------------------------------------------------------- # If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all # compounds will be generated. Enable this if the project contains a lot of # classes, structs, unions or interfaces. # The default value is: YES. ALPHABETICAL_INDEX = YES # The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in # which the alphabetical index list will be split. # Minimum value: 1, maximum value: 20, default value: 5. # This tag requires that the tag ALPHABETICAL_INDEX is set to YES. 
COLS_IN_ALPHA_INDEX = 5 # In case all classes in a project start with a common prefix, all classes will # be put under the same header in the alphabetical index. The IGNORE_PREFIX tag # can be used to specify a prefix (or a list of prefixes) that should be ignored # while generating the index headers. # This tag requires that the tag ALPHABETICAL_INDEX is set to YES. IGNORE_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the HTML output #--------------------------------------------------------------------------- # If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output # The default value is: YES. GENERATE_HTML = YES # The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a # relative path is entered the value of OUTPUT_DIRECTORY will be put in front of # it. # The default directory is: html. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_OUTPUT = html # The HTML_FILE_EXTENSION tag can be used to specify the file extension for each # generated HTML page (for example: .htm, .php, .asp). # The default value is: .html. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_FILE_EXTENSION = .html # The HTML_HEADER tag can be used to specify a user-defined HTML header file for # each generated HTML page. If the tag is left blank doxygen will generate a # standard header. # # To get valid HTML the header file that includes any scripts and style sheets # that doxygen needs, which is dependent on the configuration options used (e.g. # the setting GENERATE_TREEVIEW). It is highly recommended to start with a # default header using # doxygen -w html new_header.html new_footer.html new_stylesheet.css # YourConfigFile # and then modify the file new_header.html. See also section "Doxygen usage" # for information on how to generate the default header that doxygen normally # uses. 
# Note: The header is subject to change so you typically have to regenerate the # default header when upgrading to a newer version of doxygen. For a description # of the possible markers and block names see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_HEADER = # The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each # generated HTML page. If the tag is left blank doxygen will generate a standard # footer. See HTML_HEADER for more information on how to generate a default # footer and what special commands can be used inside the footer. See also # section "Doxygen usage" for information on how to generate the default footer # that doxygen normally uses. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_FOOTER = # The HTML_STYLESHEET tag can be used to specify a user-defined cascading style # sheet that is used by each HTML page. It can be used to fine-tune the look of # the HTML output. If left blank doxygen will generate a default style sheet. # See also section "Doxygen usage" for information on how to generate the style # sheet that doxygen normally uses. # Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as # it is more robust and this tag (HTML_STYLESHEET) will in the future become # obsolete. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_STYLESHEET = # The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined # cascading style sheets that are included after the standard style sheets # created by doxygen. Using this option one can overrule certain style aspects. # This is preferred over using HTML_STYLESHEET since it does not replace the # standard style sheet and is therefore more robust against future updates. # Doxygen will copy the style sheet files to the output directory. # Note: The order of the extra style sheet files is of importance (e.g. 
the last # style sheet in the list overrules the setting of the previous ones in the # list). For an example see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_EXTRA_STYLESHEET = # The HTML_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the HTML output directory. Note # that these files will be copied to the base HTML output directory. Use the # $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these # files. In the HTML_STYLESHEET file, use the file name only. Also note that the # files will be copied as-is; there are no commands or markers available. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_EXTRA_FILES = # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen # will adjust the colors in the style sheet and background images according to # this color. Hue is specified as an angle on a colorwheel, see # https://en.wikipedia.org/wiki/Hue for more information. For instance the value # 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 # purple, and 360 is red again. # Minimum value: 0, maximum value: 359, default value: 220. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_COLORSTYLE_HUE = 220 # The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors # in the HTML output. For a value of 0 the output will use grayscales only. A # value of 255 will produce the most vivid colors. # Minimum value: 0, maximum value: 255, default value: 100. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_COLORSTYLE_SAT = 100 # The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the # luminance component of the colors in the HTML output. Values below 100 # gradually make the output lighter, whereas values above 100 make the output # darker. 
The value divided by 100 is the actual gamma applied, so 80 represents # a gamma of 0.8, the value 220 represents a gamma of 2.2, and 100 does not # change the gamma. # Minimum value: 40, maximum value: 240, default value: 80. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_COLORSTYLE_GAMMA = 80 # If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML # page will contain the date and time when the page was generated. Setting this # to YES can help to show when doxygen was last run and thus if the # documentation is up to date. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_TIMESTAMP = NO # If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML # documentation will contain a main index with vertical navigation menus that # are dynamically created via JavaScript. If disabled, the navigation index will # consist of multiple levels of tabs that are statically embedded in every HTML # page. Disable this option to support browsers that do not have JavaScript, # like the Qt help browser. # The default value is: YES. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_DYNAMIC_MENUS = YES # If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML # documentation will contain sections that can be hidden and shown after the # page has loaded. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_DYNAMIC_SECTIONS = NO # With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries # shown in the various tree structured indices initially; the user can expand # and collapse entries dynamically later on. Doxygen will expand the tree to # such a level that at most the specified number of entries are visible (unless # a fully collapsed tree already exceeds this amount). So setting the number of # entries 1 will produce a full collapsed tree by default.
0 is a special value # representing an infinite number of entries and will result in a full expanded # tree by default. # Minimum value: 0, maximum value: 9999, default value: 100. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_INDEX_NUM_ENTRIES = 100 # If the GENERATE_DOCSET tag is set to YES, additional index files will be # generated that can be used as input for Apple's Xcode 3 integrated development # environment (see: https://developer.apple.com/xcode/), introduced with OSX # 10.5 (Leopard). To create a documentation set, doxygen will generate a # Makefile in the HTML output directory. Running make will produce the docset in # that directory and running make install will install the docset in # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at # startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy # genXcode/_index.html for more information. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_DOCSET = NO # This tag determines the name of the docset feed. A documentation feed provides # an umbrella under which multiple documentation sets from a single provider # (such as a company or product suite) can be grouped. # The default value is: Doxygen generated docs. # This tag requires that the tag GENERATE_DOCSET is set to YES. DOCSET_FEEDNAME = "Doxygen generated docs" # This tag specifies a string that should uniquely identify the documentation # set bundle. This should be a reverse domain-name style string, e.g. # com.mycompany.MyDocSet. Doxygen will append .docset to the name. # The default value is: org.doxygen.Project. # This tag requires that the tag GENERATE_DOCSET is set to YES. DOCSET_BUNDLE_ID = org.doxygen.Project # The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify # the documentation publisher. This should be a reverse domain-name style # string, e.g. com.mycompany.MyDocSet.documentation. 
# The default value is: org.doxygen.Publisher. # This tag requires that the tag GENERATE_DOCSET is set to YES. DOCSET_PUBLISHER_ID = org.doxygen.Publisher # The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. # The default value is: Publisher. # This tag requires that the tag GENERATE_DOCSET is set to YES. DOCSET_PUBLISHER_NAME = Publisher # If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three # additional HTML index files: index.hhp, index.hhc, and index.hhk. The # index.hhp is a project file that can be read by Microsoft's HTML Help Workshop # (see: https://www.microsoft.com/en-us/download/details.aspx?id=21138) on # Windows. # # The HTML Help Workshop contains a compiler that can convert all HTML output # generated by doxygen into a single compiled HTML file (.chm). Compiled HTML # files are now used as the Windows 98 help format, and will replace the old # Windows help format (.hlp) on all Windows platforms in the future. Compressed # HTML files also contain an index, a table of contents, and you can search for # words in the documentation. The HTML workshop also contains a viewer for # compressed HTML files. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_HTMLHELP = NO # The CHM_FILE tag can be used to specify the file name of the resulting .chm # file. You can add a path in front of the file if the result should not be # written to the html output directory. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. CHM_FILE = # The HHC_LOCATION tag can be used to specify the location (absolute path # including file name) of the HTML help compiler (hhc.exe). If non-empty, # doxygen will try to run the HTML help compiler on the generated index.hhp. # The file has to be specified with full path. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. 
HHC_LOCATION = # The GENERATE_CHI flag controls if a separate .chi index file is generated # (YES) or that it should be included in the master .chm file (NO). # The default value is: NO. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. GENERATE_CHI = NO # The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc) # and project file content. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. CHM_INDEX_ENCODING = # The BINARY_TOC flag controls whether a binary table of contents is generated # (YES) or a normal table of contents (NO) in the .chm file. Furthermore it # enables the Previous and Next buttons. # The default value is: NO. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. BINARY_TOC = NO # The TOC_EXPAND flag can be set to YES to add extra items for group members to # the table of contents of the HTML help documentation and to the tree view. # The default value is: NO. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. TOC_EXPAND = NO # If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and # QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that # can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help # (.qch) of the generated HTML documentation. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_QHP = NO # If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify # the file name of the resulting .qch file. The path specified is relative to # the HTML output folder. # This tag requires that the tag GENERATE_QHP is set to YES. QCH_FILE = # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help # Project output. For more information please see Qt Help Project / Namespace # (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace). # The default value is: org.doxygen.Project. 
# This tag requires that the tag GENERATE_QHP is set to YES. QHP_NAMESPACE = org.doxygen.Project # The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt # Help Project output. For more information please see Qt Help Project / Virtual # Folders (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual- # folders). # The default value is: doc. # This tag requires that the tag GENERATE_QHP is set to YES. QHP_VIRTUAL_FOLDER = doc # If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom # filter to add. For more information please see Qt Help Project / Custom # Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom- # filters). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_CUST_FILTER_NAME = # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the # custom filter to add. For more information please see Qt Help Project / Custom # Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom- # filters). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_CUST_FILTER_ATTRS = # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this # project's filter section matches. Qt Help Project / Filter Attributes (see: # https://doc.qt.io/archives/qt-4.8/qthelpproject.html#filter-attributes). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_SECT_FILTER_ATTRS = # The QHG_LOCATION tag can be used to specify the location of Qt's # qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the # generated .qhp file. # This tag requires that the tag GENERATE_QHP is set to YES. QHG_LOCATION = # If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be # generated, together with the HTML files, they form an Eclipse help plugin. 
To # install this plugin and make it available under the help contents menu in # Eclipse, the contents of the directory containing the HTML and XML files needs # to be copied into the plugins directory of eclipse. The name of the directory # within the plugins directory should be the same as the ECLIPSE_DOC_ID value. # After copying Eclipse needs to be restarted before the help appears. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_ECLIPSEHELP = NO # A unique identifier for the Eclipse help plugin. When installing the plugin # the directory name containing the HTML and XML files should also have this # name. Each documentation set should have its own identifier. # The default value is: org.doxygen.Project. # This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. ECLIPSE_DOC_ID = org.doxygen.Project # If you want full control over the layout of the generated HTML pages it might # be necessary to disable the index and replace it with your own. The # DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top # of each HTML page. A value of NO enables the index and the value YES disables # it. Since the tabs in the index contain the same information as the navigation # tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. DISABLE_INDEX = NO # The GENERATE_TREEVIEW tag is used to specify whether a tree-like index # structure should be generated to display hierarchical information. If the tag # value is set to YES, a side panel will be generated containing a tree-like # index structure (just like the one that is generated for HTML Help). For this # to work a browser that supports JavaScript, DHTML, CSS and frames is required # (i.e. any modern browser). Windows users are probably better off using the # HTML help feature. 
Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can # further fine-tune the look of the index. As an example, the default style # sheet generated by doxygen has an example that shows how to put an image at # the root of the tree instead of the PROJECT_NAME. Since the tree basically has # the same information as the tab index, you could consider setting # DISABLE_INDEX to YES when enabling this option. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_TREEVIEW = NO # The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that # doxygen will group on one line in the generated HTML documentation. # # Note that a value of 0 will completely suppress the enum values from appearing # in the overview section. # Minimum value: 0, maximum value: 20, default value: 4. # This tag requires that the tag GENERATE_HTML is set to YES. # Set to 1 here (instead of the default 4) so that each enum value is listed on # its own line. ENUM_VALUES_PER_LINE = 1 # If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used # to set the initial width (in pixels) of the frame in which the tree is shown. # Minimum value: 0, maximum value: 1500, default value: 250. # This tag requires that the tag GENERATE_HTML is set to YES. TREEVIEW_WIDTH = 250 # If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to # external symbols imported via tag files in a separate window. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. EXT_LINKS_IN_WINDOW = NO # Use this tag to change the font size of LaTeX formulas included as images in # the HTML documentation. When you change the font size after a successful # doxygen run you need to manually remove any form_*.png images from the HTML # output directory to force them to be regenerated. # Minimum value: 8, maximum value: 50, default value: 10. # This tag requires that the tag GENERATE_HTML is set to YES. 
FORMULA_FONTSIZE = 10 # Use the FORMULA_TRANSPARENT tag to determine whether or not the images # generated for formulas are transparent PNGs. Transparent PNGs are not # supported properly for IE 6.0, but are supported on all modern browsers. # # Note that when changing this option you need to delete any form_*.png files in # the HTML output directory before the changes take effect. # The default value is: YES. # This tag requires that the tag GENERATE_HTML is set to YES. FORMULA_TRANSPARENT = YES # The FORMULA_MACROFILE can contain LaTeX \newcommand and \renewcommand commands # to create new LaTeX commands to be used in formulas as building blocks. See # the section "Including formulas" for details. FORMULA_MACROFILE = # Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see # https://www.mathjax.org) which uses client side JavaScript for the rendering # instead of using pre-rendered bitmaps. Use this if you do not have LaTeX # installed or if you want formulas to look prettier in the HTML output. When # enabled you may also need to install MathJax separately and configure the path # to it using the MATHJAX_RELPATH option. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. USE_MATHJAX = YES # When MathJax is enabled you can set the default output format to be used for # the MathJax output. See the MathJax site (see: # https://docs.mathjax.org/en/latest/output.html) for more details. # Possible values are: HTML-CSS (which is slower, but has the best # compatibility), NativeMML (i.e. MathML) and SVG. # The default value is: HTML-CSS. # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_FORMAT = HTML-CSS # When MathJax is enabled you need to specify the location relative to the HTML # output directory using the MATHJAX_RELPATH option. The destination directory # should contain the MathJax.js script. 
For instance, if the mathjax directory # is located at the same level as the HTML output directory, then # MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax # Content Delivery Network so you can quickly see the result without installing # MathJax. However, it is strongly recommended to install a local copy of # MathJax from https://www.mathjax.org before deployment. # The default value is: https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/. # This tag requires that the tag USE_MATHJAX is set to YES. # NOTE: the former cdn.mathjax.org CDN has been retired, so the documented # default (MathJax 2.x over https, matching MATHJAX_FORMAT = HTML-CSS) is used. MATHJAX_RELPATH = https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/ # The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax # extension names that should be enabled during MathJax rendering. For example # MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_EXTENSIONS = # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces # of code that will be used on startup of the MathJax code. See the MathJax site # (see: https://docs.mathjax.org/en/latest/output.html) for more details. For an # example see the documentation. # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_CODEFILE = # When the SEARCHENGINE tag is enabled doxygen will generate a search box for # the HTML output. The underlying search engine uses javascript and DHTML and # should work on any modern browser. Note that when using HTML help # (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET) # there is already a search function so this one should typically be disabled. # For large projects the javascript based search engine can be slow, then # enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to # search using the keyboard; to jump to the search box use <access key> + S # (what the <access key> is depends on the OS and browser, but it is typically # <CTRL>, <ALT>/<option>