Repository: eth-cscs/COSMA
Branch: master
Commit: 2df0432ce9b2
Files: 155
Total size: 2.0 MB
Directory structure:
gitextract_zu2ke5gg/
├── .clang-format
├── .gitattributes
├── .github/
│ ├── tag-issue.md
│ └── workflows/
│ └── version_checker.yml
├── .gitignore
├── .gitlab-ci.yml
├── .gitmodules
├── ATTRIBUTIONS.md
├── CMakeLists.txt
├── INSTALL.md
├── LICENSE
├── README.md
├── _config.yml
├── benchmarks/
│ ├── CMakeLists.txt
│ ├── allgather-volume.cpp
│ ├── bcast-volume.cpp
│ ├── blocking_vs_non_blocking.cpp
│ ├── dgemm_perf_model.cpp
│ ├── gpu_gemm_cublas.cpp
│ ├── gpu_gemm_libsci_acc.cpp
│ ├── reduce-scatter.cpp
│ ├── run_ubench.sh
│ ├── scalapack_transformer.cpp
│ ├── sendrecv.cpp
│ ├── transpose.cpp
│ └── ubench-allgather.cpp
├── bors.toml
├── ci/
│ ├── baseimage.cuda.Dockerfile
│ ├── build.Dockerfile
│ ├── cscs.yml
│ └── mps-wrapper.sh
├── cmake/
│ ├── FindARMPL.cmake
│ ├── FindATLAS.cmake
│ ├── FindBLIS.cmake
│ ├── FindBlas.cmake
│ ├── FindCRAY_LIBSCI.cmake
│ ├── FindFLEXIBLAS.cmake
│ ├── FindGenericBLAS.cmake
│ ├── FindMKL.cmake
│ ├── FindNCCL.cmake
│ ├── FindNVPL.cmake
│ ├── FindOPENBLAS.cmake
│ ├── FindSCALAPACK.cmake
│ ├── GitSubmodule.cmake
│ ├── adjust_mpiexec_flags.cmake
│ ├── build_type.cmake
│ ├── cosma.pc.in
│ ├── cosmaConfig.cmake.in
│ └── find_cuda_version.cmake
├── docker/
│ ├── asan/
│ │ ├── build-env.Dockerfile
│ │ └── deploy.Dockerfile
│ ├── cpu-release/
│ │ ├── build-env.Dockerfile
│ │ └── deploy.Dockerfile
│ └── gpu/
│ ├── build-env.Dockerfile
│ └── deploy.Dockerfile
├── libs/
│ └── gtest_mpi/
│ ├── CMakeLists.txt
│ ├── LICENSE
│ ├── README.md
│ ├── external/
│ │ └── gtest/
│ │ ├── CMakeLists.txt
│ │ ├── include/
│ │ │ └── gtest/
│ │ │ └── gtest.h
│ │ └── src/
│ │ └── gtest-all.cpp
│ └── include/
│ └── gtest_mpi/
│ ├── gtest_mpi.hpp
│ └── gtest_mpi_internal.hpp
├── miniapp/
│ ├── CMakeLists.txt
│ ├── cosma_miniapp.cpp
│ ├── cosma_statistics.cpp
│ ├── layout_miniapp.cpp
│ └── pxgemm_miniapp.cpp
├── scripts/
│ ├── build.sh
│ ├── daint-mc_env.sh
│ ├── install_dependencies.py
│ ├── piz_daint_cpu.sh
│ ├── piz_daint_gpu.sh
│ ├── piz_daint_gpu_aware_mpi.sh
│ ├── run_gpu.sh
│ ├── schedule_miniapp_on_daint_cpu.sh
│ ├── schedule_miniapp_on_daint_gpu.sh
│ └── schedule_tests_on_daint.sh
├── spack/
│ └── packages/
│ └── costa/
│ └── package.py
├── spack_repo/
│ └── cosma/
│ ├── packages/
│ │ ├── cosma/
│ │ │ ├── fj-ssl2.patch
│ │ │ └── package.py
│ │ └── tiled-mm/
│ │ └── package.py
│ └── repo.yaml
├── src/
│ └── cosma/
│ ├── CMakeLists.txt
│ ├── aligned_allocator.hpp
│ ├── blacs.hpp
│ ├── blas.cpp
│ ├── blas.hpp
│ ├── buffer.cpp
│ ├── buffer.hpp
│ ├── cinterface.cpp
│ ├── cinterface.hpp
│ ├── communicator.cpp
│ ├── communicator.hpp
│ ├── context.cpp
│ ├── context.hpp
│ ├── cosma_pxgemm.cpp
│ ├── cosma_pxgemm.hpp
│ ├── environment_variables.cpp
│ ├── environment_variables.hpp
│ ├── gpu/
│ │ ├── gpu_aware_mpi_utils.cpp
│ │ ├── gpu_aware_mpi_utils.hpp
│ │ ├── gpu_runtime_api.hpp
│ │ ├── nccl_mapper.hpp
│ │ ├── nccl_utils.cpp
│ │ ├── nccl_utils.hpp
│ │ └── utils.hpp
│ ├── interpose.h
│ ├── interval.cpp
│ ├── interval.hpp
│ ├── layout.cpp
│ ├── layout.hpp
│ ├── local_multiply.cpp
│ ├── local_multiply.hpp
│ ├── mapper.cpp
│ ├── mapper.hpp
│ ├── math_utils.cpp
│ ├── math_utils.hpp
│ ├── matrix.cpp
│ ├── matrix.hpp
│ ├── memory_pool.cpp
│ ├── memory_pool.hpp
│ ├── mpi_mapper.hpp
│ ├── multiply.cpp
│ ├── multiply.hpp
│ ├── one_sided_communicator.cpp
│ ├── one_sided_communicator.hpp
│ ├── pinned_buffers.cpp
│ ├── pinned_buffers.hpp
│ ├── prefixed_pxgemm.cpp
│ ├── prefixed_pxgemm.h
│ ├── profiler.hpp
│ ├── pxgemm.cpp
│ ├── pxgemm.h
│ ├── pxgemm_params.hpp
│ ├── random_generator.hpp
│ ├── scalapack.cpp
│ ├── scalapack.hpp
│ ├── statistics.hpp
│ ├── strategy.cpp
│ ├── strategy.hpp
│ ├── timer.hpp
│ ├── two_sided_communicator.cpp
│ └── two_sided_communicator.hpp
├── tests/
│ ├── CMakeLists.txt
│ ├── main_gtest.cpp
│ ├── main_gtest_mpi.cpp
│ ├── mapper.cpp
│ ├── multiply.cpp
│ ├── multiply_using_layout.cpp
│ ├── pdgemm.cpp
│ └── scalar_matmul.cpp
└── utils/
├── cosma_utils.hpp
├── parse_strategy.hpp
└── pxgemm_utils.hpp
================================================
FILE CONTENTS
================================================
================================================
FILE: .clang-format
================================================
---
# Used for all options not set in this file
BasedOnStyle: LLVM
AllowAllParametersOfDeclarationOnNextLine: false
BinPackArguments: false
BinPackParameters: false
BreakConstructorInitializersBeforeComma: true
AlwaysBreakTemplateDeclarations: Yes
IndentWidth: 4
================================================
FILE: .gitattributes
================================================
*.git export-ignore
*.github export-ignore
/ci export-ignore
/docker export-ignore
*.DS_Store export-ignore
*.gitattributes export-ignore
/*.clang-format export-ignore
*.gitignore export-ignore
/_config.yml export-ignore
/bors.toml export-ignore
*.gitmodules export-ignore
/.gitlab-ci.yml export-ignore
================================================
FILE: .github/tag-issue.md
================================================
---
title: cmake project version {{ env.CMAKE_VERSION }} does not match git tag {{ env.GIT_VERSION }}
labels: bug
---
The cmake version should be in sync with the git version to ensure the correct file names and sonames of shared libraries.
================================================
FILE: .github/workflows/version_checker.yml
================================================
# Checks that the CMake project version matches the pushed git tag and, if not,
# opens an issue from the .github/tag-issue.md template.
name: VersionChecker
on:
  push:
    tags:
      - 'v*'
jobs:
  checker:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
        with:
          # full history so `git describe --tags` can see the tags
          fetch-depth: 0
      - name: Version check
        id: check
        run: |
          mkdir build
          cd build
          # configure may fail (missing deps); we only need CMakeCache.txt
          cmake .. || true
          CMAKE_VERSION="v$(grep '^CMAKE_PROJECT_VERSION\b' CMakeCache.txt | cut -d "=" -f2)"
          GIT_VERSION=$(git describe --tags)
          if [ "$CMAKE_VERSION" != "$GIT_VERSION" ]; then
            # "::set-output" is deprecated and disabled by GitHub;
            # step outputs must be written to the GITHUB_OUTPUT file instead.
            echo "CMAKE_ISSUE=yes" >> "$GITHUB_OUTPUT"
            echo "CMAKE_VERSION=$CMAKE_VERSION" >> "$GITHUB_OUTPUT"
            echo "GIT_VERSION=$GIT_VERSION" >> "$GITHUB_OUTPUT"
          fi
      - uses: JasonEtco/create-an-issue@v2.4.0
        if: steps.check.outputs.CMAKE_ISSUE == 'yes'
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          CMAKE_VERSION: ${{ steps.check.outputs.CMAKE_VERSION }}
          GIT_VERSION: ${{ steps.check.outputs.GIT_VERSION }}
        with:
          filename: .github/tag-issue.md
================================================
FILE: .gitignore
================================================
.DS_Store*
*.swp
__pycache__
build
exports
doc
.idea*
CMakeLists.txt.user
.vscode*
================================================
FILE: .gitlab-ci.yml
================================================
include:
- remote: 'https://gitlab.com/cscs-ci/recipes/-/raw/master/templates/v2/.cscs.yml'
stages:
- build
- test
##
## BUILDS
##
.build_common:
extends: .dind
stage: build
only: ['master', 'staging', 'trying']
variables:
GIT_SUBMODULE_STRATEGY: recursive
before_script:
- docker login -u $CSCS_REGISTRY_USER -p $CSCS_REGISTRY_PASSWORD $CSCS_REGISTRY
script:
- docker build --network=host --cache-from $BUILD_IMAGE --build-arg BUILDKIT_INLINE_CACHE=1 -t $BUILD_IMAGE -f $BUILD_DOCKERFILE .
- docker push $BUILD_IMAGE
- docker build -t $DEPLOY_IMAGE --network=host --build-arg BUILDKIT_INLINE_CACHE=1 --build-arg BUILD_ENV=$BUILD_IMAGE -f $DEPLOY_DOCKERFILE .
- docker push $DEPLOY_IMAGE
# Builds a Docker image for the current commit, cpu / gpu
build sanitizer cpu:
extends: .build_common
variables:
BUILD_DOCKERFILE: docker/asan/build-env.Dockerfile
BUILD_IMAGE: $CSCS_REGISTRY_IMAGE/build-env-asan:latest
DEPLOY_DOCKERFILE: docker/asan/deploy.Dockerfile
DEPLOY_IMAGE: $CSCS_REGISTRY_IMAGE/deploy-cpu-asan:$CI_COMMIT_SHA
build cpu:
extends: .build_common
variables:
BUILD_DOCKERFILE: docker/cpu-release/build-env.Dockerfile
BUILD_IMAGE: $CSCS_REGISTRY_IMAGE/build-env-cpu:latest
DEPLOY_DOCKERFILE: docker/cpu-release/deploy.Dockerfile
DEPLOY_IMAGE: $CSCS_REGISTRY_IMAGE/deploy-cpu:$CI_COMMIT_SHA
build gpu:
extends: .build_common
variables:
BUILD_DOCKERFILE: docker/gpu/build-env.Dockerfile
BUILD_IMAGE: $CSCS_REGISTRY_IMAGE/build-env-gpu:latest
DEPLOY_DOCKERFILE: docker/gpu/deploy.Dockerfile
DEPLOY_IMAGE: $CSCS_REGISTRY_IMAGE/deploy-gpu:$CI_COMMIT_SHA
sanitize:
stage: test
only: ['master', 'staging', 'trying']
trigger:
strategy: depend
include: /ci/sanitize.yml
cpu test:
stage: test
only: ['master', 'staging', 'trying']
trigger:
strategy: depend
include: /ci/cpu.yml
gpu test:
stage: test
only: ['master', 'staging', 'trying']
trigger:
strategy: depend
include: /ci/gpu.yml
================================================
FILE: .gitmodules
================================================
[submodule "libs/Tiled-MM"]
path = libs/Tiled-MM
url = https://github.com/eth-cscs/Tiled-MM.git
[submodule "libs/COSTA"]
path = libs/COSTA
url = https://github.com/eth-cscs/COSTA
[submodule "libs/cxxopts"]
path = libs/cxxopts
url = https://github.com/jarro2783/cxxopts
================================================
FILE: ATTRIBUTIONS.md
================================================
# COSMA Attributions:
COSMA uses the following external projects:
- [COSTA](https://github.com/eth-cscs/COSTA): used for transforming between COSMA and SCALAPACK matrix data layouts and for transposing distributed matrices. Licensed under the [BSD-3-Clause License](https://github.com/eth-cscs/COSTA/blob/master/LICENSE).
- [Tiled-MM](https://github.com/eth-cscs/Tiled-MM): used for performing `dgemm` calls with the GPU-backend. Licensed under the [BSD-3-Clause License](https://github.com/eth-cscs/Tiled-MM/blob/master/LICENSE).
- [semiprof](https://github.com/bcumming/semiprof): used for profiling the code. Licensed under the [BSD-3-Clause License](https://github.com/bcumming/semiprof/blob/master/LICENSE).
- [options](https://github.com/kabicm/options): used for parsing the command line options. Licensed under the [BSD-3-Clause License](https://github.com/kabicm/options/blob/master/LICENCE).
- [cxxopts](https://github.com/jarro2783/cxxopts): used for parsing the command line options. Licensed under the [MIT License](https://github.com/jarro2783/cxxopts/blob/master/LICENSE).
- [googletest](https://github.com/google/googletest): used for unit testing. Licensed under the [BSD-3-Clause License](https://github.com/google/googletest/blob/master/LICENSE).
- [gtest_mpi](https://github.com/AdhocMan/gtest_mpi): used as a plugin for googletest adding the MPI support. Licensed under the [BSD-3-Clause License](https://github.com/AdhocMan/gtest_mpi/blob/master/LICENSE).
- [interpose](https://github.com/ccurtsinger/interpose): used for dispatching some of the pxgemm calls to SCALAPACK. Licensed under the [MIT License](https://github.com/ccurtsinger/interpose/blob/master/COPYING.md).
Most of these projects are added as submodules and can be found in the `libs` folder.
================================================
FILE: CMakeLists.txt
================================================
cmake_minimum_required(VERSION 3.24 FATAL_ERROR)
project(cosma
DESCRIPTION "Communication Optimal Matrix Multiplication"
HOMEPAGE_URL "https://github.com/eth-cscs/COSMA"
VERSION 2.8.4
LANGUAGES CXX)
include(FetchContent)
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules")
include(cmake/build_type.cmake)
include(cmake/adjust_mpiexec_flags.cmake)
set(CMAKE_EXPORT_COMPILE_COMMANDS "YES") # always write compile_commands.json
# Options
#
set(COSMA_GPU_BACKENDS_LIST "CUDA" "ROCM")
set(COSMA_SCALAPACK_LIST "OFF" "MKL" "CRAY_LIBSCI" "NVPL" "CUSTOM")
set(COSMA_BLAS_LIST "auto" "MKL" "OPENBLAS" "CRAY_LIBSCI" "NVPL" "CUSTOM" "BLIS" "ATLAS" "CUDA" "ROCM" "OFF")
option(COSMA_WITH_TESTS "Generate the test target." ON)
option(COSMA_WITH_APPS "Generate the miniapp targets." ON)
option(COSMA_WITH_BENCHMARKS "Generate the benchmark targets." ON)
option(COSMA_WITH_PROFILING "Enable profiling." OFF)
option(COSMA_WITH_NCCL "Use NCCL as communication backend." OFF)
option(COSMA_WITH_RCCL "Use RCCL as communication backend." OFF)
option(COSMA_WITH_GPU_AWARE_MPI "Use gpu-aware MPI for communication." OFF)
option(COSMA_USE_UNIFIED_MEMORY "Use unified memory when GPU acceleration is ON" OFF)
option(BUILD_SHARED_LIBS "Build shared libraries." OFF)
set(COSMA_SCALAPACK "OFF" CACHE STRING "scalapack implementation. Can be MKL, CRAY_LIBSCI, NVPL, CUSTOM or OFF.")
set(COSMA_BLAS "OFF" CACHE STRING "Blas library for computations on host or GPU")
set(COSMA_BLAS_VENDOR "OFF")
set(COSMA_GPU_BACKEND "OFF")
set_property(CACHE COSMA_SCALAPACK PROPERTY STRINGS ${COSMA_SCALAPACK_LIST})
set_property(CACHE COSMA_BLAS PROPERTY STRINGS ${COSMA_BLAS_LIST})
# we keep the old cosma behavior of indicating GPU support as a blas
# implementation. We have to sort out what we should find for the FindBLAS and
# GPU supports since they are treated as separate components
# COSMA_BLAS selects either a host BLAS vendor or a GPU backend (CUDA/ROCM);
# it must always be set explicitly.
if(COSMA_BLAS STREQUAL "OFF")
  # NVPL added to the message: it is part of COSMA_BLAS_LIST and has a
  # bundled FindNVPL.cmake module, but was missing from the error text.
  message(FATAL_ERROR "A Blas implementation is needed when running on CPU only: choices are : auto, MKL, OPENBLAS, CRAY_LIBSCI, NVPL, CUSTOM, BLIS, ATLAS, FLEXIBLAS, ARMPL, GenericBLAS, CUDA or ROCM")
endif()
if (COSMA_BLAS MATCHES "CUDA|ROCM")
  # GPU backend requested: local gemms run on the device, no host BLAS vendor.
  set(COSMA_GPU_BACKEND ${COSMA_BLAS})
  set(COSMA_BLAS_VENDOR "OFF")
else()
  # CPU path: COSMA_BLAS names the host BLAS vendor to find.
  set(COSMA_BLAS_VENDOR ${COSMA_BLAS})
  set(COSMA_GPU_BACKEND "OFF")
endif()
if ((COSMA_WITH_NCCL OR COSMA_WITH_RCCL) AND NOT COSMA_GPU_BACKEND IN_LIST COSMA_GPU_BACKENDS_LIST)
message(FATAL_ERROR "NCCL (RCCL) can only be used with the GPU backend set to CUDA (ROCM).")
endif()
if (COSMA_WITH_GPU_AWARE_MPI AND NOT COSMA_GPU_BACKEND IN_LIST COSMA_GPU_BACKENDS_LIST)
message(FATAL_ERROR "GPU-aware MPI can only be used with the GPU backend set to CUDA or ROCM.")
endif()
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE "Release")
endif()
# Dependencies
# MPI
set(MPI_DETERMINE_LIBRARY_VERSION TRUE)
find_package(MPI COMPONENTS CXX REQUIRED)
adjust_mpiexec_flags()
# check if scalapack backend is valid
message(STATUS "Selected SCALAPACK backend for COSMA: ${COSMA_SCALAPACK}")
if(NOT COSMA_SCALAPACK IN_LIST COSMA_SCALAPACK_LIST)
message(FATAL_ERROR "Invalid value for COSMA_SCALAPACK!")
endif()
# the blas targets are only defined when COSMA_SCALAPACK is ON whatever value of COSMA_GPU_BACKEND
if (NOT COSMA_SCALAPACK MATCHES "OFF")
if (COSMA_SCALAPACK MATCHES "MKL" OR COSMA_SCALAPACK MATCHES "CRAY_LIBSCI" OR COSMA_SCALAPACK MATCHES "NVPL")
set(COSMA_BLAS_VENDOR ${COSMA_SCALAPACK})
else()
set(COSMA_BLAS_VENDOR "auto")
endif()
endif()
if (NOT COSMA_BLAS_VENDOR MATCHES "OFF|CUDA|ROCM")
find_package(Blas REQUIRED)
endif()
if (NOT COSMA_SCALAPACK MATCHES "OFF")
find_package(SCALAPACK REQUIRED)
endif ()
set(COSTA_WITH_PROFILING ${COSMA_WITH_PROFILING} CACHE INTERNAL "")
set(COSTA_SCALAPACK ${COSMA_SCALAPACK} CACHE INTERNAL "")
FetchContent_Declare(
costa
GIT_REPOSITORY https://github.com/eth-cscs/costa.git
GIT_TAG 2484769535772f807d402901ffca63bb6678dd42 # v2.3.0
FIND_PACKAGE_ARGS NAMES costa
)
# the joy of fetch_content. if we build costa and cosma together
# fetch_content will pick up the FindSCALAPACK from cosma NOT costa.
if (NOT TARGET costa::scalapack::scalapack AND NOT COSMA_SCALAPACK MATCHES "OFF")
add_library(costa::scalapack::scalapack ALIAS cosma::scalapack::scalapack)
endif ()
FetchContent_MakeAvailable(costa)
# these are only GPU-backends
if (COSMA_GPU_BACKEND MATCHES "CUDA|ROCM")
set(TILEDMM_GPU_BACKEND ${COSMA_GPU_BACKEND} CACHE INTERNAL "")
FetchContent_Declare(
Tiled-MM
GIT_REPOSITORY https://github.com/eth-cscs/Tiled-MM.git
GIT_TAG 0eb75179e670a04c649b50ae5e91bb71b43e4d06 # v2.3.2
FIND_PACKAGE_ARGS NAMES tiled-MM
)
FetchContent_MakeAvailable(Tiled-MM)
if (COSMA_WITH_NCCL)
find_package(CUDAToolkit REQUIRED)
find_package(NCCL REQUIRED)
elseif (COSMA_WITH_RCCL)
find_package(hip REQUIRED)
find_package(rccl REQUIRED)
endif()
if (NOT TARGET Tiled-MM::Tiled-MM)
message("Tiled-mm target not found")
endif ()
endif()
if (COSMA_WITH_PROFILING)
FetchContent_Declare(
semiprof
GIT_REPOSITORY https://github.com/bcumming/semiprof.git
GIT_TAG f132142ff2215dfa073e416fa7911d8877d62752
FIND_PACKAGE_ARGS NAMES semiprof
)
FetchContent_MakeAvailable(semiprof)
endif ()
if (COSMA_WITH_TESTS OR COSMA_WITH_APPS)
FetchContent_Declare(
cxxopts
GIT_REPOSITORY https://github.com/jarro2783/cxxopts.git
GIT_TAG 4bf61f08697b110d9e3991864650a405b3dd515d # v3.2.1
FIND_PACKAGE_ARGS NAMES cxxopts
)
FetchContent_MakeAvailable(cxxopts)
endif()
# NOTE: an unconditional duplicate of the Tiled-MM target check used to live
# here. The Tiled-MM target only exists (and is already checked) when a GPU
# backend is enabled, so the unconditional copy merely printed a spurious
# "Tiled-mm target not found" message on CPU-only builds; it has been removed.
# preserve rpaths when installing and make the install folder relocatable
# use `CMAKE_SKIP_INSTALL_RPATH` to skip this
# https://spack.readthedocs.io/en/latest/workflows.html#write-the-cmake-build
list(FIND CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES
"${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}" isSystemDir)
# skip RPATH if COSMA is installed to system directories
if(isSystemDir STREQUAL "-1")
set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
if(APPLE)
set(basePoint @loader_path)
else()
set(basePoint $ORIGIN)
endif()
file(RELATIVE_PATH relDir ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_INSTALL_BINDIR}
${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_INSTALL_LIBDIR})
set(CMAKE_INSTALL_RPATH ${basePoint} ${basePoint}/${relDir})
endif()
# COSMA
#
include(CMakePackageConfigHelpers)
include(GNUInstallDirs)
add_subdirectory(src/cosma)
install(DIRECTORY "${cosma_SOURCE_DIR}/src/cosma"
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
FILES_MATCHING
PATTERN "*.hpp")
# Generate the CMake package files (config + version) and the pkg-config file.
# The version file was previously generated twice with identical arguments;
# a single invocation suffices.
write_basic_package_version_file(
  "${cosma_BINARY_DIR}/cosmaConfigVersion.cmake"
  VERSION "${cosma_VERSION}"
  COMPATIBILITY SameMajorVersion)
configure_file("${cosma_SOURCE_DIR}/cmake/cosma.pc.in"
               "${cosma_BINARY_DIR}/cosma.pc"
               @ONLY)
configure_file("${cosma_SOURCE_DIR}/cmake/cosmaConfig.cmake.in"
               "${cosma_BINARY_DIR}/cosmaConfig.cmake"
               @ONLY)
# Install the package files together with the bundled Find modules that
# cosmaConfig.cmake may need when COSMA is consumed from the install tree.
# (cosmaConfigVersion.cmake was previously listed twice; deduplicated.)
install(FILES "${cosma_BINARY_DIR}/cosmaConfig.cmake"
              "${cosma_BINARY_DIR}/cosmaConfigVersion.cmake"
              "${cosma_SOURCE_DIR}/cmake/FindMKL.cmake"
              "${cosma_SOURCE_DIR}/cmake/FindNVPL.cmake"
              "${cosma_SOURCE_DIR}/cmake/FindBlas.cmake"
              "${cosma_SOURCE_DIR}/cmake/FindSCALAPACK.cmake"
              "${cosma_SOURCE_DIR}/cmake/FindOPENBLAS.cmake"
              "${cosma_SOURCE_DIR}/cmake/FindFLEXIBLAS.cmake"
              "${cosma_SOURCE_DIR}/cmake/FindARMPL.cmake"
              "${cosma_SOURCE_DIR}/cmake/FindATLAS.cmake"
              "${cosma_SOURCE_DIR}/cmake/FindCRAY_LIBSCI.cmake"
              "${cosma_SOURCE_DIR}/cmake/FindGenericBLAS.cmake"
              "${cosma_SOURCE_DIR}/cmake/FindNCCL.cmake"
              "${cosma_SOURCE_DIR}/cmake/FindBLIS.cmake"
        DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/cosma")
install(FILES "${cosma_BINARY_DIR}/cosma.pc"
        DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
if(COSMA_WITH_TESTS)
add_subdirectory(libs/gtest_mpi)
enable_testing()
add_subdirectory(tests)
endif()
if(COSMA_WITH_APPS)
add_subdirectory(miniapp)
endif()
if(COSMA_WITH_BENCHMARKS AND NOT COSMA_BLAS MATCHES "OPENBLAS")
add_subdirectory(benchmarks)
endif()
================================================
FILE: INSTALL.md
================================================
## Building COSMA
To build COSMA, do the following steps:
```bash
# clone the repository
git clone --recursive https://github.com/eth-cscs/COSMA.git
cd COSMA
# create a build directory within COSMA
mkdir build
cd build
# set up the compiler, e.g. with:
export CC=`which cc`
export CXX=`which CC`
# Choose which BLAS and SCALAPACK backends to use (e.g. MKL)
cmake -DCOSMA_BLAS=MKL -DCOSMA_SCALAPACK=MKL ..
# compile
make -j 8
```
> !! Note the *--recursive* flag !!
Other important options that can be passed to `cmake` are the following:
- `COSMA_BLAS` (required): `MKL`, `OPENBLAS`, `CRAY_LIBSCI`, `CUSTOM`, `CUDA` or `ROCM`. Determines which backend will be used for the local matrix multiplication calls.
- `COSMA_SCALAPACK:` OFF (default), `MKL`, `CRAY_LIBSCI`, `CUSTOM`. If specified, `COSMA` will also provide ScaLAPACK wrappers, thus offering `pdgemm`, `psgemm`, `pzgemm` and `pcgemm` functions, which completely match the ScaLAPACK API.
## Building COSMA on Multi-GPU Systems
COSMA can take advantage of fast GPU-to-GPU interconnects like NV-Links, through the use of the following:
- NCCL library (for NVIDIA GPUs), i.e. RCCL library (for AMD GPUs): when `-DCOSMA_WITH_NCCL=ON`, i.e. `-DCOSMA_WITH_RCCL=ON` is specified in `cmake`, all the collective communication is performed through these libraries, which can utilize fast gpu-to-gpu interconnects.
- GPU-aware MPI: when `-DCOSMA_WITH_GPU_AWARE_MPI=ON` is specified in `cmake`, cuda-aware MPI for NVIDIA GPUs (i.e. rocm-aware MPI for AMD GPUs) will be used for collective communication. The user must make sure that the gpu-aware MPI is enabled. For example, on Cray-systems, this can be done by setting the following environment variables:
- `export MPICH_RDMA_ENABLED_CUDA=1`
- `export MPICH_GPU_SUPPORT_ENABLED=1`
## Building COSMA on Cray Systems
There are already prepared scripts for loading the necessary dependencies for COSMA on Cray-Systems:
- `Cray XC40` (CPU-only version): `source ./scripts/piz_daint_cpu.sh` loads `MKL` and other necessary modules.
- `Cray XC50` (Hybrid version): `source ./scripts/piz_daint_gpu.sh` loads `cublas` and other necessary modules.
After the right modules are loaded, the instructions from the beginning of this file can be followed.
## Installing COSMA
To install do `make install`.
> !! Note: To set custom installation directory use `CMAKE_INSTALL_PREFIX` when building.
COSMA is CMake friendly and provides a cosmaConfig.cmake module for easy
integration into 3rd-party CMake projects with
```
find_package(cosma REQUIRED)
target_link_libraries( ... cosma::cosma)
```
COSMA's dependencies are taken care of internally, nothing else needs to be
linked. Make sure to set `CMAKE_INSTALL_PREFIX` to COSMA's installation directory
when building.
There is rudimentary pkg-config support; dependencies are handled explicitly by
consumers.
# Installing COSMA with Spack
- with OpenBLAS back end: `spack install cosma`
- with MKL back end: `spack install cosma ^mkl`
- with GPU back end: `spack install cosma +cuda`
- with Netlib LAPACK: `spack install cosma ^netlib-lapack`
- with MKL ScaLAPACK: `spack install cosma +scalapack ^mkl`
Notes:
- By default Spack builds in release mode with debug information included (-O2
-g). To build with -O3, add `build_type=Release` to the command line.
- By default Spack selects openmpi as the MPI implementation, to select MPICH,
add `^mpich`
For more information on Spack: [Spack 101 Tutorial](https://spack.readthedocs.io/en/latest/tutorial.html).
## Docker
COSMA can be installed into a Docker container in the following way:
```
docker build -f docker/gpu/build-env.Dockerfile -t cosma-build-env .
docker build --build-arg BUILD_ENV=cosma-build-env -f docker/gpu/deploy.Dockerfile -t cosma .
```
Then the `cosma` container can be deployed for testing:
```
docker run --rm -it -v (pwd):(pwd) --gpus all cosma
```
================================================
FILE: LICENSE
================================================
BSD 3-Clause License
Copyright (c) 2018, ETH Zürich.
Copyright (c) 2021, Advanced Micro Devices, Inc.
Copyright (c) 2018-2022, ETH Zürich.
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its contributors
may be used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
================================================
FILE: README.md
================================================
[](https://gitlab.com/cscs-ci/eth-cscs/COSMA/-/commits/master)

## Table of Contents
- [Overview](#overview)
- [COSMA Literature](#cosma-literature)
- [Features](#features)
- [Building COSMA](#building-cosma)
- [COSMA Dependencies](#cosma-dependencies)
- [Using COSMA](#using-cosma)
- [30 seconds Tutorial](#using-cosma-in-30-seconds)
- [COSMA on Multi-GPU Systems](#cosma-on-multi-gpu-systems)
- [Using NCCL/RCCL Libraries](#using-ncclrccl-libraries)
- [Using GPU-aware MPI](#using-gpu-aware-mpi)
- [COSMA in production](#cosma-in-production)
- [CP2K](#cp2k)
- [Julia language](#julia-language)
- [Examples - Miniapps](#miniapps)
- [Matrix Multiplication with COSMA](#matrix-multiplication)
- [COSMA pxgemm wrapper](#cosma-pxgemm-wrapper)
- [Tunable Parameters](#tunable-parameters)
- [Parameters Overview](#parameters-overview)
- [Controlling GPU memory](#controlling-gpu-memory)
- [Controlling CPU memory](#controlling-cpu-memory)
- [Performance Profiling](#profiling)
- [Authors](#authors)
- [Questions?](#questions)
- [Acknowledgements](#acknowledgements)
## Overview
COSMA is a parallel, high-performance, GPU-accelerated, matrix-matrix multiplication algorithm that is communication-optimal for all combinations of matrix dimensions, number of processors and memory sizes, without the need for any parameter tuning. The key idea behind COSMA is to first derive a tight optimal sequential schedule and only then parallelize it, preserving I/O optimality between processes. This stands in contrast with the 2D and 3D algorithms, which fix process domain decomposition upfront and then map it to the matrix dimensions, which may result in asymptotically more communication. The final design of COSMA facilitates the overlap of computation and communication, ensuring speedups and applicability of modern mechanisms such as RDMA. COSMA allows to not utilize some processors in order to optimize the processor grid, which reduces the communication volume even further and increases the computation volume per processor.
COSMA got the **Best Student Paper Award** at the prestigious **Supercomputing 2019** conference in Denver, US.
COSMA alleviates the issues of current state-of-the-art algorithms, which can be summarized as follows:
- `2D (SUMMA)`: Requires manual tuning and not communication-optimal in the presence of extra memory.
- `2.5D`: Optimal for `m=n`, but inefficient for `m << n` or `n << m` and for some numbers of processes `p`.
- `Recursive (CARMA)`: Asymptotically communication-optimal for all `m, n, k, p`, but splitting always the largest dimension might lead up to `√3` increase in communication volume.
- `COSMA (this work)`: Strictly communication-optimal (not just asymptotically) for all `m, n, k, p` and memory sizes that yields the speedups by factor of up to 8.3x over the second-fastest algorithm.
In addition to being communication-optimal, this implementation is highly optimized to reduce the memory footprint in the following sense:
- `Buffer Reuse`: all the buffers are pre-allocated and carefully reused during execution, including the buffers necessary for the communication, which reduces the total memory usage.
- `Reduced Local Data Movement`: the assignment of data blocks to processes is fully adapted to communication pattern, which minimizes the need of local data reshuffling that arise after each communication step.
The library supports both one-sided and two-sided MPI communication backends. It uses `dgemm` for the local computations, but also has a support for the `GPU` acceleration through our `Tiled-MM` library using `cublas` or `rocBLAS`.
## COSMA Literature
The paper and other materials on COSMA are available under the following link:
- **ACM Digital Library (Best Student Paper Award at SC19):** https://dl.acm.org/doi/10.1145/3295500.3356181
- **Arxiv:** https://arxiv.org/abs/1908.09606
- **YouTube Presentation:** https://www.youtube.com/watch?v=5wiZWw5ltR0
- **Press Release:** https://www.cscs.ch/science/computer-science-hpc/2019/new-matrix-multiplication-algorithm-pushes-the-performance-to-the-limits/
## Features
- **[NEW] Multi-GPU Systems Support:** COSMA is now able to take advantage of fast GPU-to-GPU interconnects either through the use of NCCL/RCCL libraries or by using the GPU-aware MPI. Both, NVIDIA and AMD GPUs are supported.
- **ScaLAPACK API Support:** it is enough to link to COSMA, without changing the code and all `p?gemm` calls will use ScaLAPACK wrappers provided by COSMA.
- **C/Fortran Interface:** written in `C++`, but provides `C` and `Fortran` interfaces.
- **Custom Types:** fully templatized types.
- **GPU acceleration:** supports both **NVIDIA** and **AMD** GPUs.
- **Supported BLAS (CPU) backends:** MKL, LibSci, NETLIB, BLIS, ATLAS.
- **Custom Data Layout Support:** natively uses its own blocked data layout of matrices, but supports arbitrary grid-like data layout of matrices.
- **Tranposition/Conjugation Support:** matrices `A` and `B` can be transposed and/or conjugated.
- **Communication and Computation Overlap:** supports overlapping of communication and computation.
- **Spack Installation:** can be built and installed with `Spack` since v14.1
- **Julia Package:** see https://github.com/haampie/COSMA.jl/ on how to use COSMA in the Julia language.
## Building COSMA
See [Installation Instructions](INSTALL.md).
## COSMA Dependencies
COSMA is a CMake project and requires a recent CMake (>= 3.24).
External dependencies:
- `MPI 3`: (required)
- `BLAS`: when the problem becomes local, COSMA uses provided `?gemm` backend, which can be one of the following:
- `MKL` (default)
- `OPENBLAS`
- `BLIS`
- `ATLAS`
- `CRAY_LIBSCI`: `Cray-libsci` or `Cray-libsci_acc` (GPU-accelerated)
- `CUDA`: `cublas` is used for NVIDIA GPUs
- `ROCM`: `rocBLAS` is used for AMD GPUs
- `CUSTOM`: user-provided BLAS API
Some dependencies are bundled as submodules and need not be installed explicitly:
- `TiledMM` - cublasXt GEMM replacement, that is also ported to AMD GPUs.
- `COSTA` - distributed matrix reshuffle and transpose algorithm.
- `semiprof` - profiling utility
- `gtest_mpi` - MPI utility wrapper over GoogleTest (unit testing library)
## Using COSMA
To allow easy integration, COSMA can be used in the following ways:
- **without changing your code:** if your code already uses the `ScaLAPACK API`, then you can just link to COSMA, before linking to any other library providing `pxgemm` and all `pxgemm` calls will be using COSMA, without the need to change your code at all. To get a feeling of the performance you can expect to get, please have a look at the [pdgemm miniapp](#cosma-pdgemm-wrapper). To see how you can link your code to COSMA `pxgemm`, have a look at the [30 seconds tutorial](#using-cosma-in-30-seconds) on how to do this. In this way, we integrated COSMA into CP2K quantum chemistry simulator, which you can read more about in the [production example](#cosma-in-production).
- **adapting your code:** if your code is not using ScaLAPACK, then there are two interfaces that can be used:
- **custom layout:** if your matrices are distributed in a custom way, then it is enough to pass the descriptors of your data layout to `multiply_using_layout` function, which will then adapt COSMA to your own layout.
- **native COSMA layout:** to get the maximum performance, the native COSMA matrix layout should be used. To get an idea of the performance you can expect to get, please have a look at the [matrix multiplication miniapp](#matrix-multiplication).
The documentation for the latter option will soon be published here.
## Using COSMA in 30 seconds
For easy integration, it is enough to build COSMA with ScaLAPACK API and then link your code to COSMA before linking to any other library providing ScaLAPACK `pxgemm`. This way, all `pxgemm` calls will be using COSMA `pxgemm` wrappers. To achieve this, please follow these steps:
1) Build COSMA with ScaLAPACK API:
```bash
###############
# get COSMA
###############
git clone --recursive https://github.com/eth-cscs/COSMA cosma && cd cosma
##############################
# build and install COSMA
##############################
mkdir build && cd build
# set up the compiler, e.g. with:
export CC=`which cc`
export CXX=`which CC`
# choose BLAS and SCALAPACK versions you want to use
# COSMA_BLAS can be: MKL, OpenBLAS, CRAY_LIBSCI, CUDA, ROCM, CUSTOM
# COSMA_SCALAPACK can be MKL, CRAY_LIBSCI, CUSTOM
cmake -DCOSMA_BLAS=CUDA -DCOSMA_SCALAPACK=MKL -DCMAKE_INSTALL_PREFIX=/cosma ..
make -j 8
make install
```
> !! Note the *--recursive* flag !!
2) Link your code to COSMA:
- **CPU-only** version of COSMA:
- link your code to:
> -L/cosma/lib64 -lcosma_pxgemm -lcosma -lcosta_scalapack
- then link to the BLAS and ScaLAPACK you built COSMA with (see `COSMA_BLAS` and `COSMA_SCALAPACK` flags in cmake):
> -L${MKLROOT}/lib/intel64 -Wl,--no-as-needed -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lmkl_blacs_intelmpi_lp64 -lgomp -lpthread -lm
- using **GPU-accelerated** version of COSMA:
- link your code to:
>-L/cosma/lib64 -lcosma_pxgemm -lcosma -lcosta_scalapack -lTiled-MM
- link to the GPU backend you built COSMA with (see `COSMA_BLAS` flag in cmake):
>-lcublas -lcudart -lrt
- then link to the ScaLAPACK you built COSMA with (see `COSMA_SCALAPACK` flag in cmake):
>-L${MKLROOT}/lib/intel64 -Wl,--no-as-needed -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lmkl_blacs_intelmpi_lp64 -lgomp -lpthread -lm
3) Include headers:
>-I/cosma/include
## COSMA on Multi-GPU Systems
COSMA is able to take advantage of fast GPU-to-GPU interconnects on multi-gpu systems. This can be achieved in one of the following ways.
### Using `NCCL/RCCL` Libraries
When running `cmake` for COSMA, make sure to specify `-DCOSMA_WITH_NCCL=ON`, e.g. by doing:
```bash
# NVIDIA GPUs
# this will look for the NCCL library in the following environment variables:
# - NCCL_ROOT: Base directory where all NCCL components are found
# - NCCL_INCLUDE_DIR: Directory where NCCL header is found
# - NCCL_LIB_DIR: Directory where NCCL library is found
cmake -DCOSMA_BLAS=CUDA -DCOSMA_SCALAPACK=MKL -DCOSMA_WITH_NCCL=ON ..
# AMD GPUs
# this will look for the RCCL library in the following environment variables:
# - RCCL_ROOT_DIR: Base directory where all RCCL components are found
# - RCCL_INCLUDE_DIR: Directory where RCCL header is found
# - RCCL_LIB_DIR: Directory where RCCL library is found
cmake -DCOSMA_BLAS=ROCM -DCOSMA_SCALAPACK=MKL -DCOSMA_WITH_NCCL=ON ..
```
### Using GPU-aware MPI
When running `cmake` for COSMA, make sure that GPU-aware MPI is enabled in your environment and specify `-DCOSMA_WITH_GPU_AWARE_MPI=ON` when running cmake for COSMA, e.g. by doing:
```bash
# Before running cmake, make sure that GPU-aware MPI is enabled on your system.
# For example, on Cray-systems, this can be done by setting the following environment variables:
# - export MPICH_RDMA_ENABLED_CUDA=1
# - export MPICH_GPU_SUPPORT_ENABLED=1
cmake -DCOSMA_BLAS=CUDA -DCOSMA_SCALAPACK=MKL -DCOSMA_WITH_GPU_AWARE_MPI=ON ..
```
## COSMA in Production
### CP2K
COSMA is integrated into the [CP2K](https://www.cp2k.org) quantum chemistry simulator. Since COSMA provides the ScaLAPACK API, it is enough to link CP2K to COSMA, without changing CP2K code at all, which makes the integration trivial even if (as in the case of CP2K) the simulation code is written in Fortran.
In the production run, we ran *Random-Phase Approximation (RPA)* benchmark of 128 water molecules, using the *Resolution of Identity (RI)*. The benchmark was run once on 1024 and once on 128 nodes of the GPU partition on [Piz Daint supercomputer](https://www.cscs.ch/computers/piz-daint/) (Cray XC50). Computationally, the most dominant part of this benchmark consists of 46 **tall-and-skinny** dense matrix multiplications, with the parameters shown in the table below:

On **1024 nodes**, we compared the performance of CP2K using `COSMA` and `Cray-libsci_acc` (version: 19.10.1), both being GPU accelerated, for all dense matrix-matrix multiplications (`pdgemm` routine). As can be seen in the following table, the version with COSMA was approximately **2x faster**.

On **128 nodes**, we compared the performance of CP2K using the following algorithms for multiplying matrices (`pdgemm` routine): `MKL` (version: 19.0.1.144), `Cray-libsci` (version: 19.06.1), `Cray-libsci_acc` (version: 19.10.1, GPU accelerated) and `COSMA` (both CPU-only and GPU-accelerated versions) libraries. The version with COSMA was the fastest on both CPU and GPU. The CPU version of COSMA achieved the peak performance, whereas the GPU version achieved more than 65\% of the peak performance of GPUs. Keep in mind that the peak performance of GPUs assumes the data is already residing on GPUs which is not the case here, since matrices were initially residing on CPU. This is one of the reasons why the peak performance is not achieved with the GPU version. Still, the GPU version of COSMA was **25-27\%** faster than the second best in this case. The results are summarized in the following table:

With COSMA, even higher speedups are possible, depending on matrix shapes. To illustrate possible performance gains, we also ran different **square matrix** multiplications on the same number of nodes (**=128**) of [Piz Daint supercomputer](https://www.cscs.ch/computers/piz-daint/). The block size is `128x128` and the processor grid is also square: `16x16` (2 ranks per node). The performance of COSMA is compared against Intel MKL ScaLAPACK (version: 19.0.1.144). The results on Cray XC50 (GPU-accelerated) and Cray XC40 (CPU-only) are summarized in the following table:

All the results from this section assumed matrices given in (block-cyclic) ScaLAPACK data layout. However, if the native COSMA layout is used, even higher throughput is possible.
### Julia language
The [COSMA.jl](https://github.com/haampie/COSMA.jl/) Julia package uses COSMA's C-interface to provide COSMA-based matrix-matrix multiplication for the [DistributedArrays.jl](https://github.com/JuliaParallel/DistributedArrays.jl/) package. A minimal working example to multiply two random matrices looks as follows:
```julia
using MPIClusterManager, DistributedArrays, Distributed
manager = MPIManager(np = 6)
addprocs(manager)
COSMA.use_manager(manager)
@everywhere using COSMA
A = drand(8000, 8000) * drand(8000, 8000)
```
## Miniapps
```bash
# for CPU-only version
sbatch schedule_miniapp_on_daint_cpu.sh
# for Hybrid (CPU+GPU) version
sbatch schedule_miniapp_on_daint_gpu.sh
```
The script will use SLURM to submit a job on 10 nodes. The job will run 2 matrix
multiplications and output the time COSMA algorithm took.
### Matrix Multiplication
The project contains a miniapp that produces two random matrices `A` and `B`,
computes their product `C` with the COSMA algorithm and outputs the time of the
multiplication.
The miniapp consists of an executable `./build/miniapp/cosma_miniapp` which can
be run with the following command line (assuming we are in the root folder of
the project):
```bash
# set the number of threads to be used by each MPI rank
export OMP_NUM_THREADS=18
# if using CPU version with MKL backend, set MKL_NUM_THREADS as well
export MKL_NUM_THREADS=18
# run the miniapp
mpirun -np 4 ./build/miniapp/cosma_miniapp -m 1000 -n 1000 -k 1000 -r 2
```
The overview of all supported options is given below:
- `-m (--m_dim)` (default: `1000`): number of rows of matrices `A` and `C`.
- `-n (--n_dim)` (default: `1000`): number of columns of matrices `B` and `C`.
- `-k (--k_dim)` (default: `1000`): number of columns of matrix `A` and rows of matrix `B`.
- `-s (--steps)` (optional): string of triplets divided by comma defining the
splitting strategy. Each triplet defines one step of the algorithm. The first
character in the triplet defines whether it is a parallel (p) or a sequential
(s) step. The second character defines the dimension that is split in this
step. The third parameter is an integer which defines the divisor. This
parameter can be omitted. In that case the default strategy will be used. An example of a possible value for the upper example: `--steps=sm2,pn2,pk2`.
- `-r (--n_rep)` (optional, default: `2`): the number of repetitions.
- `-t (--type)` (optional, default: `double`): data type of matrix entries. Can be one of: `float`, `double`, `zfloat` and `zdouble`. The last two correspond to complex numbers.
- `--test` (optional): if present, the result of COSMA will be verified with the result of the available SCALAPACK.
- `-h (--help) (optional)`: print available options.
### COSMA pxgemm wrapper
COSMA also contains a wrapper for ScaLAPACK `pxgemm` calls which offers scalapack interface (pxgemm functions with exactly the same signatures as ScaLAPACK). Running these functions will take care of transforming the matrices between ScaLAPACK and COSMA data layout, perform the multiplication using COSMA algorithm and transform the result back to the specified ScaLAPACK data layout.
The miniapp consists of an executable `./build/miniapp/pxgemm_miniapp` which can be run as follows (assuming we are in the root folder of the project):
```bash
# set the number of threads to be used by each MPI rank
export OMP_NUM_THREADS=18
# if using CPU version with MKL backend, set MKL_NUM_THREADS as well
export MKL_NUM_THREADS=18
# run the miniapp
mpirun -np 4 ./build/miniapp/pxgemm_miniapp -m 1000 -n 1000 -k 1000 \
--block_a=128,128 \
--block_b=128,128 \
--block_c=128,128 \
--p_grid=2,2 \
--transpose=NN \
--type=double \
--algorithm=cosma
```
The overview of all supported options is given below:
- `-m (--m_dim)` (default: `1000`): number of rows of matrices `A` and `C`.
- `-n (--n_dim)` (default: `1000`): number of columns of matrices `B` and `C`.
- `-k (--k_dim)` (default: `1000`): number of columns of matrix `A` and rows of matrix `B`.
- `--block_a` (optional, default: `128,128`): 2D-block size for matrix A.
- `--block_b` (optional, default `128,128`): 2D-block size for matrix B.
- `--block_c` (optional, default `128,128`): 2D-block size for matrix C.
- `-p (--p_grid)` (optional, default: `1,P`): 2D-processor grid. By default `1xP` where `P` is the total number of MPI ranks.
- `--transpose` (optional, default: `NN`): transpose/conjugate flags to A and B.
- `--alpha` (optional, default: 1): alpha parameter in `C = alpha*A*B + beta*C`.
- `--beta` (optional, default: 0): beta parameter in `C = alpha*A*B + beta*C`.
- `-r (--n_rep)` (optional, default: 2): number of repetitions.
- `-t (--type)` (optional, default: `double`): data type of matrix entries. Can be one of: `float`, `double`, `zfloat` and `zdouble`. The last two correspond to complex numbers.
- `--test` (optional): if present, the result of COSMA will be verified with the result of the available SCALAPACK.
- `--algorithm` (optional, default: `both`): defines which algorithm (`cosma`, `scalapack` or `both`) to run.
- `-h (--help) (optional)`: print available options.
## Tunable Parameters
### Parameters Overview
The overview of tunable parameters, that can be set through environment variables is given in the table below. The default values are given in **bold**.
ENVIRONMENT VARIABLE | POSSIBLE VALUES | DESCRIPTION
| :------------------- | :------------------- |:------------------- |
`COSMA_OVERLAP_COMM_AND_COMP` | ON, **OFF** | If enabled, communication and computation might be overlapped, depending on the built-in heuristics.
`COSMA_ADAPT_STRATEGY` | **ON**, OFF | If enabled, COSMA will try to natively use the scalapack layout, without transforming to the COSMA layout. Used only in the pxgemm wrapper.
`COSMA_CPU_MAX_MEMORY` | integer (`size_t`), by default: **infinite** | CPU memory limit in megabytes per MPI process (rank). Allowing too little memory might reduce the performance.
`COSMA_GPU_MEMORY_PINNING` | **ON**, OFF | If enabled, COSMA will pin parts of the host memory to speed up CPU-GPU memory transfers. Used only in the GPU backend.
`COSMA_GPU_MAX_TILE_M`, `COSMA_GPU_MAX_TILE_N`, `COSMA_GPU_MAX_TILE_K` | integer (`size_t`), by default: **5000** | Tile sizes for each dimension, that are used to pipeline the local CPU matrices to GPU. `K` refers to the shared dimension and `MxN` refer to the dimensions of matrix `C`
`COSMA_GPU_STREAMS` | integer (`size_t`), by default: **2** | The number of GPU streams that each rank should use.
`COSMA_MEMORY_POOL_AMORTIZATION` | real (`double`), by default **1.2** | The growth factor for the memory pool. If equal to 1.2, then 1.2x the requested size is allocated (thus, 20% more than needed). Higher values better amortize the cost of the memory pool resizing which can occur when the algorithm is invoked for different matrix sizes. However, higher amortization values also mean that potentially more memory is allocated than used which can be a problem when the memory resource is tight.
`COSMA_MIN_LOCAL_DIMENSION` | integer (`size_t`), by default: **200** | If any matrix dimension becomes smaller than this threshold (after splitting the matrices among the available MPI ranks), then the actual number of ranks is reduced so that all matrix dimensions stay at or above this limit.
`COSMA_DIM_THRESHOLD` | integer (`size_t`), by default: **0** | In SCALAPACK wrappers, if any matrix dimension is less than this threshold, the problem is considered too small and is dispatched to SCALAPACK for computation. This only affects the SCALAPACK wrappers.
`COSMA_CPU_MEMORY_ALIGNMENT` | integer (`size_t`), by default: **0** | The number of bytes to which all cpu (host) buffers will be aligned.
These are all optional parameters. They are used in runtime and hence changing any of those does not require the code to be recompiled.
We further discuss in details how to set the limits for both CPU and GPU memory that COSMA is allowed to use.
### Controlling GPU memory
Controlling how much GPU memory COSMA is allowed to use can be done by specifying the tile dimensions as:
```bash
export COSMA_GPU_MAX_TILE_M=5000
export COSMA_GPU_MAX_TILE_N=5000
export COSMA_GPU_MAX_TILE_K=5000
```
where `K` refers to the shared dimension and `MxN` refer to the dimensions of matrix `C`. By default, all tiles are square and have dimensions `5000x5000`.
These are only the maximum tiles and the actual tile sizes that will be used might be less, depending on the problem size. These variables are only used in the GPU backend for pipelining the local matrices to GPUs.
It is also possible to specify the number of GPU streams:
```bash
export COSMA_GPU_STREAMS=2
```
The values given here are the default values.
The algorithm will then require device memory for at most this many elements:
```cpp
num_streams * (tile_m * tile_k + tile_k * tile_n + tile_m * tile_n)
```
Therefore, by changing the values of these variables, it is possible to control the usage of GPU memory.
### Controlling CPU memory
In case the available CPU memory is a scarce resource, it is possible to set the CPU memory limit to COSMA, by exporting the following environment variable:
```bash
export COSMA_CPU_MAX_MEMORY=1024 # in megabytes per MPI process (rank)
```
which will set the upper limit [in MB] on the memory that each MPI process (rank) is allowed to use. This might, however, reduce the performance.
In case the algorithm is not able to perform the multiplication within the given memory range, a `runtime_error` will be thrown.
> This parameter is still in the testing phase!
## Profiling
Use `-DCOSMA_WITH_PROFILING=ON` to instrument the code. We use the profiler, called `semiprof`, written by Benjamin Cumming (https://github.com/bcumming).
Running the miniapp locally (from the project root folder) with the following command:
```bash
mpirun --oversubscribe -np 4 ./build/miniapp/cosma_miniapp -m 1000 -n 1000 -k 1000 -P 4
```
Produces the following output from rank 0:
```
Matrix dimensions (m, n, k) = (1000, 1000, 1000)
Number of processors: 4
_p_ REGION CALLS THREAD WALL %
_p_ total - 0.110 0.110 100.0
_p_ multiply - 0.098 0.098 88.7
_p_ computation 2 0.052 0.052 47.1
_p_ communication - 0.046 0.046 41.6
_p_ copy 3 0.037 0.037 33.2
_p_ reduce 3 0.009 0.009 8.3
_p_ layout 18 0.000 0.000 0.0
_p_ preprocessing 3 0.012 0.012 11.3
```
The percentage is always relative to the first level above. All time measurements are in seconds.
## Authors
- Grzegorz Kwasniewski, Marko Kabic, Maciej Besta, Joost VandeVondele, Raffaele Solca, Torsten Hoefler
Cite as:
```
@inproceedings{cosma_algorithm_2019,
title={Red-blue pebbling revisited: Near optimal parallel matrix-matrix multiplication},
author={Kwasniewski, Grzegorz and Kabi{\'c}, Marko and Besta, Maciej and VandeVondele, Joost and Solc{\`a}, Raffaele and Hoefler, Torsten},
booktitle={Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis},
pages={1--22},
year={2019}
}
```
## Questions?
For questions, feel free to contact us, and we will soon get back to you:
- For questions regarding the implementation, contact Marko Kabic (marko.kabic@inf.ethz.ch), Teodor Nikolov (tnikolov@cscs.ch) or Simon Pintarelli (simon.pintarelli@cscs.ch).
- For questions regarding the theory, contact Grzegorz Kwasniewski (gkwasnie@inf.ethz.ch).
> If you need any help with the integration of COSMA into your library, we will be more than happy to help you!
## Acknowledgements
This work was funded in part by:
| [**ETH Zurich**](https://ethz.ch/en.html)**: Swiss Federal Institute of Technology in Zurich**
| :------------------- | :------------------- |
| [**CSCS**](https://www.cscs.ch)**: Swiss National Supercomputing Centre**
| [**PASC**](https://www.pasc-ch.org/)**: Platform for Advanced Scientific Computing**
| [**ERC**](https://erc.europa.eu): **European Research Council** (Horizon2020, grant agreement DAPP, No.678880)
| [**MaX**](http://www.max-centre.eu): **Materials design at the Exascale** (Horizon2020, grant agreement MaX CoE, No. 824143.)
We thank Thibault Notargiacomo, Sam Yates, Benjamin Cumming and Simon Pintarelli for their generous contribution to the project: great ideas, useful advices and fruitful discussions.
================================================
FILE: _config.yml
================================================
theme: jekyll-theme-slate
================================================
FILE: benchmarks/CMakeLists.txt
================================================
# Provides find_cuda_version(), used below to gate the cublasLt benchmark.
include(find_cuda_version)
################
# Build test #
################
# Benchmarks that depend only on the core cosma target.
set(executables "ubench-allgather"
"allgather-volume"
"sendrecv"
"reduce-scatter"
"blocking_vs_non_blocking"
"dgemm_perf_model")
# if (${COSMA_BLAS} STREQUAL "MKL")
# list(APPEND executables "transpose")
# endif()
# One executable per benchmark source file, each linked against cosma.
foreach(exec ${executables})
add_executable(${exec} "${exec}.cpp")
target_link_libraries(${exec} cosma)
endforeach()
# The cuBLAS benchmark is only built for the CUDA backend.
if (COSMA_GPU_BACKEND MATCHES "CUDA")
find_cuda_version()
# check if cuda toolkit version >= 10.1
# which is needed for cublasLt (used in the benchmark)
if (CUDA_TOOLKIT_MAJOR_VERSION GREATER 10 OR
(CUDA_TOOLKIT_MAJOR_VERSION EQUAL 10 AND CUDA_TOOLKIT_MINOR_VERSION GREATER_EQUAL 1))
add_executable(gpu_gemm_cublas "gpu_gemm_cublas.cpp")
target_link_libraries(gpu_gemm_cublas cosma Tiled-MM::Tiled-MM cublasLt cublas)
target_compile_definitions(gpu_gemm_cublas PRIVATE COSMA_HAVE_GPU)
endif()
endif()
================================================
FILE: benchmarks/allgather-volume.cpp
================================================
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
using namespace cosma;
int main(int argc, char **argv) {
MPI_Init(&argc, &argv);
int P, rank;
MPI_Comm_size(MPI_COMM_WORLD, &P);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
int base_size = 1 << 25;
int local_size = base_size;
int total_size = P * base_size;
std::vector in(local_size);
std::vector result(total_size);
const int n_rep = 10;
{
Timer time(n_rep, "MPI_Allgather");
for (int i = 0; i < n_rep; ++i) {
MPI_Allgather(in.data(),
local_size,
MPI_DOUBLE,
result.data(),
local_size,
MPI_DOUBLE,
MPI_COMM_WORLD);
}
}
MPI_Finalize();
return 0;
}
================================================
FILE: benchmarks/bcast-volume.cpp
================================================
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
using namespace cosma;
int main( int argc, char **argv ) {
MPI_Init(&argc, &argv);
int P, rank;
MPI_Comm_size(MPI_COMM_WORLD, &P);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
int base_size = 1 << 25;
int local_size = base_size;
int total_size = P * base_size;
std::vector in(local_size);
std::vector result(total_size);
const int n_rep = 10;
{
Timer time(n_rep, "MPI_Allgather");
for (int i = 0; i < n_rep; ++i) {
MPI_Allgather(in.data(), local_size, MPI_DOUBLE, result.data(),
local_size, MPI_DOUBLE, MPI_COMM_WORLD);
}
}
MPI_Finalize();
return 0;
}
================================================
FILE: benchmarks/blocking_vs_non_blocking.cpp
================================================
#include
#include
#include
#include
#include
#include
#include
#include
// Scope-based timer for MPI benchmarks: the constructor barriers all ranks
// and records the start time; the destructor reduces the elapsed time over
// the communicator and prints min/max/avg time per repetition on rank 0.
// Fix: restore the template arguments (time_point clock, duration rep,
// duration_cast target) that were stripped from the checked-in text.
class Timer {
  public:
    using time_point =
        std::chrono::time_point<std::chrono::high_resolution_clock>;
    int n_rep_;         // repetitions performed inside the timed region
    std::string region; // label printed with the results
    MPI_Comm comm_;
    time_point start;

    Timer(int n_rep, std::string reg = "", MPI_Comm comm = MPI_COMM_WORLD)
        : n_rep_(n_rep)
        , region(reg)
        , comm_(comm) {
        // Synchronize so every rank starts the clock together.
        MPI_Barrier(comm);
        start = std::chrono::high_resolution_clock::now();
    }

    ~Timer() {
        auto finish = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> elapsed = finish - start;
        // milliseconds::rep is long long, matching MPI_LONG_LONG below
        auto time =
            std::chrono::duration_cast<std::chrono::milliseconds>(elapsed)
                .count();
        long long max_time, min_time, sum_time;
        MPI_Reduce(&time, &max_time, 1, MPI_LONG_LONG, MPI_MAX, 0, comm_);
        MPI_Reduce(&time, &min_time, 1, MPI_LONG_LONG, MPI_MIN, 0, comm_);
        MPI_Reduce(&time, &sum_time, 1, MPI_LONG_LONG, MPI_SUM, 0, comm_);
        int rank, size;
        MPI_Comm_rank(comm_, &rank);
        MPI_Comm_size(comm_, &size);
        if (rank == 0) {
            std::cout << region << " MIN TIME [ms]: " << 1.0 * min_time / n_rep_
                      << std::endl;
            std::cout << region << " MAX TIME [ms]: " << 1.0 * max_time / n_rep_
                      << std::endl;
            std::cout << region
                      << " AVG TIME [ms]: " << 1.0 * sum_time / (n_rep_ * size)
                      << std::endl;
            std::cout << "\n";
        }
    }
};
// Splits P ranks into `divisor` consecutive groups of P / divisor ranks each.
// Returns {group index of `rank`, offset of `rank` within its group}.
// Fix: restore the stripped std::pair<int, int> template arguments.
std::pair<int, int> group_and_offset(int P, int divisor, int rank) {
    int subset_size = P / divisor;
    int subint_index = rank / subset_size;
    int offset = rank - subint_index * subset_size;
    return {subint_index, offset};
}
// Computes C = 1.0 * A * B + 0.0 * C for an m x k matrix A, k x n matrix B
// and m x n matrix C using COSMA's single-rank local multiply.
void solve(double *A, double *B, double *C, int m, int n, int k) {
// callers pass varying (possibly non-square) m, n, k subproblem sizes
auto ctx = cosma::make_context();
bool copy_c_back = true;
cosma::local_multiply(ctx, A, B, C, m, n, k, 1.0, 0.0, copy_c_back);
}
int main(int argc, char **argv) {
MPI_Init(&argc, &argv);
int P, rank;
MPI_Comm_size(MPI_COMM_WORLD, &P);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
int divisor = 2;
int m = 5000;
int k = 2000;
int n = 2000;
int n_iter = 3;
size_t local_size = k * n / divisor;
float waiting_time = 0.7f;
std::vector local_buffer(local_size);
std::vector global_buffer(local_size * divisor);
std::vector a(m / divisor * k);
std::vector b(k * n);
std::vector c(m / divisor * n);
// initialize dgemm
for (int i = 0; i < 5; ++i) {
solve(a.data(), b.data(), c.data(), m / divisor, n / divisor, k);
}
{
Timer dgemm_small(10, "dgemm subproblem");
for (int i = 0; i < 10; ++i) {
solve(a.data(), b.data(), c.data(), m / divisor, n / divisor, k);
}
}
{
Timer dgemm_large(10, "dgemm large problem");
for (int i = 0; i < 10; ++i) {
solve(a.data(), b.data(), c.data(), m / divisor, n, k);
}
}
int gp, off;
std::tie(gp, off) = group_and_offset(P, divisor, rank);
MPI_Comm subcom;
MPI_Comm_split(MPI_COMM_WORLD, off, gp, &subcom);
MPI_Request req[2 * (divisor - 1)];
int reqi = 0;
for (int i = 0; i < divisor; ++i) {
if (i != gp) {
int offset = i * local_size;
MPI_Recv_init(global_buffer.data() + offset,
local_size,
MPI_DOUBLE,
i,
0,
subcom,
&req[reqi]);
MPI_Send_init(local_buffer.data(),
local_size,
MPI_DOUBLE,
i,
0,
subcom,
&req[divisor - 1 + reqi]);
reqi++;
}
}
{
Timer timer_async(1, "asynchronous");
MPI_Startall(2 * (divisor - 1), req);
// do the work
solve(a.data(), b.data(), c.data(), m / divisor, n / divisor, k);
// usleep(waiting_time * 1e6);
for (int i = 0; i < divisor - 1; ++i) {
int idx = -1;
MPI_Waitany(divisor - 1, req, &idx, MPI_STATUS_IGNORE);
// if (idx >= rank) idx++;
solve(a.data(), b.data(), c.data(), m / divisor, n / divisor, k);
// usleep(waiting_time * 1e6);
}
MPI_Waitall(divisor - 1, req + divisor - 1, MPI_STATUSES_IGNORE);
}
MPI_Barrier(MPI_COMM_WORLD);
{
Timer timer_sync(1, "synchronous");
MPI_Allgather(local_buffer.data(),
local_size,
MPI_DOUBLE,
global_buffer.data(),
local_size,
MPI_DOUBLE,
subcom);
solve(a.data(), b.data(), c.data(), m / divisor, n, k);
// usleep(1e6 * divisor * waiting_time);
}
MPI_Comm_free(&subcom);
MPI_Finalize();
}
================================================
FILE: benchmarks/dgemm_perf_model.cpp
================================================
#include
#include
#include
#include
#include
using namespace cosma;
// Squareness score of an (a, b) rectangle: 1.0 when a == b, decreasing
// towards 0 the more elongated it is (equivalent to min(a,b) / max(a,b)).
double sq_score(double a, double b) {
    double ratio = 1.0 * a / b;
    double inverse = 1.0 * b / a;
    double larger = std::max(ratio, inverse);
    return (ratio + inverse) / (2.0 * larger);
}
// Overall squareness score of an m x n x k multiplication: the product of
// the pairwise squareness of A (m x k), B (k x n) and C (m x n).
double score(double m, double n, double k) {
    return sq_score(m, k) * sq_score(k, n) * sq_score(m, n);
}
// Throughput of an m x n x k multiply: 2*m*n*k flops over `time`.
// With `time` in milliseconds (as measured by the callers in this file),
// the result is in Gflop/s.
double throughput(double m, double n, double k, double time) {
    double flops = m * n * k * 2;
    return flops / (1e6 * time);
}
// One benchmarked multiplication problem together with its measurements.
struct problem {
    int m;        // rows of A / C
    int n;        // columns of B / C
    int k;        // columns of A / rows of B
    double time;  // measured time (milliseconds, averaged over repetitions)
    double score; // squareness score of the problem shape
    double tps;   // achieved throughput

    problem() = default;
    problem(int m_, int n_, int k_, double time_, double score_, double tps_)
        : m(m_), n(n_), k(k_), time(time_), score(score_), tps(tps_) {}
};
int main(int argc, char **argv) {
std::vector a;
std::vector b;
std::vector c;
int min_m = 1000;
int min_n = 1000;
int min_k = 1000;
int max_m = 50000;
int max_n = 1000;
int max_k = 1000;
int step_m = 500;
int step_n = 500;
int step_k = 500;
int n_rep = 2;
auto ctx = cosma::make_context();
bool copy_c_back = true;
// run random dgemm in order to initialize it
for (int i = 0; i < n_rep; ++i) {
a = std::vector(min_m * min_m);
b = std::vector(min_m * min_m);
c = std::vector(min_m * min_m);
local_multiply(
ctx, a.data(), b.data(), c.data(), min_m, min_m, min_m, 1.0, 0.0, copy_c_back);
}
std::vector timings;
for (int m = min_m; m <= max_m; m += step_m) {
for (int n = min_n; n <= max_n; n += step_n) {
for (int k = min_k; k <= max_k; k += step_k) {
auto start = std::chrono::high_resolution_clock::now();
for (int rep = 0; rep < n_rep; ++rep) {
a = std::vector(m * k);
b = std::vector(k * n);
c = std::vector(m * n);
local_multiply(
ctx, a.data(), b.data(), c.data(), m, n, k, 1.0, 0.0, copy_c_back);
}
auto finish = std::chrono::high_resolution_clock::now();
auto time =
std::chrono::duration_cast(
finish - start)
.count();
time /= 1.0 * n_rep;
double mul_score = score(m, n, k);
double tps = throughput(m, n, k, time);
problem prob(m, n, k, time, mul_score, tps);
timings.push_back(prob);
}
}
}
std::sort(timings.begin(),
timings.end(),
[](const problem &lhs, const problem &rhs) {
return lhs.tps < rhs.tps;
});
for (auto &problem : timings) {
std::cout << problem.m << " " << problem.tps << " " << problem.score
<< std::endl;
// std::cout << "(" << problem.m << ", " << problem.n << ", " <<
// problem.k << "), tps = " << problem.tps << ", score = " <<
// problem.score << std::endl;
}
return 0;
}
================================================
FILE: benchmarks/gpu_gemm_cublas.cpp
================================================
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
// Fills ptr[0..size) with uniformly distributed random values in [0, 10).
// The RNG state is static, so repeated calls advance a single stream.
// Fixes: restore the stripped template parameters; the original
// `uniform_real_distribution dist(10.0)` set the range [a=10, b=1), i.e.
// a > b, which is undefined behavior — the intended range is [0, 10).
template <typename T>
void fill_matrix(T* ptr, size_t size) {
    static std::random_device dev; // seed
    static std::mt19937 rng(dev()); // generator
    static std::uniform_real_distribution<T> dist(0.0, 10.0); // distribution
    for (size_t i = 0; i < size; ++i) {
        ptr[i] = T{dist(rng)};
    }
}
std::vector tiled_mm_dgemm(int n_iter, int m, int n, int k) {
auto gpu_ctx = gpu::make_context(2, 4000, 4000, 4000);
std::vector aa(m * k);
std::vector bb(k * n);
std::vector cc(m * n);
double *a = aa.data();
double *b = bb.data();
double *c = cc.data();
double alpha = 1.0;
double beta = 0.0;
std::vector times(n_iter);
for (int i = 0; i < n_iter; ++i) {
fill_matrix(a, aa.size());
fill_matrix(b, bb.size());
if (beta > 0) {
fill_matrix(c, cc.size());
}
// perform dgemm
auto start = std::chrono::steady_clock::now();
cosma::local_multiply(gpu_ctx.get(), a, b, c, m, n, k, alpha, beta);
auto end = std::chrono::steady_clock::now();
times[i] = std::chrono::duration_cast(end - start).count();
}
std::sort(times.begin(), times.end());
return times;
}
std::vector cublasXt_dgemm(int n_iter, int m, int n, int k) {
auto status=
cudaSetDevice(0);
gpu::check_runtime_status(status);
cublasXtHandle_t handle;
auto cublas_status = cublasXtCreate(&handle);
gpu::check_blas_status(cublas_status);
int devices[1] = {0};
cublasXtDeviceSelect(handle, 1, devices);
// cublasXtSetCpuRoutine(handle, CUBLASXT_GEMM, CUBLASXT_DOUBLE, (void*)(&dgemm_));
// cublasXtSetCpuRatio(handle, CUBLASXT_GEMM, CUBLASXT_DOUBLE, 0.2);
// cublasXtSetPinningMemMode(handle, CUBLASXT_PINNING_ENABLED);
// cublasXtSetBlockDim(handle, 4000);
std::vector aa(m * k);
std::vector bb(k * n);
std::vector cc(m * n);
double *a = aa.data();
double *b = bb.data();
double *c = cc.data();
double alpha = 1.0;
double beta = 0.0;
std::vector times(n_iter);
for (int i = 0; i < n_iter; ++i) {
fill_matrix(a, aa.size());
fill_matrix(b, bb.size());
if (beta > 0) {
fill_matrix(c, cc.size());
}
// perform dgemm
auto start = std::chrono::steady_clock::now();
auto status = cublasXtDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
m, n, k, &alpha, a, m, b, k, &beta, c, m);
gpu::check_blas_status(status);
cudaDeviceSynchronize();
auto end = std::chrono::steady_clock::now();
times[i] = std::chrono::duration_cast(end - start).count();
}
std::sort(times.begin(), times.end());
// finalization
if (handle)
cublasXtDestroy(handle);
return times;
}
/*
// cublasLt assumes device pointers
std::vector cublasLt_dgemm(int n_iter, int m, int n, int k) {
auto runtime_status=
cudaSetDevice(0);
gpu::check_runtime_status(runtime_status);
cublasLtHandle_t handle;
auto status = cublasLtCreate(&handle);
gpu::check_blas_status(status);
// int devices[1] = {0};
// cublasLtDeviceSelect(handle, 1, devices);
// std::vector aa(m * k);
// std::vector bb(k * n);
// std::vector cc(m * n);
double *a = gpu::malloc_device(m * k);
double *b = gpu::malloc_device(k * n);
double *c = gpu::malloc_device(m * n);
double alpha = 1.0;
double beta = 0.0;
std::size_t workspaceSize = 4000;
std::size_t workspaceSizeBytes = workspaceSize * sizeof(double);
auto workspace = gpu::malloc_device(workspaceSize);
auto transa = CUBLAS_OP_N;
auto transb = CUBLAS_OP_N;
cublasLtMatmulDesc_t operationDesc = nullptr;
cublasLtMatrixLayout_t Adesc = nullptr;
cublasLtMatrixLayout_t Bdesc = nullptr;
cublasLtMatrixLayout_t Cdesc = nullptr;
cublasLtMatmulPreference_t preference = nullptr;
int returnedResults = 0;
cublasLtMatmulHeuristicResult_t heuristicResult = {};
status = cublasLtMatmulDescCreate(&operationDesc, CUDA_R_64F);
gpu::check_blas_status(status);
status = cublasLtMatmulDescSetAttribute(operationDesc,
CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa));
gpu::check_blas_status(status);
status = cublasLtMatmulDescSetAttribute(operationDesc,
CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transb));
gpu::check_blas_status(status);
status = cublasLtMatrixLayoutCreate(&Adesc, CUDA_R_64F, m, k, m);
gpu::check_blas_status(status);
status = cublasLtMatrixLayoutCreate(&Bdesc, CUDA_R_64F, k, n, k);
gpu::check_blas_status(status);
status = cublasLtMatrixLayoutCreate(&Cdesc, CUDA_R_64F, m, n, m);
gpu::check_blas_status(status);
std::cout << "Created matrix layouts." << std::endl;
status = cublasLtMatmulPreferenceCreate(&preference);
gpu::check_blas_status(status);
status = cublasLtMatmulPreferenceSetAttribute(
preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
&workspaceSizeBytes, sizeof(workspaceSizeBytes));
gpu::check_blas_status(status);
std::cout << "Set up preferences." << std::endl;
status = cublasLtMatmulAlgoGetHeuristic(
handle, operationDesc, Adesc, Bdesc, Cdesc, Cdesc,
preference, 1, &heuristicResult, &returnedResults);
gpu::check_blas_status(status);
if (returnedResults == 0) {
std::cout << "No algorithm was returned." << std::endl;
status = CUBLAS_STATUS_NOT_SUPPORTED;
gpu::check_blas_status(status);
}
std::cout << "Chose the algorithm." << std::endl;
std::vector times(n_iter);
for (int i = 0; i < n_iter; ++i) {
// fill_matrix(a, m * k);
// fill_matrix(b, k * n);
// if (beta > 0) {
// fill_matrix(c, m * n);
// }
// perform dgemm
auto start = std::chrono::steady_clock::now();
status = cublasLtMatmul(handle,
operationDesc,
&alpha,
a,
Adesc,
b,
Bdesc,
&beta,
c,
Cdesc,
c,
Cdesc,
&heuristicResult.algo,
workspace,
workspaceSizeBytes,
0);
gpu::check_blas_status(status);
cudaDeviceSynchronize();
auto end = std::chrono::steady_clock::now();
times[i] = std::chrono::duration_cast(end - start).count();
}
std::sort(times.begin(), times.end());
// finalization
if (handle)
cublasLtDestroy(handle);
// Descriptors are no longer needed as all GPU work was already
// enqueued.
if (preference)
status = cublasLtMatmulPreferenceDestroy(preference);
if (Cdesc)
status = cublasLtMatrixLayoutDestroy(Cdesc);
if (Bdesc)
status = cublasLtMatrixLayoutDestroy(Bdesc);
if (Adesc)
status = cublasLtMatrixLayoutDestroy(Adesc);
if (operationDesc)
status = cublasLtMatmulDescDestroy(operationDesc);
gpu::check_blas_status(status);
return times;
}
*/
int main(int argc, char* argv[]) {
// std::vector dims = {500, 1000, 2000, 4000, 8000, 16000, 32000};
std::vector dims = {4000, 8000, 12000, 16000, 20000, 24000, 28000, 32000};
int n_iter = 2;
std::vector times(n_iter);
for (const int& dim : dims) {
std::cout << "Dimension = " << dim << std::endl;
/*
// cublasLt
times = cublasLt_dgemm(n_iter, dim, dim, dim);
std::cout << "cublasLt: ";
for (const auto& time : times) {
std::cout << time << ", ";
}
std::cout << std::endl;
*/
// cublasXt
times = cublasXt_dgemm(n_iter, dim, dim, dim);
std::cout << "cublasXt: ";
for (const auto& time : times) {
std::cout << time << ", ";
}
if (times.size()) {
std::cout << "highest throughtput [Glop/s]: " << 2.0*dim*dim*dim/(1e6*times[0]);
}
std::cout << std::endl;
// tiled-mm
times = tiled_mm_dgemm(n_iter, dim, dim, dim);
std::cout << "Tiled-MM: ";
for (const auto& time : times) {
std::cout << time << ", ";
}
if (times.size()) {
std::cout << "highest throughtput [Glop/s]: " << 2.0*dim*dim*dim/(1e6*times[0]);
}
std::cout << std::endl;
}
}
================================================
FILE: benchmarks/gpu_gemm_libsci_acc.cpp
================================================
#include
#include
#include
#include
// Runs one m x n x k dgemm through libsci_acc using pinned host allocations
// and returns the elapsed time in milliseconds (the caller prints "[ms]").
// NOTE(review): buffers are intentionally left uninitialized — this measures
// timing only, not numerical results.
long libsci_acc_dgemm(int m, int n, int k) {
    double* a, *b, *c;
    double alpha = 1.0;
    double beta = 0.0;
    libsci_acc_HostAlloc((void**)&a, sizeof(double)*m*k);
    libsci_acc_HostAlloc((void**)&b, sizeof(double)*k*n);
    libsci_acc_HostAlloc((void**)&c, sizeof(double)*m*n);
    // perform dgemm
    auto start = std::chrono::steady_clock::now();
    dgemm('n', 'n', m, n, k, alpha, a, m, b, k, beta, c, m);
    auto end = std::chrono::steady_clock::now();
    libsci_acc_HostFree(a);
    libsci_acc_HostFree(b);
    libsci_acc_HostFree(c);
    // restored stripped template argument: duration_cast<milliseconds>
    return std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
}
// Driver: times libsci_acc dgemm for each dimension, excluding a warm-up run.
int main(int argc, char* argv[]) {
    // initialization
    libsci_acc_init();
    // std::vector<int> dims = {500, 1000, 2000, 4000, 8000, 16000, 32000};
    std::vector<int> dims = {32000};
    int n_iter = 1;
    for (const int& dim : dims) {
        std::cout << "Dimension = " << dim << std::endl;
        double t_avg_libsci = 0;
        // n_iter+1 runs total: iteration 0 is a warm-up and is excluded
        // from the average.
        for (int i = 0; i < n_iter+1; ++i) {
            long t_libsci = libsci_acc_dgemm(dim, dim, dim);
            if (i == 0) continue;
            t_avg_libsci += t_libsci;
        }
        std::cout << "libsci average time [ms]: " << 1.0*t_avg_libsci/n_iter << std::endl;
    }
    libsci_acc_finalize();
}
================================================
FILE: benchmarks/reduce-scatter.cpp
================================================
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
using namespace cosma;
int main( int argc, char **argv ) {
MPI_Init(&argc, &argv);
int P, rank;
MPI_Comm_size(MPI_COMM_WORLD, &P);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
const int n_rep = 2;
int scaling_factor = P;
/*
for (int i = -10; i <= 10; ++i) {
int dim = (scaling_factor+i)*P;
int block_size = (dim/P) * dim;
int total_size = block_size * P;
if (rank == 0)
std::cout << "dim = " << dim << std::endl;
std::vector in(total_size);
std::vector result(block_size);
MPI_Request reqs[2];
{
Timer time(n_rep, "MPI_Reduce_scatter_block");
for (int i = 0; i < n_rep; ++i) {
MPI_Ireduce_scatter_block(in.data(),
result.data(),
block_size/2,
MPI_DOUBLE,
MPI_SUM,
MPI_COMM_WORLD,
&reqs[0]);
MPI_Ireduce_scatter_block(in.data(),
result.data(),
block_size/2,
MPI_DOUBLE,
MPI_SUM,
MPI_COMM_WORLD,
&reqs[1]);
MPI_Waitall(2, &reqs[0], MPI_STATUSES_IGNORE);
}
}
}
*/
int dim = 17408;
int block_size = (dim/P) * dim;
int total_size = block_size * P;
std::vector in(total_size);
std::vector result(block_size);
{
Timer time(n_rep, "MPI_Reduce_scatter_block");
for (int i = 0; i < n_rep; ++i) {
MPI_Reduce_scatter_block(in.data(),
result.data(),
block_size,
MPI_DOUBLE,
MPI_SUM,
MPI_COMM_WORLD);
}
}
MPI_Finalize();
return 0;
}
================================================
FILE: benchmarks/run_ubench.sh
================================================
# Node counts to benchmark.
n_nodes_list=(2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36)
# Run the allgather micro-benchmark on $1 nodes, one rank per node.
run() {
n_nodes=$1
srun -N $n_nodes -n $n_nodes ./tests/ubench/ubench-allgather
}
# NOTE(review): this sets IFS to the empty string for the rest of the script,
# which disables word splitting of the unquoted $output expansions below —
# confirm that this (rather than IFS=$'\n') is the intended behavior.
IFS=
for n_nodes in ${n_nodes_list[@]}
do
echo "NODES = "$n_nodes
# Capture benchmark output and extract the reported average times for the
# irregular (MPI_Allgatherv) and padded (MPI_Allgather) variants.
output=$(run $n_nodes)
avg_time_v=$(echo $output | awk '/MPI_Allgatherv AVG TIME/ {print $5}')
avg_time=$(echo $output | awk '/MPI_Allgather AVG TIME/ {print $5}')
echo $output
echo "avg_time_v = "$avg_time_v
echo "avg_time = "$avg_time
# Append one value per node count so a full sweep builds two result files.
echo $avg_time_v >> "allgather_v.txt"
echo $avg_time >> "allgather.txt"
done
================================================
FILE: benchmarks/scalapack_transformer.cpp
================================================
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
using namespace grid2grid;
// Forward declarations for the Fortran/C BLACS and ScaLAPACK routines used
// by this benchmark; these are resolved at link time from the ScaLAPACK
// library.
extern "C" {
/* Cblacs declarations */
// process id and total number of BLACS processes
void Cblacs_pinfo(int*, int*);
// obtain a default system context
void Cblacs_get(int, int, int*);
// initialize a process grid on a context
void Cblacs_gridinit(int*, const char*, int, int);
// grid coordinates (row, col) of a process
void Cblacs_pcoord(int, int, int*, int*);
// release a context
void Cblacs_gridexit(int);
// synchronize all processes in a context
void Cblacs_barrier(int, const char*);
// number of local rows/cols owned under a block-cyclic distribution
int numroc_(int*, int*, int*, int*, int*);
// redistribute a block-cyclic matrix between two distributions
void pdgemr2d_(int *m, int *n,
double *a, int *ia, int *ja, int *desca,
double *b, int *ib, int *jb, int *descb,
int* ictxt);
// initialize a ScaLAPACK array descriptor
void descinit_(int* desc, int* m, int* n, int* bm, int* bn,
int* rsrc, int* csrc, int* ctxt, int* lda, int* info);
}
// *****************************
// OUR LAYOUT TRANSFORMER
// *****************************
// Times the grid2grid layout transformation between two block-cyclic
// distributions of an m x n matrix over a pm x pn process grid.
// Returns the minimum time over nrep repetitions, in milliseconds.
long int run_our_layout(int m, int n, int bm1, int bn1, int bm2, int bn2, int pm, int pn, int nrep, int rank) {
    auto ordering = scalapack::ordering::row_major;
    // deterministic element values so both layouts describe the same matrix
    auto values = [](int i, int j) {
        return cosma::math_utils::cantor_pairing(i, j);
    };
    scalapack::data_layout layout1({m, n}, {bm1, bn1}, {pm, pn}, ordering);
    std::vector<double> buffer1 = initialize_locally(rank, layout1, values);
    grid_layout scalapack_layout_1 = get_scalapack_grid(layout1, buffer1.data(), rank);
    scalapack::data_layout layout2({m, n}, {bm2, bn2}, {pm, pn}, ordering);
    std::vector<double> buffer2 = initialize_locally(rank, layout2, values);
    grid_layout scalapack_layout_2 = get_scalapack_grid(layout2, buffer2.data(), rank);
    long int min_time = std::numeric_limits<long int>::max();
    for (int i = 0; i < nrep; ++i) {
        // barriers before and after so all ranks time the full collective
        MPI_Barrier(MPI_COMM_WORLD);
        auto start = std::chrono::steady_clock::now();
        transform(scalapack_layout_1, scalapack_layout_2, MPI_COMM_WORLD);
        MPI_Barrier(MPI_COMM_WORLD);
        auto end = std::chrono::steady_clock::now();
        long int our_time = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
        min_time = std::min(our_time, min_time);
    }
    return min_time;
}
// *****************************
// SCALAPACK LAYOUT TRANSFORMER
// *****************************
long int run_scalapack_layout(int m, int n, int bm1, int bn1, int bm2, int bn2, int pm, int pn, int nrep, int rank) {
// Begin Cblas context
// We assume that we have 4 processes and place them in a 2-by-2 grid
int iZERO = 0;
int ctxt, myid, myrow, mycol, numproc;
int procrows = 2, proccols = 2;
Cblacs_pinfo(&myid, &numproc);
Cblacs_get(0, 0, &ctxt);
Cblacs_gridinit(&ctxt, "Row-major", procrows, proccols);
Cblacs_pcoord(ctxt, myid, &myrow, &mycol);
// Number of rows and cols owned by the current process
int nrows1 = numroc_(&m, &bm1, &myrow, &iZERO, &procrows);
int ncols1 = numroc_(&n, &bn1, &mycol, &iZERO, &proccols);
int nrows2 = numroc_(&m, &bm2, &myrow, &iZERO, &procrows);
int ncols2 = numroc_(&n, &bn2, &mycol, &iZERO, &proccols);
std::vector buffer1(nrows1 * ncols1);
std::vector buffer2(nrows2 * ncols2);
int ia = 1;
int ja = 1;
int ib = 1;
int jb = 1;
// std::vector desca = {1, ctxt, m, n, bm1, bn1, 0, 0, m};
// std::vector descb = {1, ctxt, m, n, bm2, bn2, 0, 0, m};
std::array desc1;
std::array desc2;
int info;
descinit_(&desc1[0], &m, &n, &bm1, &bn1, &iZERO, &iZERO, &ctxt, &nrows1, &info);
descinit_(&desc2[0], &m, &n, &bm2, &bn2, &iZERO, &iZERO, &ctxt, &nrows2, &info);
long int min_time = std::numeric_limits::max();
for (int i = 0; i < nrep; ++i) {
MPI_Barrier(MPI_COMM_WORLD);
auto start = std::chrono::steady_clock::now();
pdgemr2d_(&m, &n, buffer1.data(), &ia, &ib, &desc1[0],
buffer2.data(), &ib, &jb, &desc2[0],
&ctxt);
MPI_Barrier(MPI_COMM_WORLD);
auto end = std::chrono::steady_clock::now();
auto scalapack_time = std::chrono::duration_cast(end - start).count();
min_time = std::min(min_time, scalapack_time);
}
// Release resources
Cblacs_gridexit(ctxt);
return min_time;
}
// Compares the grid2grid layout transformer against ScaLAPACK's pdgemr2d
// for increasing matrix sizes, printing minimum times and their ratio.
int main(int argc, char **argv) {
    MPI_Init(&argc, &argv);
    int P, rank;
    MPI_Comm_size(MPI_COMM_WORLD, &P);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    // base dimension and the two block-cyclic block sizes to convert between
    int dim = 10000;
    int bm1 = 124;
    int bn1 = 124;
    int pm = 2;
    int pn = 2;
    int bm2 = 192;
    int bn2 = 192;
    int nrep = 3;
    for (int i = 1; i <= 5; ++i) {
        int m = dim * i;
        int n = dim * i;
        auto our_time = run_our_layout(m, n, bm1, bn1, bm2, bn2, pm, pn, nrep, rank);
        auto scalapack_time = run_scalapack_layout(m, n, bm1, bn1, bm2, bn2, pm, pn, nrep, rank);
        if (rank == 0) {
            std::cout << "Dimension = " << m << std::endl;
            std::cout << "Our time [ms] = " << our_time << std::endl;
            std::cout << "ScaLAPACK time [ms] = " << scalapack_time << std::endl;
            // fixed typo in output message: "Ration" -> "Ratio"
            std::cout << "Ratio scalapack/our = " << 1.0 * scalapack_time/our_time << std::endl;
            std::cout << "============================" << std::endl;
        }
    }
    MPI_Finalize();
    return 0;
}
================================================
FILE: benchmarks/sendrecv.cpp
================================================
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
using namespace cosma;
int main(int argc, char **argv) {
MPI_Init(&argc, &argv);
int P, rank;
MPI_Comm_size(MPI_COMM_WORLD, &P);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
int base_size = 1 << 25;
int local_size = base_size;
std::vector in(local_size);
std::vector result(local_size);
const int n_rep = 10;
for (int i = 0; i < n_rep; ++i) {
int target = 1 - rank;
MPI_Sendrecv(in.data(),
local_size,
MPI_DOUBLE,
target,
0,
result.data(),
local_size,
MPI_DOUBLE,
target,
0,
MPI_COMM_WORLD,
MPI_STATUS_IGNORE);
}
MPI_Finalize();
return 0;
}
================================================
FILE: benchmarks/transpose.cpp
================================================
#include
#include
#include
#include
#include
int main(int argc, char** argv) {
int n_rep = 3;
// dimensions before transposing
std::vector n_rows = {5000, 10000, 15000, 20000, 25000, 30000}; // 5000;
std::vector n_cols = {5000, 10000, 15000, 20000, 25000, 30000}; // 10000;
// not strided
auto src_stride = n_rows; // 5000;
auto dest_stride = n_cols; // 10000;
bool conjugate = false;
costa::memory::threads_workspace workspace(256);
std::vector g2g_times;
std::vector mkl_times;
for (int i = 0; i < n_rows.size(); ++i) {
long g2g_time = std::numeric_limits::max();
long mkl_time = std::numeric_limits::max();
src_stride[i] = std::max(n_rows[i], src_stride[i]);
// since transposed
dest_stride[i] = std::max(n_cols[i], dest_stride[i]);
std::vector src(src_stride[i] * n_cols[i]);
std::vector dest_g2g(dest_stride[i] * n_rows[i]);
std::vector dest_mkl(dest_stride[i] * n_rows[i]);
for (int row = 0; row < n_rows[i]; ++row) {
for (int col = 0; col < n_cols[i]; ++col) {
src[col * src_stride[i] + row] = col * src_stride[i] + row;
}
}
for (int rep = 0; rep < n_rep; ++rep) {
// ***********************************
// transpose with costa
// ***********************************
auto start = std::chrono::steady_clock::now();
costa::memory::copy_and_transpose(src.data(), n_rows[i], n_cols[i], src_stride[i],
dest_g2g.data(), dest_stride[i], false, workspace);
auto end = std::chrono::steady_clock::now();
g2g_time = std::min(g2g_time, (long) std::chrono::duration_cast(end - start).count());
// ***********************************
// transpose with mkl
// ***********************************
start = std::chrono::steady_clock::now();
mkl_domatcopy('C', 'T', n_rows[i], n_cols[i], 1.0, src.data(), src_stride[i], dest_mkl.data(), dest_stride[i]);
end = std::chrono::steady_clock::now();
mkl_time = std::min(mkl_time, (long) std::chrono::duration_cast(end - start).count());
}
g2g_times.push_back(g2g_time);
mkl_times.push_back(mkl_time);
// ***********************************
// checking results
// ***********************************
int n_rows_t = n_cols[i];
int n_cols_t = n_rows[i];
for (int row = 0; row < n_rows_t; ++row) {
for (int col = 0; col < n_cols_t; ++col) {
// dest_stride >= n_cols
auto g2g = dest_g2g[col * dest_stride[i] + row];
auto mkl = dest_mkl[col * dest_stride[i] + row];
auto target = src[row * src_stride[i] + col];
if (g2g != mkl) {
std::cout << "Error: (" << col << ", " << row << ") = " << ", g2g = " << g2g << ", mkl = " << mkl << ", target = " << target << std::endl;
}
}
}
}
// ***********************************
// output COSTA timings
// ***********************************
std::cout << "COSTA times: " << std::endl;
for (int i = 0; i < g2g_times.size(); ++i) {
std::cout << g2g_times[i] << ", ";
}
std::cout << std::endl;
// ***********************************
// output MKL timings
// ***********************************
std::cout << "mkl times: " << std::endl;
for (int i = 0; i < mkl_times.size(); ++i) {
std::cout << mkl_times[i] << ", ";
}
std::cout << std::endl;
return 0;
}
================================================
FILE: benchmarks/ubench-allgather.cpp
================================================
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
using namespace cosma;
int main(int argc, char **argv) {
MPI_Init(&argc, &argv);
int P, rank;
MPI_Comm_size(MPI_COMM_WORLD, &P);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
int base_size = 1500000;
int var = base_size / 10;
int local_size = base_size + ((rank % 2 == 0) ? var : 0);
int max_size = -1;
int total_size = 0;
std::vector sizes(P);
std::vector dspls(P);
for (int i = 0; i < P; ++i) {
int local_size = base_size + ((i % 2 == 0) ? var : 0);
max_size = std::max(max_size, local_size);
sizes[i] = local_size;
dspls[i] = total_size;
total_size += local_size;
}
std::vector in(local_size);
std::vector in_padded(max_size);
std::vector result(total_size);
std::vector result_padded(P * max_size);
const int n_rep = 30;
{
Timer time(n_rep, "MPI_Allgatherv");
for (int i = 0; i < n_rep; ++i) {
MPI_Allgatherv(in.data(),
local_size,
MPI_DOUBLE,
result.data(),
sizes.data(),
dspls.data(),
MPI_DOUBLE,
MPI_COMM_WORLD);
}
}
{
Timer time(n_rep, "MPI_Allgather");
for (int i = 0; i < n_rep; ++i) {
MPI_Allgather(in_padded.data(),
max_size,
MPI_DOUBLE,
result_padded.data(),
max_size,
MPI_DOUBLE,
MPI_COMM_WORLD);
}
}
MPI_Finalize();
return 0;
}
================================================
FILE: bors.toml
================================================
# Bors merge-bot configuration: require the GitLab CI pipeline status before
# merging, and delete branches once merged.
status = [
"ci/gitlab/%",
]
delete_merged_branches = true
================================================
FILE: ci/baseimage.cuda.Dockerfile
================================================
# Base CI image: Ubuntu + CMake + spack, with MPICH installed and a spack
# environment (/cosma-env-cuda) whose dependencies are pre-built so that CI
# builds of COSMA itself are fast.
FROM ubuntu:24.04 as builder
ARG CUDA_ARCH=90
ENV DEBIAN_FRONTEND noninteractive
ENV FORCE_UNSAFE_CONFIGURE 1
ENV PATH="/spack/bin:${PATH}"
ENV MPICH_VERSION=4.3.2
ENV CMAKE_VERSION=3.30.9
RUN apt-get -y update
RUN apt-get install -y apt-utils
# install basic tools
RUN apt-get install -y --no-install-recommends gcc g++ gfortran clang libomp-14-dev git make unzip file \
vim wget pkg-config python3-pip python3-dev cython3 python3-pythran tcl m4 cpio curl automake meson \
xz-utils patch patchelf apt-transport-https ca-certificates gnupg software-properties-common perl tar bzip2 \
liblzma-dev libbz2-dev
# install CMake
RUN wget https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz -O cmake.tar.gz && \
tar zxvf cmake.tar.gz --strip-components=1 -C /usr
# get latest version of spack
RUN git clone -b releases/v1.1 https://github.com/spack/spack.git
# set the location of packages built by spack
RUN spack config add config:install_tree:root:/opt/local
# set cuda_arch for all packages
RUN spack config add packages:all:variants:cuda_arch=${CUDA_ARCH}
# add local repo for cosma and tiled-mm
COPY ./spack_repo /spack_repo
RUN spack repo add /spack_repo/cosma
# find all external packages
RUN spack external find --all --exclude python --exclude meson
# find compilers
RUN spack compiler find
# install MPICH
RUN spack install mpich@${MPICH_VERSION} %gcc
# for the MPI hook
RUN echo $(spack find --format='{prefix.lib}' mpich) > /etc/ld.so.conf.d/mpich.conf
RUN ldconfig
# # create environments for several configurations and install dependencies
# COSTA and Tiled-MM are set up as spack "develop" checkouts so build.Dockerfile
# can pull their latest master before building.
RUN spack env create -d /cosma-env-cuda && \
spack -e /cosma-env-cuda add "cosma@=master +cuda +tests +scalapack +shared %gcc ^mpich" && \
spack -e /cosma-env-cuda add "tiled-mm@=master" && \
spack -e /cosma-env-cuda develop -p "./tiled-mm" "tiled-mm" && \
spack -e /cosma-env-cuda add "costa@=master" && \
spack -e /cosma-env-cuda develop -p "./costa" "costa" && \
spack -e /cosma-env-cuda develop -p /src cosma@master
RUN spack -e /cosma-env-cuda install --only=dependencies --fail-fast
RUN spack clean -a
# Disabled alternative environments (gpu-direct, nccl, cpu-only) kept for
# reference below.
# RUN spack env create -d /cosma-env-cuda-gpu-direct && \
# spack -e /cosma-env-cuda-gpu-direct add "cosma@master +cuda +tests +scalapack +shared +gpu_direct %gcc ^mpich " && \
# spack -e /cosma-env-cuda-gpu-direct add "tiled-mm@master" && \
# spack -e /cosma-env-cuda-gpu-direct add "costa@master" && \
# spack -e /cosma-env-cuda-gpu-direct add "cuda@12" && \
# spack -e /cosma-env-cuda-gpu-direct develop -p /src cosma@master && \
# spack -e /cosma-env-cuda-gpu-direct install --only=dependencies --fail-fast
# RUN spack env create -d /cosma-env-cuda-nccl && \
# spack -e /cosma-env-cuda-nccl add "cosma@master +cuda +tests +scalapack +shared +nccl %gcc ^mpich " && \
# spack -e /cosma-env-cuda-nccl add "tiled-mm@2.3.1" && \
# spack -e /cosma-env-cuda-nccl add "costa@master" && \
# spack -e /cosma-env-cuda-nccl add "cuda@12" && \
# spack -e /cosma-env-cuda-nccl develop -p /src cosma@master && \
# spack -e /cosma-env-cuda-nccl install --only=dependencies --fail-fast
# RUN spack env create -d /cosma-env-cpu && \
# spack -e /cosma-env-cpu add "cosma@master ~cuda +tests +scalapack +shared %gcc ^mpich " && \
# spack -e /cosma-env-cpu add "costa@master" && \
# spack -e /cosma-env-cpu develop -p /src cosma@master && \
# spack -e /cosma-env-cpu install --only=dependencies --fail-fast
================================================
FILE: ci/build.Dockerfile
================================================
# CI build image: takes the pre-provisioned base image, copies the PR source
# into /src, and builds COSMA via the spack environment given by ENVPATH.
ARG BASE_IMAGE
FROM $BASE_IMAGE
ARG ENVPATH
# copy source files of the pull request into container
COPY . /src
# # show the spack's spec
RUN spack -e $ENVPATH find -lcdv
# build COSTA and Tiled-MM with current @master branch
RUN cd $ENVPATH/costa && git pull && git log --oneline -1 && \
cd $ENVPATH/tiled-mm && git pull && git log --oneline -1
# show the spack.yaml
RUN cat $ENVPATH/spack.yaml
# build packages
RUN spack -e $ENVPATH install
# we need a fixed name for the build directory
# here is a hacky workaround to link ./spack-build-{hash} to ./spack-build
RUN cd /src && ln -s $(spack -e $ENVPATH location -b cosma) spack-build
================================================
FILE: ci/cscs.yml
================================================
# CSCS GitLab CI pipeline: build the base image, build COSMA on top of it,
# then run the test binaries on the daint-gh200 cluster via SLURM.
include:
- remote: 'https://gitlab.com/cscs-ci/recipes/-/raw/master/templates/v2/.ci-ext.yml'
stages:
- baseimage
- build
- test
# Rebuilds only when the base Dockerfile changes (WATCH_FILECHANGES).
build base image:
extends: [.dynamic-image-name, .container-builder-cscs-gh200]
stage: baseimage
timeout: 2h
variables:
DOCKERFILE: ci/baseimage.cuda.Dockerfile
WATCH_FILECHANGES: ci/baseimage.cuda.Dockerfile
PERSIST_IMAGE_NAME: $CSCS_REGISTRY_PATH/base/cosma-ci
KUBERNETES_MEMORY_REQUEST: "92Gi"
KUBERNETES_MEMORY_LIMIT: "92Gi"
build cosma:
extends: .container-builder-cscs-gh200
needs: ["build base image"]
stage: build
variables:
CSCS_REBUILD_POLICY: "always"
DOCKERFILE: ci/build.Dockerfile
PERSIST_IMAGE_NAME: $CSCS_REGISTRY_PATH/cosma/cosma-ci:$CI_COMMIT_SHA
ENVPATH: "/cosma-env-cuda"
DOCKER_BUILD_ARGS: '["BASE_IMAGE=${BASE_IMAGE}", "ENVPATH=$ENVPATH"]'
# Shared template for all test jobs; individual jobs set node/rank counts.
.run_tests:
extends: [.container-runner-daint-gh200]
needs: ["build cosma"]
stage: test
image: $CSCS_REGISTRY_PATH/cosma/cosma-ci:$CI_COMMIT_SHA
variables:
GIT_STRATEGY: none
MPICH_MAX_THREAD_SAFETY: multiple
CSCS_REGISTRY_LOGIN: 'YES'
PULL_IMAGE: 'YES'
SLURM_HINT: nomultithread
SLURM_UNBUFFEREDIO: ''
SLURM_CPU_BIND: 'socket'
SLURM_MPI: "pmi2"
CRAY_CUDA_MPS: 'YES'
# SLURM_WAIT: 0
COSMA_GPU_MAX_TILE_K: 100
COSMA_GPU_MAX_TILE_M: 100
COSMA_GPU_MAX_TILE_N: 100
mapper:
extends: .run_tests
stage: test
script: /cosma-env-cuda/.spack-env/view/bin/test.mapper
variables:
SLURM_JOB_NUM_NODES: 1
SLURM_NTASKS: 1
USE_MPI: 'YES'
pdgemm:
extends: .run_tests
stage: test
script: /cosma-env-cuda/.spack-env/view/bin/test.pdgemm
variables:
SLURM_JOB_NUM_NODES: 2
SLURM_NTASKS: 16
USE_MPI: 'YES'
multiply:
extends: .run_tests
stage: test
script: /cosma-env-cuda/.spack-env/view/bin/test.multiply
variables:
SLURM_JOB_NUM_NODES: 2
SLURM_NTASKS: 16
USE_MPI: 'YES'
scalar_matmul:
extends: .run_tests
stage: test
script: /cosma-env-cuda/.spack-env/view/bin/test.scalar_matmul
variables:
SLURM_JOB_NUM_NODES: 1
SLURM_NTASKS: 8
USE_MPI: 'YES'
multiply_using_layout:
extends: .run_tests
stage: test
script: /cosma-env-cuda/.spack-env/view/bin/test.multiply_using_layout
variables:
SLURM_JOB_NUM_NODES: 1
SLURM_NTASKS: 4
================================================
FILE: ci/mps-wrapper.sh
================================================
#!/bin/bash
# Example mps-wrapper.sh usage:
# > srun --cpu-bind=socket [...] mps-wrapper.sh
export CUDA_MPS_PIPE_DIRECTORY=/tmp/nvidia-mps
export CUDA_MPS_LOG_DIRECTORY=/tmp/nvidia-log
# Launch MPS from a single rank per node
if [ $SLURM_LOCALID -eq 0 ]; then
CUDA_VISIBLE_DEVICES=0,1,2,3 nvidia-cuda-mps-control -d
fi
# set cuda device
# Derive the NUMA node(s) this rank's CPU affinity mask maps to and expose
# only the matching GPU(s); memory is then bound to the same NUMA node(s).
numa_nodes=$(hwloc-calc --physical --intersect NUMAnode $(taskset -p $$ | awk '{print "0x"$6}'))
export CUDA_VISIBLE_DEVICES=$numa_nodes
# Run the command
exec numactl --membind=$numa_nodes "$@"
================================================
FILE: cmake/FindARMPL.cmake
================================================
# Copyright (c) 2022- ETH Zurich
#
# authors : Mathieu Taillefumier
include(FindPackageHandleStandardArgs)

# Candidate ARMPL installation prefixes: the ARMPL_ROOT CMake variable plus
# common environment variables.
set(_ARMPL_PATHS ${ARMPL_ROOT}
  $ENV{ARMPL_ROOT}
  $ENV{ARMPLROOT}
  $ENV{ARMPL_DIR}
  $ENV{ARMPLDIR}
  $ENV{ORNL_ARMPL_ROOT}
  $ENV{CRAY_ARMPL_ROOT})

# Locate every interface/threading flavour of the library.
foreach(_var armpl armpl_int64 armpl_ilp64 armpl_lp64 armpl_ilp64_mp armpl_lp64_mp)
  string(TOUPPER ${_var} _var_up)
  find_library("COSMA_${_var_up}_LINK_LIBRARIES" NAME ${_var} HINTS ${_ARMPL_PATHS} PATH_SUFFIXES "lib" "lib64" "armpl/lib" "armpl/lib64" "armpl")
endforeach()
find_path(COSMA_ARMPL_INCLUDE_DIRS NAMES "armpl.h" HINTS ${_ARMPL_PATHS} PATH_SUFFIXES "include" "armpl" "armpl/include" "include/armpl")

# Check for 64bit Integer support
if(COSMA_BLAS_INTERFACE MATCHES "64bits")
  set(COSMA_BLAS_armpl_LIB "ARMPL_ILP64")
else()
  set(COSMA_BLAS_armpl_LIB "ARMPL_LP64")
endif()

# Check for OpenMP support, VIA BLAS_VENDOR of Arm_mp or Arm_ipl64_mp
if(COSMA_BLAS_THREADING MATCHES "openmp")
  string(APPEND COSMA_BLAS_armpl_LIB "_MP")
endif()

# check if found
find_package_handle_standard_args(
  Armpl REQUIRED_VARS COSMA_ARMPL_INCLUDE_DIRS COSMA_ARMPL_LP64_LINK_LIBRARIES
  COSMA_ARMPL_LP64_MP_LINK_LIBRARIES COSMA_ARMPL_ILP64_LINK_LIBRARIES COSMA_ARMPL_ILP64_MP_LINK_LIBRARIES)

# add target to link against
if(NOT TARGET cosma::BLAS::ARMPL::armpl)
  add_library(cosma::BLAS::ARMPL::armpl INTERFACE IMPORTED)
  # now define an alias to the target library
  add_library(cosma::BLAS::ARMPL::blas ALIAS cosma::BLAS::ARMPL::armpl)
endif()

# we need to initialize the targets of each individual library only once.
foreach(_var armpl_ilp64 armpl_lp64 armpl_ilp64_mp armpl_lp64_mp)
  string(TOUPPER "${_var}" _var_up)
  if(NOT TARGET cosma::BLAS::ARMPL::${_var})
    add_library(cosma::BLAS::ARMPL::${_var} INTERFACE IMPORTED)
    set_property(TARGET cosma::BLAS::ARMPL::${_var} PROPERTY INTERFACE_INCLUDE_DIRECTORIES
      ${COSMA_ARMPL_INCLUDE_DIRS})
    set_property(TARGET cosma::BLAS::ARMPL::${_var} PROPERTY INTERFACE_LINK_LIBRARIES
      "${COSMA_${_var_up}_LINK_LIBRARIES}")
  endif()
endforeach()

# Point the umbrella target at the flavour selected above.
# NOTE(review): the previous version had an unmatched endif() after these
# set_property calls, which is a CMake syntax error; it has been removed.
set_property(TARGET cosma::BLAS::ARMPL::armpl PROPERTY INTERFACE_INCLUDE_DIRECTORIES
  ${COSMA_ARMPL_INCLUDE_DIRS})
set_property(TARGET cosma::BLAS::ARMPL::armpl PROPERTY INTERFACE_LINK_LIBRARIES
  "${COSMA_${COSMA_BLAS_armpl_LIB}_LINK_LIBRARIES}")

set(COSMA_BLAS_VENDOR "ARMPL")
mark_as_advanced(COSMA_ARMPL_FOUND COSMA_BLAS_VENDOR COSMA_ARMPL_INCLUDE_DIRS)
================================================
FILE: cmake/FindATLAS.cmake
================================================
# Copyright (c) 2019 ETH Zurich
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#.rst:
# FindATLAS
# -----------
#
# This module tries to find the ATLAS library.
#
# The following variables are set
#
# ::
#
# ATLAS_FOUND - True if atlas is found
# ATLAS_LIBRARIES - The required libraries
# ATLAS_INCLUDE_DIRS - The required include directory
#
# The following import target is created
#
# ::
#
# ATLAS::atlas
#set paths to look for library from ROOT variables.If new policy is set, find_library() automatically uses them.
# if(NOT POLICY CMP0074)
# Candidate ATLAS installation prefixes: the ATLAS_ROOT CMake variable plus
# common environment variables.
set(_ATLAS_PATHS ${ATLAS_ROOT}
  $ENV{ATLAS_ROOT}
  $ENV{ATLASROOT}
  $ENV{ATLAS_DIR}
  $ENV{ATLASDIR})
# endif()
find_library(
  COSMA_ATLAS_LINK_LIBRARIES
  NAMES "atlas"
  HINTS ${_ATLAS_PATHS}
  PATH_SUFFIXES "atlas/lib" "atlas/lib64" "atlas"
)
find_path(
  COSMA_ATLAS_INCLUDE_DIRS
  NAMES "cblas-atlas.h" "cblas_atlas.h" "cblas.h"
  HINTS ${_ATLAS_PATHS}
  PATH_SUFFIXES "atlas" "atlas/include" "include/atlas"
)
# check if found
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(ATLAS REQUIRED_VARS COSMA_ATLAS_INCLUDE_DIRS COSMA_ATLAS_LINK_LIBRARIES)
# add target to link against
if(NOT TARGET cosma::BLAS::ATLAS::atlas)
  add_library(cosma::BLAS::ATLAS::atlas INTERFACE IMPORTED)
  add_library(cosma::BLAS::ATLAS::blas ALIAS cosma::BLAS::ATLAS::atlas)
endif()
set_property(TARGET cosma::BLAS::ATLAS::atlas PROPERTY INTERFACE_LINK_LIBRARIES ${COSMA_ATLAS_LINK_LIBRARIES})
set_property(TARGET cosma::BLAS::ATLAS::atlas PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${COSMA_ATLAS_INCLUDE_DIRS})
# prevent clutter in cache: mark the cache entries this module actually
# creates (the previously listed ATLAS_LIBRARIES / ATLAS_INCLUDE_DIRS are
# never set by this module, so marking them had no effect).
mark_as_advanced(COSMA_ATLAS_LINK_LIBRARIES COSMA_ATLAS_INCLUDE_DIRS)
================================================
FILE: cmake/FindBLIS.cmake
================================================
# Copyright (c) 2019 ETH Zurich
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#.rst:
# FindBLIS
# -----------
#
# This module tries to find the BLIS library.
#
# The following variables are set
#
# ::
#
# BLIS_FOUND - True if blis is found
# BLIS_LIBRARIES - The required libraries
# BLIS_INCLUDE_DIRS - The required include directory
#
# The following import target is created
#
# ::
#
# BLIS::blis
# Set paths to look for the library from ROOT variables. If the new policy (CMP0074) is set, find_library() automatically uses them.
# if(NOT POLICY CMP0074)
# Candidate installation prefixes, taken from the BLIS_ROOT cache variable
# and the usual environment variables.
set(_BLIS_PATHS
    ${BLIS_ROOT}
    $ENV{BLIS_ROOT}
    $ENV{BLISROOT}
    $ENV{BLIS_DIR}
    $ENV{BLISDIR})

# The BLIS library itself, its native header, and a CBLAS-compatible header.
find_library(
  COSMA_BLIS_LINK_LIBRARIES
  NAMES "blis"
  HINTS ${_BLIS_PATHS}
  PATH_SUFFIXES "lib" "lib64" "blis/lib" "blis/lib64" "blis")

find_path(
  COSMA_BLIS_INCLUDE_DIRS
  NAMES "blis.h"
  HINTS ${_BLIS_PATHS}
  PATH_SUFFIXES "include" "blis" "blis/include" "include/blis")

find_path(
  COSMA_BLIS_CBLAS_INCLUDE_DIRS
  NAMES "cblas_blis.h" "cblas-blis.h" "cblas.h"
  HINTS ${_BLIS_PATHS}
  PATH_SUFFIXES "include" "blis" "blis/include" "include/blis")

# Handle QUIET/REQUIRED and set BLIS_FOUND.
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(
  BLIS REQUIRED_VARS COSMA_BLIS_INCLUDE_DIRS COSMA_BLIS_LINK_LIBRARIES
                     COSMA_BLIS_CBLAS_INCLUDE_DIRS)

# Imported interface target, plus the generic ::blas alias consumed by
# the FindBlas dispatcher.
if(NOT TARGET cosma::BLAS::BLIS::blis)
  add_library(cosma::BLAS::BLIS::blis INTERFACE IMPORTED)
  add_library(cosma::BLAS::BLIS::blas ALIAS cosma::BLAS::BLIS::blis)
endif()
set_target_properties(
  cosma::BLAS::BLIS::blis
  PROPERTIES INTERFACE_LINK_LIBRARIES "${COSMA_BLIS_LINK_LIBRARIES}"
             INTERFACE_INCLUDE_DIRECTORIES
             "${COSMA_BLIS_INCLUDE_DIRS};${COSMA_BLIS_CBLAS_INCLUDE_DIRS}")

# prevent clutter in cache
mark_as_advanced(BLIS_FOUND COSMA_BLIS_LINK_LIBRARIES COSMA_BLIS_INCLUDE_DIRS
                 COSMA_BLIS_CBLAS_INCLUDE_DIRS)
================================================
FILE: cmake/FindBlas.cmake
================================================
# Copyright (c) 2022- ETH Zurich
#
# authors : Mathieu Taillefumier
include(FindPackageHandleStandardArgs)

# BLAS needs at least one compiled language to be linked against.
if(NOT
   (CMAKE_C_COMPILER_LOADED
    OR CMAKE_CXX_COMPILER_LOADED
    OR CMAKE_Fortran_COMPILER_LOADED))
  message(FATAL_ERROR "FindBLAS requires Fortran, C, or C++ to be enabled.")
endif()

# Known vendors. "CUSTOM" is an alias for GenericBLAS: the dispatch branch
# below always handled it, but the value used to be rejected by this
# validation, making that branch unreachable.
set(COSMA_BLAS_VENDOR_LIST
    "auto"
    "MKL"
    "OPENBLAS"
    "FLEXIBLAS"
    "ARMPL"
    "GenericBLAS"
    "CUSTOM"
    "CRAY_LIBSCI"
    "BLIS"
    "ATLAS"
    "NVPL"
    "OFF")

# COSMA_BLAS_VENDOR should normally be defined here but cosma defines it in the
# main CMakeLists.txt to keep the old behavior. the threading and integer
# interface can also be controlled but are fixed to the default values that
# COSMA was configured before introducing this module. So if findBLAS.cmake is
# to be used elsewhere, it is better to look at what CP2K does and start from
# there
if(NOT ${COSMA_BLAS_VENDOR} IN_LIST COSMA_BLAS_VENDOR_LIST)
  message(FATAL_ERROR "Invalid Host BLAS backend")
endif()

set(COSMA_BLAS_THREAD_LIST "sequential" "thread" "gnu-thread" "intel-thread"
    "tbb-thread" "openmp")
set(COSMA_BLAS_THREADING
    "openmp"
    CACHE STRING "threaded blas library")
set_property(CACHE COSMA_BLAS_THREADING PROPERTY STRINGS
             ${COSMA_BLAS_THREAD_LIST})
if(NOT ${COSMA_BLAS_THREADING} IN_LIST COSMA_BLAS_THREAD_LIST)
  message(FATAL_ERROR "Invalid threaded BLAS backend")
endif()

set(COSMA_BLAS_INTERFACE_BITS_LIST "32bits" "64bits")
set(COSMA_BLAS_INTERFACE
    "32bits"
    CACHE STRING
          "32 bits integers are used for indices, matrices and vectors sizes")
set_property(CACHE COSMA_BLAS_INTERFACE
             PROPERTY STRINGS ${COSMA_BLAS_INTERFACE_BITS_LIST})
if(NOT ${COSMA_BLAS_INTERFACE} IN_LIST COSMA_BLAS_INTERFACE_BITS_LIST)
  message(
    FATAL_ERROR
      "Invalid parameters. Blas and lapack can exist in two flavors 32 or 64 bits interfaces (relevant mostly for mkl)"
  )
endif()

# Nothing to do when the host BLAS search is disabled.
if(COSMA_BLAS_VENDOR MATCHES "OFF")
  return()
endif()

set(COSMA_BLAS_FOUND FALSE)

# first check for a specific implementation if requested
if(NOT COSMA_BLAS_VENDOR MATCHES "auto")
  if(COSMA_BLAS_VENDOR MATCHES "CUSTOM")
    # CUSTOM is implemented by the GenericBLAS module, so the created target
    # is named after GenericBLAS, not CUSTOM.
    find_package(GenericBLAS REQUIRED)
    set(_cosma_blas_pkg "GenericBLAS")
  else()
    find_package(${COSMA_BLAS_VENDOR} REQUIRED)
    set(_cosma_blas_pkg "${COSMA_BLAS_VENDOR}")
  endif()
  if(TARGET cosma::BLAS::${_cosma_blas_pkg}::blas)
    get_target_property(COSMA_BLAS_INCLUDE_DIRS cosma::BLAS::${_cosma_blas_pkg}::blas
                        INTERFACE_INCLUDE_DIRECTORIES)
    get_target_property(COSMA_BLAS_LINK_LIBRARIES cosma::BLAS::${_cosma_blas_pkg}::blas
                        INTERFACE_LINK_LIBRARIES)
    set(COSMA_BLAS_FOUND TRUE)
  endif()
  unset(_cosma_blas_pkg)
else()
  # search for any blas implementation and exit immediately if one is found
  foreach(_libs ${COSMA_BLAS_VENDOR_LIST})
    # Skip the pseudo-entries: "auto" and "OFF" are modes, not find modules
    # (the old code ended up calling find_package(OFF)), and "CUSTOM" is an
    # alias for GenericBLAS, which is probed anyway.
    if(NOT _libs MATCHES "^(auto|OFF|CUSTOM)$")
      find_package(${_libs})
      if(TARGET cosma::BLAS::${_libs}::blas)
        get_target_property(COSMA_BLAS_INCLUDE_DIRS cosma::BLAS::${_libs}::blas
                            INTERFACE_INCLUDE_DIRECTORIES)
        get_target_property(COSMA_BLAS_LINK_LIBRARIES cosma::BLAS::${_libs}::blas
                            INTERFACE_LINK_LIBRARIES)
        set(COSMA_BLAS_VENDOR "${_libs}")
        set(COSMA_BLAS_FOUND TRUE)
        break()
      endif()
    endif()
  endforeach()
endif()

# Some vendors (e.g. CRAY_LIBSCI, see FindCRAY_LIBSCI.cmake) expose no include
# directories; only require them when they exist. (The previous debug
# message(WARNING ...) in the else branch has been removed.)
if(COSMA_BLAS_INCLUDE_DIRS)
  find_package_handle_standard_args(
    Blas REQUIRED_VARS COSMA_BLAS_LINK_LIBRARIES COSMA_BLAS_INCLUDE_DIRS
                       COSMA_BLAS_VENDOR)
else()
  find_package_handle_standard_args(
    Blas REQUIRED_VARS COSMA_BLAS_LINK_LIBRARIES COSMA_BLAS_VENDOR)
endif()

# Generic imported target the rest of COSMA links against.
if(NOT TARGET cosma::BLAS::blas)
  add_library(cosma::BLAS::blas INTERFACE IMPORTED)
endif()
set_target_properties(cosma::BLAS::blas PROPERTIES INTERFACE_LINK_LIBRARIES
                      "${COSMA_BLAS_LINK_LIBRARIES}")
if(COSMA_BLAS_INCLUDE_DIRS)
  set_target_properties(cosma::BLAS::blas PROPERTIES INTERFACE_INCLUDE_DIRECTORIES
                        "${COSMA_BLAS_INCLUDE_DIRS}")
endif()

mark_as_advanced(COSMA_BLAS_INCLUDE_DIRS)
mark_as_advanced(COSMA_BLAS_LINK_LIBRARIES)
mark_as_advanced(COSMA_BLAS_VENDOR)
mark_as_advanced(COSMA_BLAS_FOUND)
================================================
FILE: cmake/FindCRAY_LIBSCI.cmake
================================================
# Locate Cray LibSci, the BLAS/LAPACK/ScaLAPACK bundle of the Cray PE.
include(FindPackageHandleStandardArgs)

# Library names assume the GNU programming environment; the accelerated
# variant (sci_acc) is preferred when present.
set(_sciname "sci_gnu_mpi_mp")
set(_sciname_acc "sci_acc_gnu_nv60")

find_library(COSMA_CRAY_LIBSCI_LIBRARIES
  NAMES ${_sciname_acc} ${_sciname}
  HINTS
  ${_SCALAPACK_LIBRARY_DIRS}
  ENV CRAY_LIBSCI_PREFIX_DIR
  ENV CRAY_PE_LIBSCI_PREFIX_DIR
  ENV CRAY_LIBSCI_ACC_PREFIX_DIR
  ENV CRAY_PE_LIBSCI_ACC_PREFIX_DIR
  PATH_SUFFIXES lib
  DOC "Path to the Cray-libsci library.")

# FIX: use STATUS so this prints as a regular progress line instead of the
# "important notice" channel (stderr) a bare message() uses.
message(STATUS "CRAY_LIBSCI: ${COSMA_CRAY_LIBSCI_LIBRARIES}")

find_package_handle_standard_args(CRAY_LIBSCI DEFAULT_MSG COSMA_CRAY_LIBSCI_LIBRARIES)

# LibSci ships BLAS and ScaLAPACK in the same library, so both the blas and
# the scalapack_link interfaces point at the same file. No include
# directories are exported by this module.
if (NOT TARGET cosma::BLAS::CRAY_LIBSCI::sci)
  add_library(cosma::BLAS::CRAY_LIBSCI::sci INTERFACE IMPORTED)
  set_target_properties(cosma::BLAS::CRAY_LIBSCI::sci
    PROPERTIES INTERFACE_LINK_LIBRARIES "${COSMA_CRAY_LIBSCI_LIBRARIES}")
  add_library(cosma::BLAS::CRAY_LIBSCI::blas ALIAS cosma::BLAS::CRAY_LIBSCI::sci)
  add_library(cosma::BLAS::CRAY_LIBSCI::scalapack_link INTERFACE IMPORTED)
  set_target_properties(cosma::BLAS::CRAY_LIBSCI::scalapack_link
    PROPERTIES INTERFACE_LINK_LIBRARIES "${COSMA_CRAY_LIBSCI_LIBRARIES}")
endif()
================================================
FILE: cmake/FindFLEXIBLAS.cmake
================================================
# Copyright (c) 2022- ETH Zurich
#
# authors : Mathieu Taillefumier
# Locate FlexiBLAS (via pkg-config) and expose it as cosma::BLAS::FLEXIBLAS::*.
include(FindPackageHandleStandardArgs)

# NOTE(review): these prefixes are collected but never passed to the
# pkg-config search below; they only take effect if the user also extends
# PKG_CONFIG_PATH — confirm whether they should be wired in.
set(_FLEXIBLAS_PATHS
    ${FLEXIBLAS_ROOT}
    $ENV{FLEXIBLAS_ROOT}
    $ENV{FLEXIBLASROOT}
    $ENV{FLEXIBLAS_DIR}
    $ENV{FLEXIBLASDIR}
    $ENV{ORNL_FLEXIBLAS_ROOT}
    $ENV{CRAY_FLEXIBLAS_ROOT})

# try first with pkg-config
find_package(PkgConfig QUIET)
if(PKG_CONFIG_FOUND)
  pkg_check_modules(COSMA_FLEXIBLAS IMPORTED_TARGET GLOBAL flexiblas)
endif()

find_package_handle_standard_args(
  FLEXIBLAS DEFAULT_MSG COSMA_FLEXIBLAS_INCLUDE_DIRS
  COSMA_FLEXIBLAS_LINK_LIBRARIES)

if(COSMA_FLEXIBLAS_FOUND)
  # FIX: use the canonical upper-case vendor name. FindBlas.cmake derives
  # target names as cosma::BLAS::${COSMA_BLAS_VENDOR}::blas, so the previous
  # value "FlexiBLAS" made that lookup fail.
  set(COSMA_BLAS_VENDOR "FLEXIBLAS")
  if(NOT TARGET cosma::BLAS::FLEXIBLAS::flexiblas)
    add_library(cosma::BLAS::FLEXIBLAS::flexiblas INTERFACE IMPORTED)
    add_library(cosma::BLAS::FLEXIBLAS::blas ALIAS cosma::BLAS::FLEXIBLAS::flexiblas)
  endif()
  set_target_properties(
    cosma::BLAS::FLEXIBLAS::flexiblas PROPERTIES INTERFACE_LINK_LIBRARIES
    "${COSMA_FLEXIBLAS_LINK_LIBRARIES}")
  if(COSMA_FLEXIBLAS_INCLUDE_DIRS)
    set_target_properties(
      cosma::BLAS::FLEXIBLAS::flexiblas PROPERTIES INTERFACE_INCLUDE_DIRECTORIES
      "${COSMA_FLEXIBLAS_INCLUDE_DIRS}")
  endif()
endif()

mark_as_advanced(COSMA_FLEXIBLAS_FOUND COSMA_FLEXIBLAS_INCLUDE_DIRS
                 COSMA_FLEXIBLAS_LINK_LIBRARIES COSMA_BLAS_VENDOR)
================================================
FILE: cmake/FindGenericBLAS.cmake
================================================
# Copyright (c) 2022- ETH Zurich
#
# authors : Mathieu Taillefumier
# Locate a generic (reference) BLAS library plus an optional separate CBLAS.
include(FindPackageHandleStandardArgs)

if(NOT POLICY CMP0074)
  # Without CMP0074, <Pkg>_ROOT variables are not honoured automatically,
  # so collect the candidate prefixes by hand.
  set(_GenericBLAS_PATHS ${GenericBLAS_ROOT} $ENV{GenericBLAS_ROOT})
endif()

find_library(
  COSMA_GenericBLAS_LINK_LIBRARIES
  NAMES "blas"
  HINTS ${_GenericBLAS_PATHS})
find_library(
  # optionally look for a separate cblas library - not required
  COSMA_GenericBLAS_CBLAS_LIBRARIES
  NAMES "cblas"
  HINTS ${_GenericBLAS_PATHS})
find_path(
  COSMA_GenericBLAS_INCLUDE_DIRS
  NAMES "cblas.h"
  HINTS ${_GenericBLAS_PATHS})

# Headers are optional: only list them as required when they were found.
if(COSMA_GenericBLAS_INCLUDE_DIRS)
  find_package_handle_standard_args(
    GenericBLAS REQUIRED_VARS COSMA_GenericBLAS_INCLUDE_DIRS
                              COSMA_GenericBLAS_LINK_LIBRARIES)
else()
  find_package_handle_standard_args(GenericBLAS
    REQUIRED_VARS COSMA_GenericBLAS_LINK_LIBRARIES)
endif()

# FIX: the original tested and appended misspelled variable names
# (COSMA_GenericBLAS_CBLAS_LINK_LIBRARIES / GenericBLAS_LINK_LIBRARIES),
# so a separately-found cblas library was never linked. It also contained a
# stray unmatched endif() that broke parsing of this module.
if(COSMA_GenericBLAS_CBLAS_LIBRARIES)
  list(APPEND COSMA_GenericBLAS_LINK_LIBRARIES ${COSMA_GenericBLAS_CBLAS_LIBRARIES})
endif()

# add target to link against
if(NOT TARGET cosma::BLAS::GenericBLAS::blas)
  add_library(cosma::BLAS::GenericBLAS::blas INTERFACE IMPORTED)
endif()
set_property(TARGET cosma::BLAS::GenericBLAS::blas
  PROPERTY INTERFACE_LINK_LIBRARIES ${COSMA_GenericBLAS_LINK_LIBRARIES})
set_property(TARGET cosma::BLAS::GenericBLAS::blas
  PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${COSMA_GenericBLAS_INCLUDE_DIRS})

# prevent clutter in cache
mark_as_advanced(COSMA_GenericBLAS_FOUND COSMA_GenericBLAS_LINK_LIBRARIES
                 COSMA_GenericBLAS_INCLUDE_DIRS COSMA_GenericBLAS_CBLAS_LIBRARIES)
================================================
FILE: cmake/FindMKL.cmake
================================================
#
# CMake recipes https://github.com/eth-cscs/cmake-recipes
#
# Copyright (c) 2018-2019, ETH Zurich BSD 3-Clause License. All rights reserved.
#
# Author: Teodor Nikolov (teodor.nikolov22@gmail.com)
#
#[=======================================================================[.rst:
FindMKL
-------
The following conventions are used:
intel / INTEL - Bindings for everything except GNU Fortran
gf / GF - GNU Fortran bindings
seq / SEQ - sequential MKL
omp / OMP - threaded MKL with OpenMP back end
tbb / TBB - threaded MKL with TBB back end
32bit / 32BIT - MKL 32 bit integer interface (used most often)
64bit / 64BIT - MKL 64 bit integer interface
mpich / MPICH - MPICH / IntelMPI BLACS back end
ompi / OMPI - OpenMPI BLACS back end
st / ST - static libraries
dyn / DYN - dynamic libraries
The module attempts to define a target for each MKL configuration. The
configuration will not be available if there are missing library files or a
missing dependency.
MKL Link line advisor:
https://software.intel.com/en-us/articles/intel-mkl-link-line-advisor
Note: Mixing GCC and Intel OpenMP backends is a bad idea.
Search variables
^^^^^^^^^^^^^^^^
``MKLROOT``
Environment variable set to MKL's root directory
``MKL_ROOT``
CMake variable set to MKL's root directory
Example usage
^^^^^^^^^^^^^
To Find MKL:
find_package(MKL REQUIRED)
To check if target is available:
if (TARGET MKL::scalapack_mpich_intel_32bit_omp_dyn)
...
endif()
To link to an available target (see list below):
target_link_libraries(... MKL::scalapack_mpich_intel_32bit_omp_dyn)
Note: dependencies are handled for you (MPI, OpenMP, ...)
the target MKL::blas, MKL::MKL, MKL::lapack also include all necessary libraries
for linking.
MKL::MKL is also used by the cmake module provided by intel.
MKL::scalapack_link gives all libraries needed for scalapack.
Imported targets
^^^^^^^^^^^^^^^^
MKL (BLAS, LAPACK, FFT) targets:
MKL::[gf|intel]_[32bit|64bit]_[seq|omp|tbb]_[st|dyn] e.g.
MKL::mkl_intel_32bit_omp_dyn
BLACS targets:
MKL::blacs_[mpich|ompi]_[gf|intel]_[32bit|64bit]_[seq|omp|tbb]_[st|dyn] e.g.
MKL::blacs_intel_mpich_32bit_seq_st
ScaLAPACK targets:
MKL::scalapack_[mpich|ompi]_[gf|intel]_[32bit|64bit]_[seq|omp|tbb]_[st|dyn] e.g.
MKL::scalapack_mpich_intel_64bit_omp_dyn
Result variables
^^^^^^^^^^^^^^^^
MKL_FOUND
Not supported
^^^^^^^^^^^^^
- F95 interfaces
#]=======================================================================]
# Copyright (c) 2022- ETH Zurich
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
include(FindPackageHandleStandardArgs)

# MKL is linked differently depending on language bindings; at least one
# compiled language must be enabled.
if(NOT
   (CMAKE_C_COMPILER_LOADED
    OR CMAKE_CXX_COMPILER_LOADED
    OR CMAKE_Fortran_COMPILER_LOADED))
  message(FATAL_ERROR "FindMKL requires Fortran, C, or C++ to be enabled.")
endif()

# Dependencies
#
enable_language(Fortran)
enable_language(C)
find_package(Threads)
find_package(MPI COMPONENTS CXX C Fortran)
find_package(OpenMP COMPONENTS CXX C Fortran)

# If MKL_ROOT is not set, set it via the env variable MKLROOT.
#
if(NOT DEFINED MKL_ROOT)
  set(MKL_ROOT
      $ENV{MKLROOT}
      CACHE PATH "MKL's root directory.")
endif()

# Determine MKL's library folder
#
set(_mkl_libpath_suffix "intel64")
if(CMAKE_SIZEOF_VOID_P EQUAL 4) # 32 bit
  set(_mkl_libpath_suffix "ia32")
endif()

# Per-platform file naming. Both the plain arch directory and the
# OS-suffixed one (<arch>_win/_mac/_lin) are added to the search list,
# since different MKL releases use either layout.
if(WIN32)
  list(APPEND _mkl_libpath_suffix_list ${_mkl_libpath_suffix})
  string(APPEND _mkl_libpath_suffix "_win")
  list(APPEND _mkl_libpath_suffix_list ${_mkl_libpath_suffix})
  set(_mkl_libname_prefix "")
  set(_mkl_shared_lib "_dll.lib")
  set(_mkl_static_lib ".lib")
elseif(APPLE)
  list(APPEND _mkl_libpath_suffix_list ${_mkl_libpath_suffix})
  string(APPEND _mkl_libpath_suffix "_mac")
  list(APPEND _mkl_libpath_suffix_list ${_mkl_libpath_suffix})
  set(_mkl_libname_prefix "lib")
  set(_mkl_shared_lib ".dylib")
  set(_mkl_static_lib ".a")
else() # LINUX
  list(APPEND _mkl_libpath_suffix_list ${_mkl_libpath_suffix})
  string(APPEND _mkl_libpath_suffix "_lin")
  list(APPEND _mkl_libpath_suffix_list ${_mkl_libpath_suffix})
  set(_mkl_libname_prefix "lib")
  set(_mkl_shared_lib ".so")
  set(_mkl_static_lib ".a")
endif()

set(_mkl_search_paths "${MKL_ROOT}" "${MKL_ROOT}/lib" "${MKL_ROOT}/mkl/lib"
    "${MKL_ROOT}/compiler/lib")

# Functions: finds both static and shared MKL libraries
#
# Look up both linkage flavours of one MKL library and cache the results as
# <_varname>_DYN (shared) and <_varname>_ST (static). Search locations and
# file naming come from the _mkl_* variables computed above.
function(__mkl_find_library _varname _libname)
  set(_ext_DYN "${_mkl_shared_lib}")
  set(_ext_ST "${_mkl_static_lib}")
  foreach(_linkage DYN ST)
    find_library(
      ${_varname}_${_linkage}
      NAMES ${_mkl_libname_prefix}${_libname}${_ext_${_linkage}}
      HINTS ${_mkl_search_paths}
      PATH_SUFFIXES ${_mkl_libpath_suffix_list})
    mark_as_advanced(${_varname}_${_linkage})
  endforeach()
endfunction()
# Find MKL headers
#
find_path(COSMA_MKL_INCLUDE_DIRS mkl.h HINTS ${MKL_ROOT}/include
          ${MKL_ROOT}/mkl/include)
mark_as_advanced(COSMA_MKL_INCLUDE_DIRS)

# Group flags for static libraries on Linux (GNU, PGI, ICC -> same linker)
#
if(UNIX AND NOT APPLE)
  set(_mkl_linker_pre_flags_ST "-Wl,--start-group")
  set(_mkl_linker_post_flags_ST "-Wl,--end-group")
endif()

# Core MKL
#
__mkl_find_library(MKL_CORE_LIB mkl_core)

# Interface
#
__mkl_find_library(MKL_INTERFACE_INTEL_32BIT_LIB mkl_intel_lp64)
__mkl_find_library(MKL_INTERFACE_INTEL_64BIT_LIB mkl_intel_ilp64)
# The gf (GNU Fortran) bindings are only searched when compiling Fortran
# with GCC, and never on macOS.
if(NOT APPLE
   AND CMAKE_Fortran_COMPILER_LOADED
   AND CMAKE_Fortran_COMPILER_ID STREQUAL "GNU")
  __mkl_find_library(MKL_INTERFACE_GF_32BIT_LIB mkl_gf_lp64)
  __mkl_find_library(MKL_INTERFACE_GF_64BIT_LIB mkl_gf_ilp64)
endif()

# Threading: sequential always; the OpenMP layer matches the compiler family.
#
__mkl_find_library(MKL_SEQ_LIB mkl_sequential)
if(NOT APPLE
   AND (CMAKE_C_COMPILER_ID STREQUAL "GNU"
        OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU"
        OR CMAKE_Fortran_COMPILER_ID STREQUAL "GNU"))
  __mkl_find_library(MKL_OMP_LIB mkl_gnu_thread)
else()
  __mkl_find_library(MKL_OMP_LIB mkl_intel_thread)
endif()
__mkl_find_library(MKL_TBB_LIB mkl_tbb_thread)

# BLACS: the "MPICH" flavour maps to mkl_blacs_mpich on macOS and to the
# intelmpi variant elsewhere; the Open MPI flavour is separate.
#
if(APPLE)
  __mkl_find_library(MKL_BLACS_MPICH_32BIT_LIB mkl_blacs_mpich_lp64)
  __mkl_find_library(MKL_BLACS_MPICH_64BIT_LIB mkl_blacs_mpich_ilp64)
else()
  __mkl_find_library(MKL_BLACS_MPICH_32BIT_LIB mkl_blacs_intelmpi_lp64)
  __mkl_find_library(MKL_BLACS_MPICH_64BIT_LIB mkl_blacs_intelmpi_ilp64)
endif()
__mkl_find_library(MKL_BLACS_OMPI_32BIT_LIB mkl_blacs_openmpi_lp64)
__mkl_find_library(MKL_BLACS_OMPI_64BIT_LIB mkl_blacs_openmpi_ilp64)

# ScaLAPACK
#
__mkl_find_library(MKL_SCALAPACK_32BIT_LIB mkl_scalapack_lp64)
__mkl_find_library(MKL_SCALAPACK_64BIT_LIB mkl_scalapack_ilp64)

# Check if core libs were found
#
find_package_handle_standard_args(MKL REQUIRED_VARS COSMA_MKL_INCLUDE_DIRS
                                                    Threads_FOUND)

# Sequential has no threading dependency. There is currently no TBB module
# shipped with CMake. The dependency is not accounted for. (FIXME)
#
set(_mkl_dep_found_SEQ TRUE)
set(_mkl_dep_found_TBB TRUE)
if(TARGET OpenMP::OpenMP_CXX)
  set(_mkl_dep_OMP ${OpenMP_CXX_LIBRARIES})
  set(_mkl_dep_found_OMP TRUE)
endif()
# Define all blas, blacs and scalapack
#
foreach(_libtype "ST" "DYN")
set(_mkl_core_lib ${MKL_CORE_LIB_${_libtype}})
foreach(_bits "32BIT" "64BIT")
set(_mkl_scalapack_lib ${MKL_SCALAPACK_${_bits}_LIB_${_libtype}})
foreach(_iface "INTEL" "GF")
set(_mkl_interface_lib
${MKL_INTERFACE_${_iface}_${_bits}_LIB_${_libtype}})
foreach(_threading "SEQ" "OMP" "TBB")
set(_mkl_threading_lib ${MKL_${_threading}_LIB_${_libtype}})
string(TOLOWER "${_iface}_${_bits}_${_threading}_${_libtype}"
_tgt_config)
set(_mkl_tgt cosma::BLAS::MKL::${_tgt_config})
if(MKL_FOUND
AND _mkl_interface_lib
AND _mkl_threading_lib
AND _mkl_core_lib
AND _mkl_dep_found_${_threading}
AND NOT TARGET ${_mkl_tgt})
set(_mkl_libs
"${_mkl_linker_pre_flags_${_threading}}"
"${_mkl_interface_lib}"
"${_mkl_threading_lib}"
"${_mkl_core_lib}"
"${_mkl_linker_post_flags_${_threading}}"
"${_mkl_dep_${_threading}}"
"Threads::Threads")
add_library(${_mkl_tgt} INTERFACE IMPORTED)
set_target_properties(
${_mkl_tgt}
PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${COSMA_MKL_INCLUDE_DIRS}"
INTERFACE_LINK_LIBRARIES "${_mkl_libs}")
endif()
foreach(_mpi_impl "MPICH" "OMPI")
set(_mkl_blacs_lib ${MKL_BLACS_${_mpi_impl}_${_bits}_LIB_${_libtype}})
string(
TOLOWER "${_mpi_impl}_${_iface}_${_bits}_${_threading}_${_libtype}"
_tgt_config)
set(_scalapack_tgt cosma::BLAS::MKL::scalapack_${_tgt_config})
if(_mkl_blacs_lib
AND TARGET ${_mkl_tgt}
AND TARGET MPI::MPI_CXX
AND NOT TARGET cosma::BLAS::MKL::blacs_${_tgt_config})
set(_blacs_libs
"${_mkl_linker_pre_flags_${_libtype}}"
"${_mkl_interface_lib}"
"${_mkl_threading_lib}"
"${_mkl_core_lib}"
"${_mkl_blacs_lib}"
"${_mkl_linker_post_flags_${_libtype}}"
"MPI::MPI_CXX"
"${_mkl_dep_${_threading}}"
"Threads::Threads")
add_library(cosma::BLAS::MKL::blacs_${_tgt_config} INTERFACE IMPORTED)
set_target_properties(
cosma::BLAS::MKL::blacs_${_tgt_config}
PROPERTIES INTERFACE_INCLUDE_DIRECTORIES
"${COSMA_MKL_INCLUDE_DIRS}" INTERFACE_LINK_LIBRARIES
"${_mkl_blacs_lib}")
endif()
if(_mkl_scalapack_lib AND NOT TARGET
cosma::BLAS::MKL::scalapack_${_tgt_config})
set(_scalapack_libs "${_mkl_scalapack_lib}" "${_blacs_tgt}")
add_library(cosma::BLAS::MKL::scalapack_${_tgt_config} INTERFACE IMPORTED)
set_target_properties(
cosma::BLAS::MKL::scalapack_${_tgt_config}
PROPERTIES INTERFACE_LINK_LIBRARIES "${_scalapack_libs}")
endif()
endforeach()
endforeach()
endforeach()
endforeach()
endforeach()
if(MKL_FOUND)
  # The loops above define a target for every feasible MKL configuration;
  # here we select the single configuration matching the current compiler,
  # threading option and integer width, and expose it under canonical names.
  #
  # Interface layer: GNU Fortran needs the gf bindings (not on macOS),
  # everything else uses the intel bindings.
  if(CMAKE_Fortran_COMPILER_LOADED
     AND CMAKE_Fortran_COMPILER_ID STREQUAL "GNU"
     AND NOT APPLE)
    set(COSMA_BLAS_mkl_INTFACE "gf")
  else()
    set(COSMA_BLAS_mkl_INTFACE "intel")
  endif()
  # Map COSMA_BLAS_THREADING onto MKL's threading-layer names.
  if(COSMA_BLAS_THREADING MATCHES "thread|gnu-thread|openmp")
    set(COSMA_BLAS_mkl_thread__ "omp")
  endif()
  if(COSMA_BLAS_THREADING MATCHES "sequential")
    set(COSMA_BLAS_mkl_thread__ "seq")
  endif()
  if(COSMA_BLAS_THREADING MATCHES "intel-thread")
    set(COSMA_BLAS_mkl_thread__ "intel")
  endif()
  if(COSMA_BLAS_THREADING MATCHES "tbb")
    set(COSMA_BLAS_mkl_thread__ "tbb")
  endif()
  # Integer width: lp64 ("32bit") vs ilp64 ("64bit").
  if(COSMA_BLAS_INTERFACE MATCHES "64bits")
    set(COSMA_BLAS_mkl_ILP_MODE "64bit")
  else()
    set(COSMA_BLAS_mkl_ILP_MODE "32bit")
  endif()
  # Pull includes and libraries from the selected dynamic-linkage target.
  get_target_property(
    MKL_BLAS_INCLUDE_DIRS
    cosma::BLAS::MKL::${COSMA_BLAS_mkl_INTFACE}_${COSMA_BLAS_mkl_ILP_MODE}_${COSMA_BLAS_mkl_thread__}_dyn
    INTERFACE_INCLUDE_DIRECTORIES)
  get_target_property(
    MKL_BLAS_LIBRARIES
    cosma::BLAS::MKL::${COSMA_BLAS_mkl_INTFACE}_${COSMA_BLAS_mkl_ILP_MODE}_${COSMA_BLAS_mkl_thread__}_dyn
    INTERFACE_LINK_LIBRARIES)
  # Canonical targets the rest of the build links against.
  if(NOT TARGET cosma::BLAS::MKL::blas)
    add_library(cosma::BLAS::MKL::MKL INTERFACE IMPORTED)
    add_library(cosma::BLAS::MKL::blas ALIAS cosma::BLAS::MKL::MKL)
    # create an empty lapack placeholder target (MKL bundles LAPACK)
    add_library(cosma::BLAS::MKL::lapack INTERFACE IMPORTED)
  endif()
  set_target_properties(
    cosma::BLAS::MKL::MKL
    PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${COSMA_MKL_INCLUDE_DIRS}"
               INTERFACE_LINK_LIBRARIES "${MKL_BLAS_LIBRARIES}")
  # BLACS flavour follows the MPI implementation; MPI_CXX_LIBRARY_VERSION_STRING
  # is only populated when MPI_DETERMINE_LIBRARY_VERSION was set.
  if("${MPI_CXX_LIBRARY_VERSION_STRING}" MATCHES "Open MPI")
    set(__mkl_mpi_ver_ "ompi")
  else()
    set(__mkl_mpi_ver_ "mpich")
  endif()
  get_target_property(
    __mkl_scalapack_inc
    cosma::BLAS::MKL::scalapack_${__mkl_mpi_ver_}_${COSMA_BLAS_mkl_INTFACE}_${COSMA_BLAS_mkl_ILP_MODE}_${COSMA_BLAS_mkl_thread__}_dyn
    INTERFACE_INCLUDE_DIRECTORIES)
  get_target_property(
    __mkl_scalapack_lib
    cosma::BLAS::MKL::scalapack_${__mkl_mpi_ver_}_${COSMA_BLAS_mkl_INTFACE}_${COSMA_BLAS_mkl_ILP_MODE}_${COSMA_BLAS_mkl_thread__}_dyn
    INTERFACE_LINK_LIBRARIES)
  get_target_property(
    __mkl_blacs_inc
    cosma::BLAS::MKL::blacs_${__mkl_mpi_ver_}_${COSMA_BLAS_mkl_INTFACE}_${COSMA_BLAS_mkl_ILP_MODE}_${COSMA_BLAS_mkl_thread__}_dyn
    INTERFACE_INCLUDE_DIRECTORIES)
  get_target_property(
    __mkl_blacs_lib
    cosma::BLAS::MKL::blacs_${__mkl_mpi_ver_}_${COSMA_BLAS_mkl_INTFACE}_${COSMA_BLAS_mkl_ILP_MODE}_${COSMA_BLAS_mkl_thread__}_dyn
    INTERFACE_LINK_LIBRARIES)
  # Aggregate target carrying the full ScaLAPACK + BLACS link line;
  # consumed by FindSCALAPACK.cmake.
  if(NOT TARGET cosma::BLAS::MKL::scalapack_link)
    add_library(cosma::BLAS::MKL::scalapack_link INTERFACE IMPORTED)
    set_target_properties(
      cosma::BLAS::MKL::scalapack_link
      PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${__mkl_scalapack_inc}"
                 INTERFACE_LINK_LIBRARIES
                 "${__mkl_scalapack_lib};${__mkl_blacs_lib}")
  endif()
  # Clean up helper variables. (BLAS_mkl_OMP / BLAS_mkl_OS_NAME are never set
  # in this module; the unsets appear to be harmless leftovers.)
  unset(COSMA_BLAS_mkl_ILP_MODE)
  unset(COSMA_BLAS_mkl_INTFACE)
  unset(COSMA_BLAS_mkl_thread__)
  unset(BLAS_mkl_OMP)
  unset(BLAS_mkl_OS_NAME)
  unset(__mkl_blacs_lib)
  unset(__mkl_blacs_inc)
  unset(__mkl_scalapack_lib)
  unset(__mkl_scalapack_inc)
  set(COSMA_BLAS_VENDOR "MKL")
  set(COSMA_MKL_SCALAPACK_VENDOR TRUE)
  mark_as_advanced(COSMA_BLAS_VENDOR)
  mark_as_advanced(COSMA_MKL_FOUND)
  mark_as_advanced(COSMA_MKL_SCALAPACK_VENDOR)
endif()
================================================
FILE: cmake/FindNCCL.cmake
================================================
# Locate the NVIDIA NCCL collective-communication library.
include(FindPackageHandleStandardArgs)

find_path(COSMA_NCCL_INCLUDE_DIRS
  NAMES nccl.h
  HINTS
  ${NCCL_ROOT}
  ENV NCCLROOT
)
find_library(COSMA_NCCL_LIBRARIES
  NAMES nccl nccl_static
  HINTS
  ${NCCL_ROOT}
  ENV NCCLROOT
)

find_package_handle_standard_args(NCCL DEFAULT_MSG COSMA_NCCL_INCLUDE_DIRS COSMA_NCCL_LIBRARIES)

if (NCCL_FOUND AND NOT TARGET cosma::nccl)
  add_library(cosma::nccl INTERFACE IMPORTED)
  # FIX: quote the property values — unquoted expansions would be split into
  # separate set_target_properties() arguments if a result ever contains a
  # list, producing a hard configure error.
  set_target_properties(cosma::nccl
    PROPERTIES
    INTERFACE_INCLUDE_DIRECTORIES "${COSMA_NCCL_INCLUDE_DIRS}"
    INTERFACE_LINK_LIBRARIES "${COSMA_NCCL_LIBRARIES}")
endif()
================================================
FILE: cmake/FindNVPL.cmake
================================================
# Wire up NVIDIA NVPL (BLAS/LAPACK/ScaLAPACK for ARM) through the imported
# targets provided by the vendor's nvpl_* CMake packages.
find_package("nvpl_blas" REQUIRED)
find_package("nvpl_lapack" REQUIRED)
find_package("nvpl_scalapack" REQUIRED)

# Integer-width suffix: lp64 (32-bit indices) vs ilp64 (64-bit indices).
if(COSMA_BLAS_INTERFACE STREQUAL "32bits")
  set(_nvpl_int "_lp64")
else()
  set(_nvpl_int "_ilp64")
endif()

# Threading-layer suffix.
if(COSMA_BLAS_THREADING STREQUAL "openmp")
  set(_nvpl_thread "_omp")
else()
  set(_nvpl_thread "_seq")
endif()

# BLACS flavour follows the MPI implementation. MPI_CXX_LIBRARY_VERSION_STRING
# is only populated when MPI_DETERMINE_LIBRARY_VERSION was set before
# find_package(MPI).
if("${MPI_CXX_LIBRARY_VERSION_STRING}" MATCHES "Open MPI")
  if(MPI_VERSION VERSION_GREATER_EQUAL "5.0")
    set(_nvpl_mpi "_openmpi5")
  elseif(MPI_VERSION VERSION_GREATER_EQUAL "4.0")
    set(_nvpl_mpi "_openmpi4")
  else()
    # FIX: this was written `else(MPI_VERSION VERSION_GREATER_EQUAL "3.0")`;
    # CMake ignores arguments to else(), so this is — and always was — the
    # unconditional catch-all for any older Open MPI.
    set(_nvpl_mpi "_openmpi3")
  endif()
else()
  set(_nvpl_mpi "_mpich")
endif()

if(NOT TARGET "cosma::BLAS::NVPL::nvpl")
  add_library("cosma::BLAS::NVPL::nvpl" INTERFACE IMPORTED)
  target_link_libraries("cosma::BLAS::NVPL::nvpl" INTERFACE
    "nvpl::blas${_nvpl_int}${_nvpl_thread}" "nvpl::lapack${_nvpl_int}${_nvpl_thread}"
    "nvpl::blacs${_nvpl_int}${_nvpl_mpi}" "nvpl::scalapack${_nvpl_int}")

  # Harvest libraries/includes from the vendor targets for variable-based use.
  get_target_property(COSMA_NVPL_LAPACK_LIBRARIES "nvpl::lapack${_nvpl_int}${_nvpl_thread}" INTERFACE_LINK_LIBRARIES)
  get_target_property(COSMA_NVPL_SCALAPACK_LIBRARIES "nvpl::scalapack${_nvpl_int}" INTERFACE_LINK_LIBRARIES)
  get_target_property(COSMA_NVPL_BLAS_INCLUDE_DIRS "nvpl::blas${_nvpl_int}${_nvpl_thread}" INTERFACE_INCLUDE_DIRECTORIES)
  get_target_property(COSMA_NVPL_LAPACK_INCLUDE_DIRS "nvpl::lapack${_nvpl_int}${_nvpl_thread}" INTERFACE_INCLUDE_DIRECTORIES)
  get_target_property(COSMA_NVPL_SCALAPACK_INCLUDE_DIRS "nvpl::scalapack${_nvpl_int}" INTERFACE_INCLUDE_DIRECTORIES)

  # NOTE(review): this overwrites the INTERFACE_LINK_LIBRARIES populated by
  # target_link_libraries() above (original behavior kept) — confirm intended.
  set_target_properties(
    cosma::BLAS::NVPL::nvpl
    PROPERTIES INTERFACE_LINK_LIBRARIES
               "${COSMA_NVPL_LAPACK_LIBRARIES}")
  set_target_properties(
    cosma::BLAS::NVPL::nvpl
    PROPERTIES INTERFACE_INCLUDE_DIRECTORIES
               "${COSMA_NVPL_BLAS_INCLUDE_DIRS};${COSMA_NVPL_LAPACK_INCLUDE_DIRS}")
  add_library(cosma::BLAS::NVPL::blas ALIAS cosma::BLAS::NVPL::nvpl)

  # Aggregate ScaLAPACK link target consumed by FindSCALAPACK.cmake.
  add_library(cosma::BLAS::NVPL::scalapack_link INTERFACE IMPORTED)
  set_target_properties(
    cosma::BLAS::NVPL::scalapack_link
    PROPERTIES INTERFACE_LINK_LIBRARIES
               "${COSMA_NVPL_LAPACK_LIBRARIES};${COSMA_NVPL_SCALAPACK_LIBRARIES}")
  set_target_properties(
    cosma::BLAS::NVPL::scalapack_link
    PROPERTIES INTERFACE_INCLUDE_DIRECTORIES
               "${COSMA_NVPL_BLAS_INCLUDE_DIRS};${COSMA_NVPL_LAPACK_INCLUDE_DIRS};${COSMA_NVPL_SCALAPACK_INCLUDE_DIRS}")
endif()
================================================
FILE: cmake/FindOPENBLAS.cmake
================================================
# find OPENBLAS
include(FindPackageHandleStandardArgs)

# Candidate installation prefixes from cache/environment ROOT variables.
set(_OPENBLAS_PATHS
    ${OPENBLAS_ROOT}
    $ENV{OPENBLAS_ROOT}
    $ENV{OPENBLASROOT}
    $ENV{OPENBLAS_DIR}
    $ENV{OPENBLASDIR})

find_path(COSMA_OPENBLAS_INCLUDE_DIRS
  NAMES "cblas-openblas.h" "cblas_openblas.h" "cblas.h"
  PATH_SUFFIXES "openblas" "openblas/include" "include" "include/openblas"
  HINTS ${_OPENBLAS_PATHS}
  DOC "openblas include directory")

find_library(COSMA_OPENBLAS_LINK_LIBRARIES
  NAMES openblas
  PATH_SUFFIXES "lib" "lib64" "openblas/lib" "openblas/lib64" "openblas"
  HINTS ${_OPENBLAS_PATHS}
  DOC "openblas libraries list")

find_package_handle_standard_args(OPENBLAS
  DEFAULT_MSG
  COSMA_OPENBLAS_LINK_LIBRARIES COSMA_OPENBLAS_INCLUDE_DIRS)

if(NOT TARGET cosma::BLAS::OPENBLAS::openblas)
  add_library(cosma::BLAS::OPENBLAS::openblas INTERFACE IMPORTED)
  add_library(cosma::BLAS::OPENBLAS::blas ALIAS cosma::BLAS::OPENBLAS::openblas)
endif()
set_property(TARGET cosma::BLAS::OPENBLAS::openblas
  PROPERTY INTERFACE_LINK_LIBRARIES ${COSMA_OPENBLAS_LINK_LIBRARIES})
set_property(TARGET cosma::BLAS::OPENBLAS::openblas
  PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${COSMA_OPENBLAS_INCLUDE_DIRS})

# prevent clutter in cache
# FIX: hide the variables this module actually defines; the previously listed
# OPENBLAS_LIBRARIES / OPENBLAS_INCLUDE_DIRS do not exist here.
mark_as_advanced(OPENBLAS_FOUND COSMA_OPENBLAS_LINK_LIBRARIES
                 COSMA_OPENBLAS_INCLUDE_DIRS)
================================================
FILE: cmake/FindSCALAPACK.cmake
================================================
# Resolve the ScaLAPACK link line for the selected COSMA_SCALAPACK backend
# (MKL / CRAY_LIBSCI / NVPL / CUSTOM) and expose it as
# cosma::scalapack::scalapack.
include(FindPackageHandleStandardArgs)

if(COSMA_SCALAPACK STREQUAL "MKL")
  find_package(MKL REQUIRED)
  get_target_property(COSMA_SCALAPACK_LINK_LIBRARIES cosma::BLAS::MKL::scalapack_link
                      INTERFACE_LINK_LIBRARIES)
elseif(COSMA_SCALAPACK STREQUAL "CRAY_LIBSCI")
  find_package(CRAY_LIBSCI REQUIRED)
  get_target_property(COSMA_SCALAPACK_LINK_LIBRARIES cosma::BLAS::CRAY_LIBSCI::scalapack_link
                      INTERFACE_LINK_LIBRARIES)
elseif(COSMA_SCALAPACK STREQUAL "NVPL")
  find_package(NVPL REQUIRED)
  get_target_property(COSMA_SCALAPACK_LINK_LIBRARIES cosma::BLAS::NVPL::scalapack_link
                      INTERFACE_LINK_LIBRARIES)
  # FIX: informational only — this was message(WARNING ...), a debug leftover
  # that turned every NVPL configure into a warning.
  message(STATUS "COSMA_SCALAPACK_LINK_LIBRARIES: ${COSMA_SCALAPACK_LINK_LIBRARIES}")
elseif(COSMA_SCALAPACK STREQUAL "CUSTOM")
  find_library(COSMA_SCALAPACK_LINK_LIBRARIES
    NAMES scalapack
    HINTS
    ${_COSMA_SCALAPACK_LIBRARY_DIRS}
    ENV SCALAPACKROOT
    ENV SCALAPACK_ROOT
    ENV ORNL_SCALAPACK_ROOT
    ENV SCALAPACK_PREFIX
    ENV SCALAPACK_DIR
    ENV SCALAPACKDIR
    /usr/bin
    PATH_SUFFIXES lib
    DOC "Path to the scalapack library.")
endif()

find_package_handle_standard_args(SCALAPACK REQUIRED_VARS COSMA_SCALAPACK_LINK_LIBRARIES)
# FIX: mirror the real result instead of unconditionally claiming success.
set(COSMA_SCALAPACK_FOUND ${SCALAPACK_FOUND})

if (NOT TARGET cosma::scalapack::scalapack)
  add_library(cosma::scalapack::scalapack INTERFACE IMPORTED)
  set_target_properties(
    cosma::scalapack::scalapack PROPERTIES INTERFACE_LINK_LIBRARIES
    "${COSMA_SCALAPACK_LINK_LIBRARIES}")
endif()

mark_as_advanced(COSMA_SCALAPACK_LINK_LIBRARIES COSMA_SCALAPACK_FOUND)
================================================
FILE: cmake/GitSubmodule.cmake
================================================
# Call to ensure that the git submodule in location `path` is loaded.
# If the submodule is not loaded, an error message that describes
# how to update the submodules is printed.
# Sets the variable name_avail to `ON` if the submodule is available,
# or `OFF` otherwise.
# copyright github.com/arbor-sim
# Check that the git submodule checked out at `path` is present (detected via
# its .git entry). Sets `<name>_avail` to ON/OFF in the caller's scope and
# prints checkout instructions when the submodule is missing.
# copyright github.com/arbor-sim
function(check_git_submodule name path)
  set(_avail_flag "${name}_avail")
  get_filename_component(_dotgit "${path}/.git" ABSOLUTE)
  if(EXISTS ${_dotgit})
    set(${_avail_flag} ON PARENT_SCOPE)
  else()
    message(
      "\nThe git submodule for ${name} is not available.\n"
      "To check out all submodules use the following commands:\n"
      " git submodule init\n"
      " git submodule update\n"
      "Or download submodules recursively when checking out:\n"
      " git clone --recursive https://github.com/eth-cscs/COSMA.git\n"
    )
    # the submodule is not checked out: report it as unavailable
    set(${_avail_flag} OFF PARENT_SCOPE)
  endif()
endfunction()
# Add the dependency `name` from its git submodule at `path` when the
# submodule is checked out; otherwise fall back to a system-wide
# installation located with find_package(<name> REQUIRED).
function(add_git_submodule_or_find_external name path)
  check_git_submodule(${name} ${path})
  if(NOT ${name}_avail)
    # attempt to find a system installation of ${name}
    find_package(${name} REQUIRED)
  else()
    message(VERBOSE "Using ${name} as git submodule from ${path}")
    add_subdirectory("${path}")
  endif()
endfunction()
================================================
FILE: cmake/adjust_mpiexec_flags.cmake
================================================
# Appends the --oversubscribe flag if OpenMPI.
#
function(adjust_mpiexec_flags)
  # Inspect the `mpirun --version` banner to tell Open MPI apart from
  # MPICH-like implementations; export the result as MPI_TYPE.
  execute_process(COMMAND mpirun --version OUTPUT_VARIABLE _mpirun_banner)
  string(FIND "${_mpirun_banner}" "Open MPI" _ompi_pos)
  if(_ompi_pos STREQUAL "-1")
    set(MPI_TYPE "mpich" PARENT_SCOPE)
  else()
    # Open MPI: prepend --oversubscribe to the mpiexec pre-flags.
    set(MPIEXEC_PREFLAGS "--oversubscribe;${MPIEXEC_PREFLAGS}" CACHE STRING "These flags will be directly before the executable that is being run by mpiexec." FORCE)
    set(MPI_TYPE "ompi" PARENT_SCOPE)
  endif()
endfunction()
================================================
FILE: cmake/build_type.cmake
================================================
# Set default to Release if none was specified and update the docs.
#
# Start from whatever the user chose (may be empty on a fresh configure).
set(default_build_type ${CMAKE_BUILD_TYPE})
# Single-config generators with no explicit choice default to Release;
# CMAKE_CONFIGURATION_TYPES is only set by multi-config generators.
if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
  message(STATUS "Setting build type to 'Release' as none was specified.")
  set(default_build_type "Release")
endif()
# FORCE is required because CMAKE_BUILD_TYPE is already in the cache.
set(CMAKE_BUILD_TYPE "${default_build_type}" CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel Profile." FORCE)
# Define a custom build type
#
#set( CMAKE_CXX_FLAGS_PROFILE "${CMAKE_CXX_FLAGS_RELEASE}" CACHE STRING "" FORCE)
#set( CMAKE_C_FLAGS_PROFILE "${CMAKE_C_FLAGS_RELEASE}" CACHE STRING "" FORCE )
#set( CMAKE_EXE_LINKER_FLAGS_PROFILE "${CMAKE_EXE_LINKER_FLAGS_RELEASE}" CACHE STRING "" FORCE )
#set( CMAKE_SHARED_LINKER_FLAGS_PROFILE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE}" CACHE STRING "" FORCE )
#mark_as_advanced(CMAKE_CXX_FLAGS_PROFILE
# CMAKE_C_FLAGS_PROFILE
# CMAKE_EXE_LINKER_FLAGS_PROFILE
# CMAKE_SHARED_LINKER_FLAGS_PROFILE )
#
# use with $<$:semiprof>
================================================
FILE: cmake/cosma.pc.in
================================================
# pkg-config metadata for the installed COSMA library.
# The @VAR@ placeholders are substituted by CMake's configure_file()
# at install time.
prefix=@CMAKE_INSTALL_PREFIX@
exec_prefix=@CMAKE_INSTALL_PREFIX@
libdir=@CMAKE_INSTALL_FULL_LIBDIR@
includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
Name: cosma
Description: Distributed communication-optimal matrix-matrix multiplication algorithm
Version: @cosma_VERSION@
Libs: -L${libdir} -lcosma
Cflags: -I${includedir}
================================================
FILE: cmake/cosmaConfig.cmake.in
================================================
# Package config file for COSMA: re-resolves every dependency recorded at
# build time (values baked in by configure_file()), then loads the
# exported cosmaTargets.cmake.
if(NOT TARGET cosma::cosma)
  cmake_policy(PUSH) # Save project's policies
  if(POLICY CMP0074)
    cmake_policy(SET CMP0074 NEW)
  endif()
  include(CMakeFindDependencyMacro)
  # Bundled modules should be found first to prevent conflicts with similarly
  # named modules in calling projects.
  #
  set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR} ${CMAKE_MODULE_PATH})
  set(MKL_ROOT "@MKL_ROOT@")
  set(MPI_DETERMINE_LIBRARY_VERSION TRUE)
  find_package(MPI COMPONENTS "CXX" REQUIRED)
  # Build-time configuration values.
  set(COSMA_BLAS "@COSMA_BLAS@")
  set(COSMA_SCALAPACK "@COSMA_SCALAPACK@")
  set(COSMA_BLAS_VENDOR "@COSMA_BLAS_VENDOR@")
  set(COSMA_BLAS_THREADING "@COSMA_BLAS_THREADING@")
  if ("@COSMA_GPU_BACKEND@" STREQUAL "CUDA" OR "@COSMA_GPU_BACKEND@" STREQUAL "ROCM")
    if ("${COSMA_BLAS}" STREQUAL "CUDA")
      find_dependency(CUDAToolkit)
    else()
      find_dependency(hip)
    endif()
    # BUGFIX: set() expects `CACHE <type> <docstring> [FORCE]`; the
    # previous `CACHE STRING FORCE ""` passed FORCE as the docstring.
    set(TILEMM_GPU_BACKEND "@COSMA_GPU_BACKEND@" CACHE STRING "" FORCE)
    find_dependency(Tiled-MM) # bundled
    if ("@COSMA_WITH_NCCL@")
      find_dependency(NCCL)
    endif()
    if("@COSMA_WITH_RCCL@")
      find_dependency(rccl)
    endif()
  endif ()
  # Quoted so the conditions stay well-formed when the value is empty.
  if (NOT "@COSMA_BLAS_VENDOR@" MATCHES "OFF")
    find_dependency(Blas)
  endif()
  if (NOT "${COSMA_SCALAPACK}" MATCHES "OFF")
    find_dependency(SCALAPACK)
  endif ()
  if ("@COSMA_WITH_PROFILING@")
    find_dependency(semiprof)
  endif ()
  # Clean-up module path.
  #
  list(REMOVE_ITEM CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR})
  # These are bundled with cosma
  #
  find_dependency(costa)
  include("${CMAKE_CURRENT_LIST_DIR}/cosmaTargets.cmake")
  cmake_policy(POP) # Restore project's policies
endif()
================================================
FILE: cmake/find_cuda_version.cmake
================================================
# Parses the output of `nvcc --version` and exports
# CUDA_TOOLKIT_MAJOR_VERSION and CUDA_TOOLKIT_MINOR_VERSION to the
# calling scope.
function(find_cuda_version)
  execute_process(COMMAND nvcc --version OUTPUT_VARIABLE CUDA_VERSION_STRING)
  string(REGEX MATCH "release ([0-9]*)\\.([0-9]*)" _ "${CUDA_VERSION_STRING}")
  set(CUDA_TOOLKIT_MAJOR_VERSION ${CMAKE_MATCH_1})
  set(CUDA_TOOLKIT_MINOR_VERSION ${CMAKE_MATCH_2})
  message(STATUS "CUDA_TOOLKIT_MAJOR_VERSION = ${CUDA_TOOLKIT_MAJOR_VERSION}")
  message(STATUS "CUDA_TOOLKIT_MINOR_VERSION = ${CUDA_TOOLKIT_MINOR_VERSION}")
  # BUGFIX: set() inside a function is function-scoped; without
  # PARENT_SCOPE the results were silently lost at function exit,
  # contradicting the header comment.
  set(CUDA_TOOLKIT_MAJOR_VERSION ${CUDA_TOOLKIT_MAJOR_VERSION} PARENT_SCOPE)
  set(CUDA_TOOLKIT_MINOR_VERSION ${CUDA_TOOLKIT_MINOR_VERSION} PARENT_SCOPE)
endfunction()
================================================
FILE: docker/asan/build-env.Dockerfile
================================================
# Build environment for the AddressSanitizer CI image: Ubuntu 20.04 with
# MPICH, a debug OpenBLAS and a debug netlib ScaLAPACK, plus libtree for
# bundling the binaries in the deploy stage.
FROM ubuntu:20.04
WORKDIR /root
SHELL ["/bin/bash", "-c"]
ARG MPICH_VERSION=4.0.1
ENV DEBIAN_FRONTEND noninteractive
ENV FORCE_UNSAFE_CONFIGURE 1
ENV MPICH_VERSION ${MPICH_VERSION}
# Install basic tools
RUN apt-get update -qq && apt-get install -qq -y --no-install-recommends \
software-properties-common \
build-essential gfortran pkg-config \
git tar wget curl chrpath && \
rm -rf /var/lib/apt/lists/*
# Install cmake
RUN wget -qO- "https://cmake.org/files/v3.22/cmake-3.22.1-linux-x86_64.tar.gz" | tar --strip-components=1 -xz -C /usr/local
# Install MPICH ABI compatible with Cray's lib on Piz Daint
RUN wget -q https://www.mpich.org/static/downloads/${MPICH_VERSION}/mpich-${MPICH_VERSION}.tar.gz && \
tar -xzf mpich-${MPICH_VERSION}.tar.gz && \
cd mpich-${MPICH_VERSION} && \
./configure && \
make install -j$(nproc) && \
rm -rf /root/mpich-${MPICH_VERSION}.tar.gz /root/mpich-${MPICH_VERSION}
# Install OpenBLAS (DEBUG=1 build for the sanitizer image).
# NOTE(review): wget is given both "-qO -" and "-O openblas.tar.gz"; the
# later -O wins, so the "-qO -" is redundant — confirm and simplify.
ARG OPENBLAS_VERSION=0.3.20
RUN wget -qO - https://github.com/xianyi/OpenBLAS/archive/v${OPENBLAS_VERSION}.tar.gz -O openblas.tar.gz && \
tar -xzf openblas.tar.gz && \
cd OpenBLAS-${OPENBLAS_VERSION}/ && \
make TARGET=HASWELL NO_STATIC=1 DEBUG=1 -j$(nproc) && \
make install TARGET=HASWELL NO_STATIC=1 PREFIX=/usr/local/ && \
rm -rf /root/openblas.tar.gz /root/OpenBLAS-${OPENBLAS_VERSION}/ && \
ldconfig
# Install netlib ScaLAPACK (Debug build) on top of MPICH + OpenBLAS.
ARG NETLIB_SCALAPACK_VERSION=2.2.0
RUN wget -qO - http://www.netlib.org/scalapack/scalapack-${NETLIB_SCALAPACK_VERSION}.tgz -O scalapack.tar.gz && \
tar -xzf scalapack.tar.gz && \
cd scalapack-${NETLIB_SCALAPACK_VERSION} && \
mkdir build && \
cd build && \
CC=mpicc FC=mpif90 cmake .. \
-DBUILD_STATIC_LIBS=OFF \
-DBUILD_SHARED_LIBS=ON \
-DCMAKE_BUILD_TYPE=Debug && \
make -j$(nproc) && \
make install && \
rm -rf /root/scalapack.tar.gz /root/scalapack-${NETLIB_SCALAPACK_VERSION} && \
ldconfig
# Add deployment tooling
RUN mkdir -p /opt/libtree && \
curl -Lfso /opt/libtree/libtree https://github.com/haampie/libtree/releases/download/v2.0.0/libtree_x86_64 && \
chmod +x /opt/libtree/libtree
================================================
FILE: docker/asan/deploy.Dockerfile
================================================
# Two-stage build: compile COSMA with ASan/UBSan instrumentation in the
# build environment, then copy the self-contained test bundle into a
# plain Ubuntu runtime image.
ARG BUILD_ENV
FROM $BUILD_ENV as builder
ARG BLAS
# Build COSMA
COPY . /COSMA
# reduce the minimum local dimension to allow all mpi ranks to take part
# in testing
ENV COSMA_MIN_LOCAL_DIMENSION=32
RUN mkdir /COSMA/build && cd /COSMA/build && \
CC=mpicc CXX=mpicxx cmake .. \
-DCOSMA_WITH_TESTS=ON \
-DCOSMA_BLAS=OPENBLAS \
-DCOSMA_SCALAPACK=CUSTOM \
-DCMAKE_BUILD_TYPE=Debug \
-DCMAKE_CXX_FLAGS_DEBUG="-g -Og -fno-omit-frame-pointer -fsanitize=address,undefined" \
-DCMAKE_INSTALL_PREFIX=/root/COSMA-build && \
make -j$(nproc) && \
make install && \
rm -rf /COSMA
# Bundle the test executables together with their shared-library
# dependencies into /root/COSMA.bundle.
RUN /opt/libtree/libtree \
--chrpath \
-d /root/COSMA.bundle/ \
/root/COSMA-build/bin/test.cosma \
/root/COSMA-build/bin/test.mapper \
/root/COSMA-build/bin/test.multiply \
/root/COSMA-build/bin/test.multiply_using_layout \
/root/COSMA-build/bin/test.pdgemm \
/root/COSMA-build/bin/test.scalar_matmul
# Runtime stage: plain Ubuntu plus the bundle from the builder stage.
FROM ubuntu:20.04
# Automatically print stacktraces on segfault
ENV LD_PRELOAD=/lib/x86_64-linux-gnu/libSegFault.so
COPY --from=builder /root/COSMA.bundle /root/COSMA.bundle
# Make it easy to call our binaries.
ENV PATH="/root/COSMA.bundle/usr/bin:$PATH"
RUN echo "/root/COSMA.bundle/usr/lib/" > /etc/ld.so.conf.d/cosma.conf && ldconfig
WORKDIR /root/COSMA.bundle/usr/bin
# I'm not getting ASAN_OPTIONS=suppressions=file to work, so just disable leak detection for now.
ENV ASAN_OPTIONS=detect_leaks=false
================================================
FILE: docker/cpu-release/build-env.Dockerfile
================================================
# Build environment for the CPU release image: Ubuntu 20.04 with MPICH
# (C/C++ only) and Intel MKL, plus libtree for bundling in the deploy
# stage.
FROM ubuntu:20.04
WORKDIR /root
SHELL ["/bin/bash", "-c"]
ARG MKL_VERSION=2020.4-912
ARG MPICH_VERSION=4.0.1
ENV DEBIAN_FRONTEND noninteractive
ENV MKLROOT=/opt/intel/compilers_and_libraries/linux/mkl
ENV FORCE_UNSAFE_CONFIGURE 1
ENV MPICH_VERSION ${MPICH_VERSION}
ENV MKL_VERSION ${MKL_VERSION}
# reduce the minimum local dimension to allow all mpi ranks to take part
# in testing
ENV COSMA_MIN_LOCAL_DIMENSION=32
# Install basic tools
RUN apt-get update -qq && apt-get install -qq -y --no-install-recommends \
software-properties-common \
build-essential \
git tar wget curl gpg-agent chrpath && \
rm -rf /var/lib/apt/lists/*
# Install cmake
RUN wget -qO- "https://cmake.org/files/v3.22/cmake-3.22.1-linux-x86_64.tar.gz" | tar --strip-components=1 -xz -C /usr/local
# Install MPICH ABI compatible with Cray's lib on Piz Daint
# (--disable-fortran: only the C/C++ bindings are needed here)
RUN wget -q https://www.mpich.org/static/downloads/${MPICH_VERSION}/mpich-${MPICH_VERSION}.tar.gz && \
tar -xzf mpich-${MPICH_VERSION}.tar.gz && \
cd mpich-${MPICH_VERSION} && \
./configure --disable-fortran && \
make install -j$(nproc) && \
rm -rf /root/mpich-${MPICH_VERSION}.tar.gz /root/mpich-${MPICH_VERSION}
# Install MKL from Intel's apt repository and register its library
# directories with the dynamic linker.
RUN wget -qO - https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB 2>/dev/null | apt-key add - && \
apt-add-repository 'deb https://apt.repos.intel.com/mkl all main' && \
apt-get install -y -qq --no-install-recommends intel-mkl-64bit-${MKL_VERSION} && \
rm -rf /var/lib/apt/lists/* && \
echo -e "/opt/intel/lib/intel64\n/opt/intel/compilers_and_libraries/linux/mkl/lib/intel64" >> /etc/ld.so.conf.d/intel.conf && \
ldconfig
# Add deployment tooling
RUN mkdir -p /opt/libtree && \
curl -Lfso /opt/libtree/libtree https://github.com/haampie/libtree/releases/download/v2.0.0/libtree_x86_64 && \
chmod +x /opt/libtree/libtree
================================================
FILE: docker/cpu-release/deploy.Dockerfile
================================================
# Two-stage build: compile a Release COSMA against MKL in the build
# environment, bundle the test binaries with libtree, then copy the
# bundle into a plain Ubuntu runtime image.
ARG BUILD_ENV
FROM $BUILD_ENV as builder
# Build COSMA
# (compilervars.sh sets up the Intel/MKL environment for the build)
COPY . /COSMA
RUN source /opt/intel/bin/compilervars.sh intel64 && \
mkdir /COSMA/build && cd /COSMA/build && \
CC=mpicc CXX=mpicxx cmake .. \
-DCOSMA_WITH_TESTS=ON \
-DCOSMA_BLAS=MKL \
-DCOSMA_SCALAPACK=MKL \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_INSTALL_PREFIX=/root/COSMA-build && \
make -j$(nproc) && \
make install && \
rm -rf /COSMA
ENV MKL_LIB=/opt/intel/compilers_and_libraries/linux/mkl/lib/intel64
# Run linuxdeploy, and add a bunch of libs that are dlopen'ed by mkl
RUN /opt/libtree/libtree --chrpath --strip -d /root/COSMA.bundle/ \
/root/COSMA-build/bin/test.cosma \
/root/COSMA-build/bin/test.mapper \
/root/COSMA-build/bin/test.multiply \
/root/COSMA-build/bin/test.multiply_using_layout \
/root/COSMA-build/bin/test.pdgemm \
/root/COSMA-build/bin/test.scalar_matmul \
# MKL dlopen's some of their libs, so we have to explicitly copy them over
${MKL_LIB}/libmkl_avx.so \
${MKL_LIB}/libmkl_avx2.so \
${MKL_LIB}/libmkl_avx512_mic.so \
${MKL_LIB}/libmkl_avx512.so \
${MKL_LIB}/libmkl_core.so \
${MKL_LIB}/libmkl_def.so \
${MKL_LIB}/libmkl_intel_thread.so \
${MKL_LIB}/libmkl_mc.so \
${MKL_LIB}/libmkl_mc3.so \
${MKL_LIB}/libmkl_sequential.so \
${MKL_LIB}/libmkl_tbb_thread.so \
${MKL_LIB}/libmkl_vml_avx.so \
${MKL_LIB}/libmkl_vml_avx2.so \
${MKL_LIB}/libmkl_vml_avx512_mic.so \
${MKL_LIB}/libmkl_vml_avx512.so \
${MKL_LIB}/libmkl_vml_cmpt.so \
${MKL_LIB}/libmkl_vml_def.so \
${MKL_LIB}/libmkl_vml_mc.so \
${MKL_LIB}/libmkl_vml_mc3.so
# Runtime stage: plain Ubuntu plus the bundle from the builder stage.
FROM ubuntu:20.04
COPY --from=builder /root/COSMA.bundle /root/COSMA.bundle
# Automatically print stacktraces on segfault
ENV LD_PRELOAD=/lib/x86_64-linux-gnu/libSegFault.so
# Make it easy to call our binaries.
ENV PATH="/root/COSMA.bundle/usr/bin:$PATH"
RUN echo "/root/COSMA.bundle/usr/lib/" > /etc/ld.so.conf.d/cosma.conf && ldconfig
WORKDIR /root/COSMA.bundle/usr/bin
================================================
FILE: docker/gpu/build-env.Dockerfile
================================================
# Build environment for the GPU image: CUDA 11.6 devel base with MPICH,
# OpenBLAS and netlib ScaLAPACK, plus libtree for bundling in the
# deploy stage.
FROM nvidia/cuda:11.6.2-devel-ubuntu20.04
WORKDIR /root
SHELL ["/bin/bash", "-c"]
ARG MPICH_VERSION=4.0.1
ARG OPENBLAS_VERSION=0.3.20
ARG NETLIB_SCALAPACK_VERSION=2.2.0
ENV DEBIAN_FRONTEND noninteractive
# NOTE(review): MKLROOT and MKL_VERSION look like leftovers copied from
# the cpu-release Dockerfile — no ARG MKL_VERSION is declared and MKL is
# never installed in this image (OpenBLAS is used instead); confirm and
# remove.
ENV MKLROOT=/opt/intel/compilers_and_libraries/linux/mkl
ENV FORCE_UNSAFE_CONFIGURE 1
ENV MPICH_VERSION ${MPICH_VERSION}
ENV MKL_VERSION ${MKL_VERSION}
# reduce the minimum local dimension to allow all mpi ranks to take part
# in testing
ENV COSMA_MIN_LOCAL_DIMENSION=32
# Install basic tools
RUN apt-get update -qq && \
apt-get install -qq -y --no-install-recommends \
software-properties-common \
build-essential gfortran pkg-config \
git tar wget curl chrpath && \
rm -rf /var/lib/apt/lists/*
# Install cmake
RUN wget -qO- "https://cmake.org/files/v3.22/cmake-3.22.1-linux-x86_64.tar.gz" | tar --strip-components=1 -xz -C /usr/local
# Install MPICH ABI compatible with Cray's lib on Piz Daint
RUN wget -q https://www.mpich.org/static/downloads/${MPICH_VERSION}/mpich-${MPICH_VERSION}.tar.gz && \
tar -xzf mpich-${MPICH_VERSION}.tar.gz && \
cd mpich-${MPICH_VERSION} && \
./configure && \
make install -j$(nproc) && \
rm -rf /root/mpich-${MPICH_VERSION}.tar.gz /root/mpich-${MPICH_VERSION}
# Install OpenBLAS
RUN wget -qO - https://github.com/xianyi/OpenBLAS/archive/v${OPENBLAS_VERSION}.tar.gz -O openblas.tar.gz && \
tar -xzf openblas.tar.gz && \
cd OpenBLAS-${OPENBLAS_VERSION}/ && \
make TARGET=HASWELL NO_STATIC=1 -j$(nproc) && \
make install TARGET=HASWELL NO_STATIC=1 PREFIX=/usr/local/ && \
rm -rf /root/openblas.tar.gz /root/OpenBLAS-${OPENBLAS_VERSION}/ && \
ldconfig
# Install netlib ScaLAPACK (Release build) on top of MPICH + OpenBLAS.
RUN wget -qO - http://www.netlib.org/scalapack/scalapack-${NETLIB_SCALAPACK_VERSION}.tgz -O scalapack.tar.gz && \
tar -xzf scalapack.tar.gz && \
cd scalapack-${NETLIB_SCALAPACK_VERSION} && \
mkdir build && \
cd build && \
CC=mpicc FC=mpif90 cmake .. \
-DBUILD_STATIC_LIBS=OFF \
-DBUILD_SHARED_LIBS=ON \
-DCMAKE_BUILD_TYPE=Release && \
make -j$(nproc) && \
make install && \
rm -rf /root/scalapack.tar.gz /root/scalapack-${NETLIB_SCALAPACK_VERSION} && \
ldconfig
# Add deployment tooling
RUN mkdir -p /opt/libtree && \
curl -Lfso /opt/libtree/libtree https://github.com/haampie/libtree/releases/download/v2.0.0/libtree_x86_64 && \
chmod +x /opt/libtree/libtree
================================================
FILE: docker/gpu/deploy.Dockerfile
================================================
# Two-stage build: compile a Release COSMA with the CUDA BLAS backend,
# bundle the test binaries with libtree, then copy the bundle into a
# plain Ubuntu runtime image (GPU access via the NVIDIA runtime env vars).
ARG BUILD_ENV
FROM $BUILD_ENV as builder
ARG BLAS
# Build COSMA
COPY . /COSMA
RUN mkdir /COSMA/build && cd /COSMA/build && \
CC=mpicc CXX=mpicxx cmake .. \
-DCOSMA_WITH_TESTS=ON \
-DCUDA_PATH=/usr/local/cuda \
-DCOSMA_BLAS=CUDA \
-DCOSMA_SCALAPACK=CUSTOM \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_INSTALL_PREFIX=/root/COSMA-build && \
make -j$(nproc) && \
make install && \
rm -rf /COSMA
# Bundle the test binaries and their shared-library dependencies with
# libtree (no MKL in this image).
RUN /opt/libtree/libtree \
-d /root/COSMA.bundle/ \
--chrpath \
--strip \
/root/COSMA-build/bin/test.cosma \
/root/COSMA-build/bin/test.mapper \
/root/COSMA-build/bin/test.multiply \
/root/COSMA-build/bin/test.multiply_using_layout \
/root/COSMA-build/bin/test.pdgemm \
/root/COSMA-build/bin/test.scalar_matmul
FROM ubuntu:20.04
# This is the only thing necessary really from nvidia/cuda's ubuntu18.04 runtime image
ENV NVIDIA_VISIBLE_DEVICES all
ENV NVIDIA_DRIVER_CAPABILITIES compute,utility
ENV NVIDIA_REQUIRE_CUDA "cuda>=10.2"
# Automatically print stacktraces on segfault
ENV LD_PRELOAD=/lib/x86_64-linux-gnu/libSegFault.so
COPY --from=builder /root/COSMA.bundle /root/COSMA.bundle
# Make it easy to call our binaries.
ENV PATH="/root/COSMA.bundle/usr/bin:$PATH"
RUN echo "/root/COSMA.bundle/usr/lib/" > /etc/ld.so.conf.d/cosma.conf && ldconfig
WORKDIR /root/COSMA.bundle/usr/bin
================================================
FILE: libs/gtest_mpi/CMakeLists.txt
================================================
# gtest_mpi: header-only extension of Google Test for MPI programs.
# Builds the bundled Google Test and exposes gtest_mpi as an INTERFACE
# target that carries the include path and the gtest link dependency.
cmake_minimum_required(VERSION 3.24 FATAL_ERROR)
project(gtest_mpi CXX)
# Bundled Google Test sources.
add_subdirectory(external/gtest)
# INTERFACE library: gtest_mpi itself has no compiled sources.
add_library(gtest_mpi INTERFACE)
target_include_directories(gtest_mpi INTERFACE ${gtest_mpi_SOURCE_DIR}/include)
target_link_libraries(gtest_mpi INTERFACE gtest)
target_compile_features(gtest_mpi INTERFACE cxx_std_11)
================================================
FILE: libs/gtest_mpi/LICENSE
================================================
This project contains source code from the Googletest framework
obtained from https://github.com/google/googletest with the following
terms:
Copyright 2005, Google Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
---------------------------------------------------------------------
Modifications and additions are published under the following terms:
Copyright 2019, Simon Frasch
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
---------------------------------------------------------------------
================================================
FILE: libs/gtest_mpi/README.md
================================================
# GTest MPI
This project provides an extension to the Google Test framework, to allow testing of MPI enabled applications.
The implementation includes a custom MPI environment and listener, with which all test failure messages are collected on the root process and the output includes the rank index for each failure.
## Requirements
- Google Test version 1.8.1 (other versions may work as well, depending on changes to Listener or Environment interfaces)
- An MPI library
- At least C++ 11.
- Linux or macOS
## Limitations
- All ranks MUST execute all tests in the same order. Within a test, the executed assertions may differ. If a test should run only on a subset of ranks, the excluded ranks must enter the test, but may exit immediately.
- Logging features of Google Test are not supported
## Example
```
#include <mpi.h>
#include "gtest/gtest.h"
#include "gtest_mpi/gtest_mpi.hpp"
int main(int argc, char* argv[]) {
// Initialize MPI before any call to gtest_mpi
MPI_Init(&argc, &argv);
// Initialize google test
::testing::InitGoogleTest(&argc, argv);
// Add a test environment, which will initialize a test communicator
// (a duplicate of MPI_COMM_WORLD)
::testing::AddGlobalTestEnvironment(new gtest_mpi::MPITestEnvironment());
auto& test_listeners = ::testing::UnitTest::GetInstance()->listeners();
// Remove default listener and replace with the custom MPI listener
delete test_listeners.Release(test_listeners.default_result_printer());
test_listeners.Append(new gtest_mpi::PrettyMPIUnitTestResultPrinter());
// run tests
auto exit_code = RUN_ALL_TESTS();
// Finalize MPI before exiting
MPI_Finalize();
return exit_code;
}
```
# License
```
This project contains source code from the Googletest framework
obtained from https://github.com/google/googletest with the following
terms:
Copyright 2005, Google Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
---------------------------------------------------------------------
Modifications and additions are published under the following terms:
Copyright 2019, Simon Frasch
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
---------------------------------------------------------------------
```
================================================
FILE: libs/gtest_mpi/external/gtest/CMakeLists.txt
================================================
# Bundled Google Test: built as a static library from the amalgamated
# gtest-all.cpp source file.
cmake_minimum_required(VERSION 3.24 FATAL_ERROR)
project(gtest CXX)
add_library(gtest STATIC src/gtest-all.cpp)
target_include_directories(gtest PUBLIC ${gtest_SOURCE_DIR}/include)
target_compile_features(gtest PUBLIC cxx_std_11)
================================================
FILE: libs/gtest_mpi/external/gtest/include/gtest/gtest.h
================================================
// Copyright 2005, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Author: wan@google.com (Zhanyong Wan)
//
// The Google C++ Testing Framework (Google Test)
//
// This header file defines the public API for Google Test. It should be
// included by any test program that uses Google Test.
//
// IMPORTANT NOTE: Due to limitation of the C++ language, we have to
// leave some internal implementation details in this header file.
// They are clearly marked by comments like this:
//
// // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
//
// Such code is NOT meant to be used by a user directly, and is subject
// to CHANGE WITHOUT NOTICE. Therefore DO NOT DEPEND ON IT in a user
// program!
//
// Acknowledgment: Google Test borrowed the idea of automatic test
// registration from Barthelemy Dagenais' (barthelemy@prologique.com)
// easyUnit framework.
#ifndef GTEST_INCLUDE_GTEST_GTEST_H_
#define GTEST_INCLUDE_GTEST_GTEST_H_
#include
#include
#include
// Copyright 2005, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Authors: wan@google.com (Zhanyong Wan), eefacm@gmail.com (Sean Mcafee)
//
// The Google C++ Testing Framework (Google Test)
//
// This header file declares functions and macros used internally by
// Google Test. They are subject to change without notice.
#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
// Copyright 2005, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Authors: wan@google.com (Zhanyong Wan)
//
// Low-level types and utilities for porting Google Test to various
// platforms. All macros ending with _ and symbols defined in an
// internal namespace are subject to change without notice. Code
// outside Google Test MUST NOT USE THEM DIRECTLY. Macros that don't
// end with _ are part of Google Test's public API and can be used by
// code outside Google Test.
//
// This file is fundamental to Google Test. All other Google Test source
// files are expected to #include this. Therefore, it cannot #include
// any other Google Test header.
#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
// Environment-describing macros
// -----------------------------
//
// Google Test can be used in many different environments. Macros in
// this section tell Google Test what kind of environment it is being
// used in, such that Google Test can provide environment-specific
// features and implementations.
//
// Google Test tries to automatically detect the properties of its
// environment, so users usually don't need to worry about these
// macros. However, the automatic detection is not perfect.
// Sometimes it's necessary for a user to define some of the following
// macros in the build script to override Google Test's decisions.
//
// If the user doesn't define a macro in the list, Google Test will
// provide a default definition. After this header is #included, all
// macros in this list will be defined to either 1 or 0.
//
// Notes to maintainers:
// - Each macro here is a user-tweakable knob; do not grow the list
// lightly.
// - Use #if to key off these macros. Don't use #ifdef or "#if
// defined(...)", which will not work as these macros are ALWAYS
// defined.
//
// GTEST_HAS_CLONE - Define it to 1/0 to indicate that clone(2)
// is/isn't available.
// GTEST_HAS_EXCEPTIONS - Define it to 1/0 to indicate that exceptions
// are enabled.
// GTEST_HAS_GLOBAL_STRING - Define it to 1/0 to indicate that ::string
// is/isn't available (some systems define
// ::string, which is different to std::string).
// GTEST_HAS_GLOBAL_WSTRING - Define it to 1/0 to indicate that ::wstring
// is/isn't available (some systems define
// ::wstring, which is different to std::wstring).
// GTEST_HAS_POSIX_RE - Define it to 1/0 to indicate that POSIX regular
// expressions are/aren't available.
// GTEST_HAS_PTHREAD - Define it to 1/0 to indicate that <pthread.h>
// is/isn't available.
// GTEST_HAS_RTTI - Define it to 1/0 to indicate that RTTI is/isn't
// enabled.
// GTEST_HAS_STD_WSTRING - Define it to 1/0 to indicate that
// std::wstring does/doesn't work (Google Test can
// be used where std::wstring is unavailable).
// GTEST_HAS_TR1_TUPLE - Define it to 1/0 to indicate tr1::tuple
// is/isn't available.
// GTEST_HAS_SEH - Define it to 1/0 to indicate whether the
// compiler supports Microsoft's "Structured
// Exception Handling".
// GTEST_HAS_STREAM_REDIRECTION
// - Define it to 1/0 to indicate whether the
// platform supports I/O stream redirection using
// dup() and dup2().
// GTEST_USE_OWN_TR1_TUPLE - Define it to 1/0 to indicate whether Google
// Test's own tr1 tuple implementation should be
// used. Unused when the user sets
// GTEST_HAS_TR1_TUPLE to 0.
// GTEST_LANG_CXX11 - Define it to 1/0 to indicate that Google Test
// is building in C++11/C++98 mode.
// GTEST_LINKED_AS_SHARED_LIBRARY
// - Define to 1 when compiling tests that use
// Google Test as a shared library (known as
// DLL on Windows).
// GTEST_CREATE_SHARED_LIBRARY
// - Define to 1 when compiling Google Test itself
// as a shared library.
// Platform-indicating macros
// --------------------------
//
// Macros indicating the platform on which Google Test is being used
// (a macro is defined to 1 if compiled on the given platform;
// otherwise UNDEFINED -- it's never defined to 0.). Google Test
// defines these macros automatically. Code outside Google Test MUST
// NOT define them.
//
// GTEST_OS_AIX - IBM AIX
// GTEST_OS_CYGWIN - Cygwin
// GTEST_OS_FREEBSD - FreeBSD
// GTEST_OS_HPUX - HP-UX
// GTEST_OS_LINUX - Linux
// GTEST_OS_LINUX_ANDROID - Google Android
// GTEST_OS_MAC - Mac OS X
// GTEST_OS_IOS - iOS
// GTEST_OS_NACL - Google Native Client (NaCl)
// GTEST_OS_OPENBSD - OpenBSD
// GTEST_OS_QNX - QNX
// GTEST_OS_SOLARIS - Sun Solaris
// GTEST_OS_SYMBIAN - Symbian
// GTEST_OS_WINDOWS - Windows (Desktop, MinGW, or Mobile)
// GTEST_OS_WINDOWS_DESKTOP - Windows Desktop
// GTEST_OS_WINDOWS_MINGW - MinGW
// GTEST_OS_WINDOWS_MOBILE - Windows Mobile
// GTEST_OS_WINDOWS_PHONE - Windows Phone
// GTEST_OS_WINDOWS_RT - Windows Store App/WinRT
// GTEST_OS_ZOS - z/OS
//
// Among the platforms, Cygwin, Linux, Mac OS X, and Windows have the
// most stable support. Since core members of the Google Test project
// don't have access to other platforms, support for them may be less
// stable. If you notice any problems on your platform, please notify
// googletestframework@googlegroups.com (patches for fixing them are
// even more welcome!).
//
// It is possible that none of the GTEST_OS_* macros are defined.
// Feature-indicating macros
// -------------------------
//
// Macros indicating which Google Test features are available (a macro
// is defined to 1 if the corresponding feature is supported;
// otherwise UNDEFINED -- it's never defined to 0.). Google Test
// defines these macros automatically. Code outside Google Test MUST
// NOT define them.
//
// These macros are public so that portable tests can be written.
// Such tests typically surround code using a feature with an #if
// which controls that code. For example:
//
// #if GTEST_HAS_DEATH_TEST
// EXPECT_DEATH(DoSomethingDeadly());
// #endif
//
// GTEST_HAS_COMBINE - the Combine() function (for value-parameterized
// tests)
// GTEST_HAS_DEATH_TEST - death tests
// GTEST_HAS_PARAM_TEST - value-parameterized tests
// GTEST_HAS_TYPED_TEST - typed tests
// GTEST_HAS_TYPED_TEST_P - type-parameterized tests
// GTEST_IS_THREADSAFE - Google Test is thread-safe.
// GTEST_USES_POSIX_RE - enhanced POSIX regex is used. Do not confuse with
// GTEST_HAS_POSIX_RE (see above) which users can
// define themselves.
// GTEST_USES_SIMPLE_RE - our own simple regex is used;
// the above two are mutually exclusive.
// GTEST_CAN_COMPARE_NULL - accepts untyped NULL in EXPECT_EQ().
// Misc public macros
// ------------------
//
// GTEST_FLAG(flag_name) - references the variable corresponding to
// the given Google Test flag.
// Internal utilities
// ------------------
//
// The following macros and utilities are for Google Test's INTERNAL
// use only. Code outside Google Test MUST NOT USE THEM DIRECTLY.
//
// Macros for basic C++ coding:
// GTEST_AMBIGUOUS_ELSE_BLOCKER_ - for disabling a gcc warning.
// GTEST_ATTRIBUTE_UNUSED_ - declares that a class' instances or a
// variable don't have to be used.
// GTEST_DISALLOW_ASSIGN_ - disables operator=.
// GTEST_DISALLOW_COPY_AND_ASSIGN_ - disables copy ctor and operator=.
// GTEST_MUST_USE_RESULT_ - declares that a function's result must be used.
// GTEST_INTENTIONAL_CONST_COND_PUSH_ - start code section where MSVC C4127 is
// suppressed (constant conditional).
// GTEST_INTENTIONAL_CONST_COND_POP_ - finish code section where MSVC C4127
// is suppressed.
//
// C++11 feature wrappers:
//
// testing::internal::move - portability wrapper for std::move.
//
// Synchronization:
// Mutex, MutexLock, ThreadLocal, GetThreadCount()
// - synchronization primitives.
//
// Template meta programming:
// is_pointer - as in TR1; needed on Symbian and IBM XL C/C++ only.
// IteratorTraits - partial implementation of std::iterator_traits, which
// is not available in libCstd when compiled with Sun C++.
//
// Smart pointers:
// scoped_ptr - as in TR2.
//
// Regular expressions:
// RE - a simple regular expression class using the POSIX
// Extended Regular Expression syntax on UNIX-like
// platforms, or a reduced regular expression syntax on
// other platforms, including Windows.
//
// Logging:
// GTEST_LOG_() - logs messages at the specified severity level.
// LogToStderr() - directs all log messages to stderr.
// FlushInfoLog() - flushes informational log messages.
//
// Stdout and stderr capturing:
// CaptureStdout() - starts capturing stdout.
// GetCapturedStdout() - stops capturing stdout and returns the captured
// string.
// CaptureStderr() - starts capturing stderr.
// GetCapturedStderr() - stops capturing stderr and returns the captured
// string.
//
// Integer types:
// TypeWithSize - maps an integer to an int type.
// Int32, UInt32, Int64, UInt64, TimeInMillis
// - integers of known sizes.
// BiggestInt - the biggest signed integer type.
//
// Command-line utilities:
// GTEST_DECLARE_*() - declares a flag.
// GTEST_DEFINE_*() - defines a flag.
// GetInjectableArgvs() - returns the command line as a vector of strings.
//
// Environment variable utilities:
// GetEnv() - gets the value of an environment variable.
// BoolFromGTestEnv() - parses a bool environment variable.
// Int32FromGTestEnv() - parses an Int32 environment variable.
// StringFromGTestEnv() - parses a string environment variable.
#include <ctype.h>  // for isspace, etc
#include <stddef.h>  // for ptrdiff_t
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#ifndef _WIN32_WCE
# include <sys/types.h>
# include <sys/stat.h>
#endif  // !_WIN32_WCE
#if defined __APPLE__
# include <AvailabilityMacros.h>
# include <TargetConditionals.h>
#endif
#include <algorithm>  // NOLINT
#include <iostream>  // NOLINT
#include <sstream>  // NOLINT
#include <string>  // NOLINT
#include <utility>
#include <vector>  // NOLINT
// Copyright 2015, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// The Google C++ Testing Framework (Google Test)
//
// This header file defines the GTEST_OS_* macro.
// It is separate from gtest-port.h so that custom/gtest-port.h can include it.
#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_
#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_
// Determines the platform on which Google Test is compiled.
#ifdef __CYGWIN__
# define GTEST_OS_CYGWIN 1
#elif defined __SYMBIAN32__
# define GTEST_OS_SYMBIAN 1
#elif defined _WIN32
# define GTEST_OS_WINDOWS 1
# ifdef _WIN32_WCE
# define GTEST_OS_WINDOWS_MOBILE 1
# elif defined(__MINGW__) || defined(__MINGW32__)
# define GTEST_OS_WINDOWS_MINGW 1
# elif defined(WINAPI_FAMILY)
# include <winapifamily.h>
# if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
# define GTEST_OS_WINDOWS_DESKTOP 1
# elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_PHONE_APP)
# define GTEST_OS_WINDOWS_PHONE 1
# elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP)
# define GTEST_OS_WINDOWS_RT 1
# else
// WINAPI_FAMILY defined but no known partition matched.
// Default to desktop.
# define GTEST_OS_WINDOWS_DESKTOP 1
# endif
# else
# define GTEST_OS_WINDOWS_DESKTOP 1
# endif // _WIN32_WCE
#elif defined __APPLE__
# define GTEST_OS_MAC 1
# if TARGET_OS_IPHONE
# define GTEST_OS_IOS 1
# endif
#elif defined __FreeBSD__
# define GTEST_OS_FREEBSD 1
#elif defined __linux__
# define GTEST_OS_LINUX 1
# if defined __ANDROID__
# define GTEST_OS_LINUX_ANDROID 1
# endif
#elif defined __MVS__
# define GTEST_OS_ZOS 1
#elif defined(__sun) && defined(__SVR4)
# define GTEST_OS_SOLARIS 1
#elif defined(_AIX)
# define GTEST_OS_AIX 1
#elif defined(__hpux)
# define GTEST_OS_HPUX 1
#elif defined __native_client__
# define GTEST_OS_NACL 1
#elif defined __OpenBSD__
# define GTEST_OS_OPENBSD 1
#elif defined __QNX__
# define GTEST_OS_QNX 1
#endif // __CYGWIN__
#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_
// Copyright 2015, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Injection point for custom user configurations.
// The following macros can be defined:
//
// Flag related macros:
// GTEST_FLAG(flag_name)
// GTEST_USE_OWN_FLAGFILE_FLAG_ - Define to 0 when the system provides its
// own flagfile flag parsing.
// GTEST_DECLARE_bool_(name)
// GTEST_DECLARE_int32_(name)
// GTEST_DECLARE_string_(name)
// GTEST_DEFINE_bool_(name, default_val, doc)
// GTEST_DEFINE_int32_(name, default_val, doc)
// GTEST_DEFINE_string_(name, default_val, doc)
//
// Test filtering:
// GTEST_TEST_FILTER_ENV_VAR_ - The name of an environment variable that
// will be used if --GTEST_FLAG(test_filter)
// is not provided.
//
// Logging:
// GTEST_LOG_(severity)
// GTEST_CHECK_(condition)
// Functions LogToStderr() and FlushInfoLog() have to be provided too.
//
// Threading:
// GTEST_HAS_NOTIFICATION_ - Enabled if Notification is already provided.
// GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ - Enabled if Mutex and ThreadLocal are
// already provided.
// Must also provide GTEST_DECLARE_STATIC_MUTEX_(mutex) and
// GTEST_DEFINE_STATIC_MUTEX_(mutex)
//
// GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks)
// GTEST_LOCK_EXCLUDED_(locks)
//
// ** Custom implementation starts here **
#ifndef GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_
#define GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_
#endif // GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_
#if !defined(GTEST_DEV_EMAIL_)
# define GTEST_DEV_EMAIL_ "googletestframework@@googlegroups.com"
# define GTEST_FLAG_PREFIX_ "gtest_"
# define GTEST_FLAG_PREFIX_DASH_ "gtest-"
# define GTEST_FLAG_PREFIX_UPPER_ "GTEST_"
# define GTEST_NAME_ "Google Test"
# define GTEST_PROJECT_URL_ "http://code.google.com/p/googletest/"
#endif // !defined(GTEST_DEV_EMAIL_)
#if !defined(GTEST_INIT_GOOGLE_TEST_NAME_)
# define GTEST_INIT_GOOGLE_TEST_NAME_ "testing::InitGoogleTest"
#endif // !defined(GTEST_INIT_GOOGLE_TEST_NAME_)
// Determines the version of gcc that is used to compile this.
#ifdef __GNUC__
// 40302 means version 4.3.2.
# define GTEST_GCC_VER_ \
(__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__)
#endif // __GNUC__
// Macros for disabling Microsoft Visual C++ warnings.
//
// GTEST_DISABLE_MSC_WARNINGS_PUSH_(4800 4385)
// /* code that triggers warnings C4800 and C4385 */
// GTEST_DISABLE_MSC_WARNINGS_POP_()
#if _MSC_VER >= 1500
# define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings) \
__pragma(warning(push)) \
__pragma(warning(disable: warnings))
# define GTEST_DISABLE_MSC_WARNINGS_POP_() \
__pragma(warning(pop))
#else
// Older versions of MSVC don't have __pragma.
# define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings)
# define GTEST_DISABLE_MSC_WARNINGS_POP_()
#endif
#ifndef GTEST_LANG_CXX11
// gcc and clang define __GXX_EXPERIMENTAL_CXX0X__ when
// -std={c,gnu}++{0x,11} is passed. The C++11 standard specifies a
// value for __cplusplus, and recent versions of clang, gcc, and
// probably other compilers set that too in C++11 mode.
# if __GXX_EXPERIMENTAL_CXX0X__ || __cplusplus >= 201103L
// Compiling in at least C++11 mode.
# define GTEST_LANG_CXX11 1
# else
# define GTEST_LANG_CXX11 0
# endif
#endif
// Distinct from C++11 language support, some environments don't provide
// proper C++11 library support. Notably, it's possible to build in
// C++11 mode when targeting Mac OS X 10.6, which has an old libstdc++
// with no C++11 support.
//
// libstdc++ has sufficient C++11 support as of GCC 4.6.0, __GLIBCXX__
// 20110325, but maintenance releases in the 4.4 and 4.5 series followed
// this date, so check for those versions by their date stamps.
// https://gcc.gnu.org/onlinedocs/libstdc++/manual/abi.html#abi.versioning
#if GTEST_LANG_CXX11 && \
(!defined(__GLIBCXX__) || ( \
__GLIBCXX__ >= 20110325ul && /* GCC >= 4.6.0 */ \
/* Blacklist of patch releases of older branches: */ \
__GLIBCXX__ != 20110416ul && /* GCC 4.4.6 */ \
__GLIBCXX__ != 20120313ul && /* GCC 4.4.7 */ \
__GLIBCXX__ != 20110428ul && /* GCC 4.5.3 */ \
__GLIBCXX__ != 20120702ul)) /* GCC 4.5.4 */
# define GTEST_STDLIB_CXX11 1
#endif
// Only use C++11 library features if the library provides them.
#if GTEST_STDLIB_CXX11
# define GTEST_HAS_STD_BEGIN_AND_END_ 1
# define GTEST_HAS_STD_FORWARD_LIST_ 1
# define GTEST_HAS_STD_FUNCTION_ 1
# define GTEST_HAS_STD_INITIALIZER_LIST_ 1
# define GTEST_HAS_STD_MOVE_ 1
# define GTEST_HAS_STD_SHARED_PTR_ 1
# define GTEST_HAS_STD_TYPE_TRAITS_ 1
# define GTEST_HAS_STD_UNIQUE_PTR_ 1
#endif
// C++11 specifies that <tuple> provides std::tuple.
// Some platforms still might not have it, however.
#if GTEST_LANG_CXX11
# define GTEST_HAS_STD_TUPLE_ 1
# if defined(__clang__)
// Inspired by http://clang.llvm.org/docs/LanguageExtensions.html#__has_include
# if defined(__has_include) && !__has_include(<tuple>)
# undef GTEST_HAS_STD_TUPLE_
# endif
# elif defined(_MSC_VER)
// Inspired by boost/config/stdlib/dinkumware.hpp
# if defined(_CPPLIB_VER) && _CPPLIB_VER < 520
# undef GTEST_HAS_STD_TUPLE_
# endif
# elif defined(__GLIBCXX__)
// Inspired by boost/config/stdlib/libstdcpp3.hpp,
// http://gcc.gnu.org/gcc-4.2/changes.html and
// http://gcc.gnu.org/onlinedocs/libstdc++/manual/bk01pt01ch01.html#manual.intro.status.standard.200x
# if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 2)
# undef GTEST_HAS_STD_TUPLE_
# endif
# endif
#endif
// Brings in definitions for functions used in the testing::internal::posix
// namespace (read, write, close, chdir, isatty, stat). We do not currently
// use them on Windows Mobile.
#if GTEST_OS_WINDOWS
# if !GTEST_OS_WINDOWS_MOBILE
# include <direct.h>
# include <io.h>
# endif
// In order to avoid having to include <windows.h>, use forward declaration
// assuming CRITICAL_SECTION is a typedef of _RTL_CRITICAL_SECTION.
// This assumption is verified by
// WindowsTypesTest.CRITICAL_SECTIONIs_RTL_CRITICAL_SECTION.
struct _RTL_CRITICAL_SECTION;
#else
// This assumes that non-Windows OSes provide unistd.h. For OSes where this
// is not the case, we need to include headers that provide the functions
// mentioned above.
# include <unistd.h>
# include <strings.h>
#endif // GTEST_OS_WINDOWS
#if GTEST_OS_LINUX_ANDROID
// Used to define __ANDROID_API__ matching the target NDK API level.
# include <android/api-level.h> // NOLINT
#endif
// Defines this to true iff Google Test can use POSIX regular expressions.
#ifndef GTEST_HAS_POSIX_RE
# if GTEST_OS_LINUX_ANDROID
// On Android, <regex.h> is only available starting with Gingerbread.
# define GTEST_HAS_POSIX_RE (__ANDROID_API__ >= 9)
# else
# define GTEST_HAS_POSIX_RE (!GTEST_OS_WINDOWS)
# endif
#endif
#if GTEST_USES_PCRE
// The appropriate headers have already been included.
#elif GTEST_HAS_POSIX_RE
// On some platforms, <regex.h> needs someone to define size_t, and
// won't compile otherwise. We can #include it here as we already
// included <stdlib.h>, which is guaranteed to define size_t through
// <stddef.h>.
# include <regex.h> // NOLINT
# define GTEST_USES_POSIX_RE 1
#elif GTEST_OS_WINDOWS
// <regex.h> is not available on Windows. Use our own simple regex
// implementation instead.
# define GTEST_USES_SIMPLE_RE 1
#else
// <regex.h> may not be available on this platform. Use our own
// simple regex implementation instead.
# define GTEST_USES_SIMPLE_RE 1
#endif // GTEST_USES_PCRE
#ifndef GTEST_HAS_EXCEPTIONS
// The user didn't tell us whether exceptions are enabled, so we need
// to figure it out.
# if defined(_MSC_VER) || defined(__BORLANDC__)
// MSVC's and C++Builder's implementations of the STL use the _HAS_EXCEPTIONS
// macro to enable exceptions, so we'll do the same.
// Assumes that exceptions are enabled by default.
# ifndef _HAS_EXCEPTIONS
# define _HAS_EXCEPTIONS 1
# endif // _HAS_EXCEPTIONS
# define GTEST_HAS_EXCEPTIONS _HAS_EXCEPTIONS
# elif defined(__clang__)
// clang defines __EXCEPTIONS iff exceptions are enabled before clang 220714,
// but iff cleanups are enabled after that. In Obj-C++ files, there can be
// cleanups for ObjC exceptions which also need cleanups, even if C++ exceptions
// are disabled. clang has __has_feature(cxx_exceptions) which checks for C++
// exceptions starting at clang r206352, but which checked for cleanups prior to
// that. To reliably check for C++ exception availability with clang, check for
// __EXCEPTIONS && __has_feature(cxx_exceptions).
# define GTEST_HAS_EXCEPTIONS (__EXCEPTIONS && __has_feature(cxx_exceptions))
# elif defined(__GNUC__) && __EXCEPTIONS
// gcc defines __EXCEPTIONS to 1 iff exceptions are enabled.
# define GTEST_HAS_EXCEPTIONS 1
# elif defined(__SUNPRO_CC)
// Sun Pro CC supports exceptions. However, there is no compile-time way of
// detecting whether they are enabled or not. Therefore, we assume that
// they are enabled unless the user tells us otherwise.
# define GTEST_HAS_EXCEPTIONS 1
# elif defined(__IBMCPP__) && __EXCEPTIONS
// xlC defines __EXCEPTIONS to 1 iff exceptions are enabled.
# define GTEST_HAS_EXCEPTIONS 1
# elif defined(__HP_aCC)
// Exception handling is in effect by default in HP aCC compiler. It has to
// be turned of by +noeh compiler option if desired.
# define GTEST_HAS_EXCEPTIONS 1
# else
// For other compilers, we assume exceptions are disabled to be
// conservative.
# define GTEST_HAS_EXCEPTIONS 0
# endif // defined(_MSC_VER) || defined(__BORLANDC__)
#endif // GTEST_HAS_EXCEPTIONS
#if !defined(GTEST_HAS_STD_STRING)
// Even though we don't use this macro any longer, we keep it in case
// some clients still depend on it.
# define GTEST_HAS_STD_STRING 1
#elif !GTEST_HAS_STD_STRING
// The user told us that ::std::string isn't available.
# error "Google Test cannot be used where ::std::string isn't available."
#endif // !defined(GTEST_HAS_STD_STRING)
#ifndef GTEST_HAS_GLOBAL_STRING
// The user didn't tell us whether ::string is available, so we need
// to figure it out.
# define GTEST_HAS_GLOBAL_STRING 0
#endif // GTEST_HAS_GLOBAL_STRING
#ifndef GTEST_HAS_STD_WSTRING
// The user didn't tell us whether ::std::wstring is available, so we need
// to figure it out.
// TODO(wan@google.com): uses autoconf to detect whether ::std::wstring
// is available.
// Cygwin 1.7 and below doesn't support ::std::wstring.
// Solaris' libc++ doesn't support it either. Android has
// no support for it at least as recent as Froyo (2.2).
# define GTEST_HAS_STD_WSTRING \
(!(GTEST_OS_LINUX_ANDROID || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS))
#endif // GTEST_HAS_STD_WSTRING
#ifndef GTEST_HAS_GLOBAL_WSTRING
// The user didn't tell us whether ::wstring is available, so we need
// to figure it out.
# define GTEST_HAS_GLOBAL_WSTRING \
(GTEST_HAS_STD_WSTRING && GTEST_HAS_GLOBAL_STRING)
#endif // GTEST_HAS_GLOBAL_WSTRING
// Determines whether RTTI is available.
#ifndef GTEST_HAS_RTTI
// The user didn't tell us whether RTTI is enabled, so we need to
// figure it out.
# ifdef _MSC_VER
# ifdef _CPPRTTI // MSVC defines this macro iff RTTI is enabled.
# define GTEST_HAS_RTTI 1
# else
# define GTEST_HAS_RTTI 0
# endif
// Starting with version 4.3.2, gcc defines __GXX_RTTI iff RTTI is enabled.
# elif defined(__GNUC__) && (GTEST_GCC_VER_ >= 40302)
# ifdef __GXX_RTTI
// When building against STLport with the Android NDK and with
// -frtti -fno-exceptions, the build fails at link time with undefined
// references to __cxa_bad_typeid. Note sure if STL or toolchain bug,
// so disable RTTI when detected.
# if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR) && \
!defined(__EXCEPTIONS)
# define GTEST_HAS_RTTI 0
# else
# define GTEST_HAS_RTTI 1
# endif // GTEST_OS_LINUX_ANDROID && __STLPORT_MAJOR && !__EXCEPTIONS
# else
# define GTEST_HAS_RTTI 0
# endif // __GXX_RTTI
// Clang defines __GXX_RTTI starting with version 3.0, but its manual recommends
// using has_feature instead. has_feature(cxx_rtti) is supported since 2.7, the
// first version with C++ support.
# elif defined(__clang__)
# define GTEST_HAS_RTTI __has_feature(cxx_rtti)
// Starting with version 9.0 IBM Visual Age defines __RTTI_ALL__ to 1 if
// both the typeid and dynamic_cast features are present.
# elif defined(__IBMCPP__) && (__IBMCPP__ >= 900)
# ifdef __RTTI_ALL__
# define GTEST_HAS_RTTI 1
# else
# define GTEST_HAS_RTTI 0
# endif
# else
// For all other compilers, we assume RTTI is enabled.
# define GTEST_HAS_RTTI 1
# endif // _MSC_VER
#endif // GTEST_HAS_RTTI
// It's this header's responsibility to #include <typeinfo> when RTTI
// is enabled.
#if GTEST_HAS_RTTI
# include <typeinfo>
#endif
// Determines whether Google Test can use the pthreads library.
#ifndef GTEST_HAS_PTHREAD
// The user didn't tell us explicitly, so we make reasonable assumptions about
// which platforms have pthreads support.
//
// To disable threading support in Google Test, add -DGTEST_HAS_PTHREAD=0
// to your compiler flags.
# define GTEST_HAS_PTHREAD (GTEST_OS_LINUX || GTEST_OS_MAC || GTEST_OS_HPUX \
|| GTEST_OS_QNX || GTEST_OS_FREEBSD || GTEST_OS_NACL)
#endif // GTEST_HAS_PTHREAD
#if GTEST_HAS_PTHREAD
// gtest-port.h guarantees to #include <pthread.h> when GTEST_HAS_PTHREAD is
// true.
# include <pthread.h> // NOLINT
// For timespec and nanosleep, used below.
# include <time.h> // NOLINT
#endif
// Determines if hash_map/hash_set are available.
// Only used for testing against those containers.
#if !defined(GTEST_HAS_HASH_MAP_)
# if _MSC_VER
# define GTEST_HAS_HASH_MAP_ 1 // Indicates that hash_map is available.
# define GTEST_HAS_HASH_SET_ 1 // Indicates that hash_set is available.
# endif // _MSC_VER
#endif // !defined(GTEST_HAS_HASH_MAP_)
// Determines whether Google Test can use tr1/tuple. You can define
// this macro to 0 to prevent Google Test from using tuple (any
// feature depending on tuple with be disabled in this mode).
#ifndef GTEST_HAS_TR1_TUPLE
# if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR)
// STLport, provided with the Android NDK, has neither <tr1/tuple> or <tuple>.
# define GTEST_HAS_TR1_TUPLE 0
# else
// The user didn't tell us not to do it, so we assume it's OK.
# define GTEST_HAS_TR1_TUPLE 1
# endif
#endif // GTEST_HAS_TR1_TUPLE
// Determines whether Google Test's own tr1 tuple implementation
// should be used.
#ifndef GTEST_USE_OWN_TR1_TUPLE
// The user didn't tell us, so we need to figure it out.
// We use our own TR1 tuple if we aren't sure the user has an
// implementation of it already. At this time, libstdc++ 4.0.0+ and
// MSVC 2010 are the only mainstream standard libraries that come
// with a TR1 tuple implementation. NVIDIA's CUDA NVCC compiler
// pretends to be GCC by defining __GNUC__ and friends, but cannot
// compile GCC's tuple implementation. MSVC 2008 (9.0) provides TR1
// tuple in a 323 MB Feature Pack download, which we cannot assume the
// user has. QNX's QCC compiler is a modified GCC but it doesn't
// support TR1 tuple. libc++ only provides std::tuple, in C++11 mode,
// and it can be used with some compilers that define __GNUC__.
# if (defined(__GNUC__) && !defined(__CUDACC__) && (GTEST_GCC_VER_ >= 40000) \
&& !GTEST_OS_QNX && !defined(_LIBCPP_VERSION)) || _MSC_VER >= 1600
# define GTEST_ENV_HAS_TR1_TUPLE_ 1
# endif
// C++11 specifies that <tuple> provides std::tuple. Use that if gtest is used
// in C++11 mode and libstdc++ isn't very old (binaries targeting OS X 10.6
// can build with clang but need to use gcc4.2's libstdc++).
# if GTEST_LANG_CXX11 && (!defined(__GLIBCXX__) || __GLIBCXX__ > 20110325)
# define GTEST_ENV_HAS_STD_TUPLE_ 1
# endif
# if GTEST_ENV_HAS_TR1_TUPLE_ || GTEST_ENV_HAS_STD_TUPLE_
# define GTEST_USE_OWN_TR1_TUPLE 0
# else
# define GTEST_USE_OWN_TR1_TUPLE 1
# endif
#endif // GTEST_USE_OWN_TR1_TUPLE
// To avoid conditional compilation everywhere, we make it
// gtest-port.h's responsibility to #include the header implementing
// tuple.
#if GTEST_HAS_STD_TUPLE_
# include <tuple> // IWYU pragma: export
# define GTEST_TUPLE_NAMESPACE_ ::std
#endif // GTEST_HAS_STD_TUPLE_
// We include tr1::tuple even if std::tuple is available to define printers for
// them.
#if GTEST_HAS_TR1_TUPLE
# ifndef GTEST_TUPLE_NAMESPACE_
# define GTEST_TUPLE_NAMESPACE_ ::std::tr1
# endif // GTEST_TUPLE_NAMESPACE_
# if GTEST_USE_OWN_TR1_TUPLE
// This file was GENERATED by command:
// pump.py gtest-tuple.h.pump
// DO NOT EDIT BY HAND!!!
// Copyright 2009 Google Inc.
// All Rights Reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Author: wan@google.com (Zhanyong Wan)
// Implements a subset of TR1 tuple needed by Google Test and Google Mock.
#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_
#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_
#include <utility> // For ::std::pair.
// The compiler used in Symbian has a bug that prevents us from declaring the
// tuple template as a friend (it complains that tuple is redefined). This
// hack bypasses the bug by declaring the members that should otherwise be
// private as public.
// Sun Studio versions < 12 also have the above bug.
#if defined(__SYMBIAN32__) || (defined(__SUNPRO_CC) && __SUNPRO_CC < 0x590)
# define GTEST_DECLARE_TUPLE_AS_FRIEND_ public:
#else
# define GTEST_DECLARE_TUPLE_AS_FRIEND_ \
    template <GTEST_10_TYPENAMES_(U)> friend class tuple; \
   private:
#endif
// Visual Studio 2010, 2012, and 2013 define symbols in std::tr1 that conflict
// with our own definitions. Therefore using our own tuple does not work on
// those compilers.
#if defined(_MSC_VER) && _MSC_VER >= 1600 /* 1600 is Visual Studio 2010 */
# error "gtest's tuple doesn't compile on Visual Studio 2010 or later. \
GTEST_USE_OWN_TR1_TUPLE must be set to 0 on those compilers."
#endif
// GTEST_n_TUPLE_(T) is the type of an n-tuple.
#define GTEST_0_TUPLE_(T)  tuple<>
#define GTEST_1_TUPLE_(T)  tuple<T##0, void, void, void, void, void, void, \
    void, void, void>
#define GTEST_2_TUPLE_(T)  tuple<T##0, T##1, void, void, void, void, void, \
    void, void, void>
#define GTEST_3_TUPLE_(T)  tuple<T##0, T##1, T##2, void, void, void, void, \
    void, void, void>
#define GTEST_4_TUPLE_(T)  tuple<T##0, T##1, T##2, T##3, void, void, void, \
    void, void, void>
#define GTEST_5_TUPLE_(T)  tuple<T##0, T##1, T##2, T##3, T##4, void, void, \
    void, void, void>
#define GTEST_6_TUPLE_(T)  tuple<T##0, T##1, T##2, T##3, T##4, T##5, void, \
    void, void, void>
#define GTEST_7_TUPLE_(T)  tuple<T##0, T##1, T##2, T##3, T##4, T##5, T##6, \
    void, void, void>
#define GTEST_8_TUPLE_(T)  tuple<T##0, T##1, T##2, T##3, T##4, T##5, T##6, \
    T##7, void, void>
#define GTEST_9_TUPLE_(T)  tuple<T##0, T##1, T##2, T##3, T##4, T##5, T##6, \
    T##7, T##8, void>
#define GTEST_10_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, T##6, \
    T##7, T##8, T##9>
// GTEST_n_TYPENAMES_(T) declares a list of n typenames.
#define GTEST_0_TYPENAMES_(T)
#define GTEST_1_TYPENAMES_(T) typename T##0
#define GTEST_2_TYPENAMES_(T) typename T##0, typename T##1
#define GTEST_3_TYPENAMES_(T) typename T##0, typename T##1, typename T##2
#define GTEST_4_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
typename T##3
#define GTEST_5_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
typename T##3, typename T##4
#define GTEST_6_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
typename T##3, typename T##4, typename T##5
#define GTEST_7_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
typename T##3, typename T##4, typename T##5, typename T##6
#define GTEST_8_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
typename T##3, typename T##4, typename T##5, typename T##6, typename T##7
#define GTEST_9_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
typename T##3, typename T##4, typename T##5, typename T##6, \
typename T##7, typename T##8
#define GTEST_10_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
typename T##3, typename T##4, typename T##5, typename T##6, \
typename T##7, typename T##8, typename T##9
// In theory, defining stuff in the ::std namespace is undefined
// behavior. We can do this as we are playing the role of a standard
// library vendor.
namespace std {
namespace tr1 {
template <GTEST_10_TYPENAMES_(T)>
class tuple;
// Anything in namespace gtest_internal is Google Test's INTERNAL
// IMPLEMENTATION DETAIL and MUST NOT BE USED DIRECTLY in user code.
namespace gtest_internal {
// ByRef