Repository: NVIDIA-Merlin/HierarchicalKV Branch: master Commit: ae24eecde0b4 Files: 104 Total size: 2.0 MB Directory structure: gitextract_3c35qd95/ ├── .bazeliskrc ├── .bazelrc ├── .clang-format ├── .github/ │ └── workflows/ │ ├── blossom-ci.yml │ ├── docs-build.yaml │ ├── docs-preview-pr.yaml │ ├── docs-remove-stale-reviews.yaml │ └── docs-sched-rebuild.yaml ├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── STYLE_GUIDE.md ├── WORKSPACE ├── bazel_build.sh ├── benchmark/ │ ├── BUILD │ ├── benchmark_util.cuh │ ├── dual_bucket_benchmark.cc.cu │ ├── find_with_missed_keys_benchmark.cc.cu │ └── merlin_hashtable_benchmark.cc.cu ├── build_deps/ │ ├── gpus/ │ │ ├── BUILD │ │ ├── check_cuda_libs.py │ │ ├── configure.bzl │ │ ├── crosstool/ │ │ │ ├── BUILD │ │ │ ├── BUILD.tpl │ │ │ ├── cc_toolchain_config.bzl.tpl │ │ │ └── crosstool_compiler_wrapper.tpl │ │ ├── cuda/ │ │ │ ├── BUILD │ │ │ ├── BUILD.tpl │ │ │ ├── build_defs.bzl.tpl │ │ │ ├── cuda_config.h.tpl │ │ │ └── cuda_config.py.tpl │ │ └── find_cuda_config.py │ └── remote_config/ │ ├── BUILD │ ├── BUILD.tpl │ ├── common.bzl │ └── remote_platform_configure.bzl ├── cmake/ │ └── modules/ │ └── ClangFormat.cmake ├── docs/ │ ├── Makefile │ ├── README.md │ ├── make.bat │ ├── requirements-doc.txt │ └── source/ │ ├── _static/ │ │ ├── .gitkeep │ │ └── css/ │ │ ├── banner.css │ │ └── custom.css │ ├── _templates/ │ │ ├── footer.html │ │ └── versions.html │ ├── conf.py │ ├── index.rst │ └── toc.yaml ├── include/ │ ├── BUILD │ ├── merlin/ │ │ ├── BUILD │ │ ├── allocator.cuh │ │ ├── array_kernels.cuh │ │ ├── core_kernels/ │ │ │ ├── BUILD │ │ │ ├── accum_or_assign.cuh │ │ │ ├── contains.cuh │ │ │ ├── dual_bucket_lookup.cuh │ │ │ ├── dual_bucket_upsert.cuh │ │ │ ├── dual_bucket_utils.cuh │ │ │ ├── find_or_insert.cuh │ │ │ ├── find_ptr_or_insert.cuh │ │ │ ├── group_lock_kernels.cuh │ │ │ ├── kernel_utils.cuh │ │ │ ├── lookup.cuh │ │ │ ├── lookup_ptr.cuh │ │ │ ├── update.cuh │ │ │ ├── 
update_score.cuh │ │ │ ├── update_values.cuh │ │ │ ├── upsert.cuh │ │ │ └── upsert_and_evict.cuh │ │ ├── core_kernels.cuh │ │ ├── debug.hpp │ │ ├── flexible_buffer.cuh │ │ ├── group_lock.cuh │ │ ├── memory_pool.cuh │ │ ├── multi_vector.hpp │ │ ├── optimizers.cuh │ │ ├── types.cuh │ │ └── utils.cuh │ ├── merlin_hashtable.cuh │ └── merlin_localfile.hpp ├── run_all_tests.sh └── tests/ ├── accum_or_assign_test.cc.cu ├── assign_score_test.cc.cu ├── assign_values_test.cc.cu ├── dual_bucket_test.cc.cu ├── dynamic_max_capacity_test.cc.cu ├── export_batch_if_test.cc.cu ├── find_or_insert_ptr_lock_test.cc.cu ├── find_or_insert_ptr_test.cc.cu ├── find_or_insert_test.cc.cu ├── find_with_missed_keys_test.cc.cu ├── group_lock_test.cc.cu ├── insert_and_evict_test.cc.cu ├── lock_unlock_test.cc.cu ├── memory_pool_test.cc.cu ├── merlin_hashtable_test.cc.cu ├── reserved_keys_test.cc.cu ├── save_and_load_test.cc.cu ├── test_util.cuh └── uint32_score_test.cc.cu ================================================ FILE CONTENTS ================================================ ================================================ FILE: .bazeliskrc ================================================ USE_BAZEL_VERSION=5.0.0 ================================================ FILE: .bazelrc ================================================ build -c opt build --copt -O3 build --copt -pthread build --linkopt -pthread build --linkopt -ldl build --incompatible_linkopts_to_linklibs build --copt -g --strip=never build --experimental_repo_remote_exec # By default, build HKV in C++ 17 mode. build --cxxopt=-std=c++17 build --host_cxxopt=-std=c++17 # This config refers to building CUDA kernels with nvcc. 
build:cuda --crosstool_top=@local_config_cuda//crosstool:toolchain # CUDA options build:cuda --action_env GCC_HOST_COMPILER_PATH="/usr/bin/gcc" build:cuda --action_env CUDA_TOOLKIT_PATH="/usr/local/cuda" build:cuda --action_env CUDA_VERSION="11" build:cuda --action_env CUDNN_VERSION="8" build:cuda --action_env CUDNN_INSTALL_PATH="/usr/" build:cuda --action_env CUDA_COMPUTE_CAPABILITIES="7.5" ================================================ FILE: .clang-format ================================================ BasedOnStyle: Google DerivePointerAlignment: false IncludeBlocks: Merge SortIncludes: true ================================================ FILE: .github/workflows/blossom-ci.yml ================================================ # Copyright (c) 2020-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# A workflow to trigger ci on hybrid infra (github + self hosted runner) name: Blossom-CI on: issue_comment: types: [created] workflow_dispatch: inputs: platform: description: 'runs-on argument' required: false args: description: 'argument' required: false jobs: Authorization: name: Authorization runs-on: blossom outputs: args: ${{ env.args }} # This job only runs for pull request comments if: | (github.actor == 'EmmaQiaoCh' || github.actor == 'rhdong' || github.actor == 'Ranjeet-Nvidia' || github.actor == 'jiashuy') && github.event.comment.body == '/blossom-ci' steps: - name: Check if comment is issued by authorized person run: blossom-ci env: OPERATION: 'AUTH' REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }} REPO_KEY_DATA: ${{ secrets.BLOSSOM_KEY }} Vulnerability-scan: name: Vulnerability scan needs: [Authorization] runs-on: ubuntu-latest steps: - name: Checkout code uses: actions/checkout@v2 with: repository: ${{ fromJson(needs.Authorization.outputs.args).repo }} ref: ${{ fromJson(needs.Authorization.outputs.args).ref }} lfs: 'true' - name: Run blossom action uses: NVIDIA/blossom-action@main env: REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }} REPO_KEY_DATA: ${{ secrets.BLOSSOM_KEY }} with: args1: ${{ fromJson(needs.Authorization.outputs.args).args1 }} args2: ${{ fromJson(needs.Authorization.outputs.args).args2 }} args3: ${{ fromJson(needs.Authorization.outputs.args).args3 }} Job-trigger: name: Start ci job needs: [Vulnerability-scan] runs-on: blossom steps: - name: Start ci job run: blossom-ci env: OPERATION: 'START-CI-JOB' CI_SERVER: ${{ secrets.CI_SERVER }} REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }} Upload-Log: name: Upload log runs-on: blossom if : github.event_name == 'workflow_dispatch' steps: - name: Jenkins log for pull request ${{ fromJson(github.event.inputs.args).pr }} (click here) run: blossom-ci env: OPERATION: 'POST-PROCESSING' CI_SERVER: ${{ secrets.CI_SERVER }} REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }} ================================================ FILE: 
.github/workflows/docs-build.yaml ================================================ name: docs-build on: pull_request: branches: [master] jobs: build: runs-on: "ubuntu-latest" steps: - uses: actions/checkout@v3 - name: Set up Python 3.8 uses: actions/setup-python@v4 with: python-version: '3.8' - name: Install Ubuntu packages run: | sudo apt-get update -y sudo apt-get install -y --no-install-recommends doxygen - name: Install dependencies run: | python -m pip install -r docs/requirements-doc.txt - name: Building docs run: | make -C docs html - name: Upload HTML uses: actions/upload-artifact@v4 with: name: html-build-artifact path: docs/build/html if-no-files-found: error retention-days: 1 - name: Store PR information run: | mkdir ./pr echo ${{ github.event.number }} > ./pr/pr.txt echo ${{ github.event.pull_request.merged }} > ./pr/merged.txt echo ${{ github.event.action }} > ./pr/action.txt - name: Upload PR information uses: actions/upload-artifact@v4 with: name: pr path: pr/ ================================================ FILE: .github/workflows/docs-preview-pr.yaml ================================================ name: docs-preview-pr on: workflow_run: workflows: [docs-build] types: [completed] env: WF_ID: ${{ github.event.workflow_run.id }} jobs: preview: uses: nvidia-merlin/.github/.github/workflows/docs-preview-pr-common.yaml@main ================================================ FILE: .github/workflows/docs-remove-stale-reviews.yaml ================================================ name: docs-remove-stale-reviews on: schedule: # 42 minutes after 0:00 UTC on Sundays - cron: "42 0 * * 0" workflow_dispatch: jobs: remove: uses: nvidia-merlin/.github/.github/workflows/docs-remove-stale-reviews-common.yaml@main ================================================ FILE: .github/workflows/docs-sched-rebuild.yaml ================================================ name: docs-sched-rebuild on: push: branches: [master] tags: - v* workflow_dispatch: jobs: build: runs-on: 
"ubuntu-latest" steps: - uses: actions/checkout@v3 with: fetch-depth: 0 - name: Set up Python 3.8 uses: actions/setup-python@v4 with: python-version: 3.8 - name: Install Ubuntu packages run: | sudo apt-get update -y sudo apt-get install -y doxygen - name: Install dependencies run: | python -m pip install --upgrade pip python -m pip install -r docs/requirements-doc.txt - name: Report the versions to build run: | sphinx-multiversion -D 'exhale_args.containmentFolder=${sourcedir}/api' --dump-metadata docs/source docs/build/html | jq "keys" - name: Building docs (multiversion) run: | sphinx-multiversion -D 'exhale_args.containmentFolder=${sourcedir}/api' docs/source docs/build/html - name: Delete unnecessary files run: | find docs/build -name .doctrees -prune -exec rm -rf {} \; find docs/build -name .buildinfo -exec rm {} \; - name: Upload HTML uses: actions/upload-artifact@v4 with: name: html-build-artifact path: docs/build/html if-no-files-found: error retention-days: 1 # Identify the dir for the HTML. store-html: needs: [build] runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 with: ref: "gh-pages" - name: Initialize Git configuration run: | git config user.name docs-sched-rebuild git config user.email do-not-send-@github.com - name: Download artifacts uses: actions/download-artifact@v4 with: name: html-build-artifact - name: Copy HTML directories run: | ls -asl for i in `ls -d *` do echo "Git adding ${i}" git add "${i}" done - name: Check or create dot-no-jekyll file run: | if [ -f ".nojekyll" ]; then echo "The dot-no-jekyll file already exists." exit 0 fi touch .nojekyll git add .nojekyll - name: Check or create redirect page env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | resp=$(grep 'http-equiv="refresh"' index.html 2>/dev/null) || true if [ -n "${resp}" ]; then echo "The redirect file already exists." exit 0 fi # If any of these commands fail, fail the build. 
def_branch=$(gh api "repos/${GITHUB_REPOSITORY}" --jq ".default_branch") html_url=$(gh api "repos/${GITHUB_REPOSITORY}/pages" --jq ".html_url") # Beware ugly quotation mark avoidance in the foll lines. echo '' > index.html echo '' >> index.html echo ' ' >> index.html echo ' Redirect to documentation' >> index.html echo ' ' >> index.html echo ' ' >> index.html echo ' ' >> index.html echo ' ' >> index.html echo ' ' >> index.html echo ' ' >> index.html echo '

Please follow the link to the ' >> index.html echo ${def_branch}' branch documentation.

' >> index.html echo ' ' >> index.html echo '' >> index.html git add index.html - name: Commit changes to the GitHub Pages branch run: | git status if git commit -m 'Pushing changes to GitHub Pages.'; then git push -f else echo "Nothing changed." fi ================================================ FILE: .gitignore ================================================ .DS_Store .idea .vscode build .clwb cmake-build-debug/ docs/build docs/source/README.md docs/source/CONTRIBUTING.md docs/source/api ================================================ FILE: .gitmodules ================================================ [submodule "tests/googletest"] path = tests/googletest url = https://github.com/google/googletest.git ignore = dirty ================================================ FILE: CMakeLists.txt ================================================ # Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. cmake_minimum_required(VERSION 3.10) project(merlin-hkvs LANGUAGES CXX CUDA) find_package(CUDAToolkit) # TODO(Q3): target_compile_features below still declare cxx_std_14, which is # inconsistent with the project-level C++17. Update them to cxx_std_17 (or # remove the per-target lines entirely) once downstream compatibility is # confirmed. 
set(CMAKE_CXX_STANDARD 17) set(CMAKE_CUDA_STANDARD 17) list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/modules) option(CLANGFORMAT "Clangformat code files before compiling" OFF) if(CLANGFORMAT) include(ClangFormat) file(GLOB_RECURSE clangformat_includes ${PROJECT_SOURCE_DIR}/include/*.h ${PROJECT_SOURCE_DIR}/include/*.hpp ${PROJECT_SOURCE_DIR}/include/*.cuh ) file(GLOB clangformat_tests ${PROJECT_SOURCE_DIR}/tests/*.c ${PROJECT_SOURCE_DIR}/tests/*.h ${PROJECT_SOURCE_DIR}/tests/*.cpp ${PROJECT_SOURCE_DIR}/tests/*.hpp ${PROJECT_SOURCE_DIR}/tests/*.cu ${PROJECT_SOURCE_DIR}/tests/*.cuh ) set(clangformat_files ${clangformat_includes} ${clangformat_tests}) clangformat_setup("${clangformat_files}") endif() # Default to release build. if (NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE "Release") message(STATUS "Setting default CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}") endif() # Some neat defaults. set(CUDA_SEPARABLE_COMPILATION ON) # Select target CUDA binary architecture. foreach(cuda_arch ${sm}) list(APPEND cuda_arch_list ${cuda_arch}) message(STATUS "Assign GPU architecture (sm=${cuda_arch})") endforeach() list(LENGTH cuda_arch_list cuda_arch_list_length) if(cuda_arch_list_length EQUAL 0) list(APPEND cuda_arch_list "80") message(STATUS "Assign default GPU architecture sm=80") endif() if (CMAKE_BUILD_TYPE STREQUAL "Debug") add_compile_definitions(CUDA_ERROR_CHECK) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") endif() foreach(cuda_arch ${cuda_arch_list}) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode arch=compute_${cuda_arch},code=sm_${cuda_arch}") endforeach() message(CMAKE_CUDA_FLAGS="${CMAKE_CUDA_FLAGS}") include_directories( ${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/tests/googletest/googletest/include ) ADD_SUBDIRECTORY(tests/googletest) link_directories( ) file(GLOB_RECURSE merlin_hkvs_src RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.cpp *.cu) # TODO: # add_library(hierarchical_kv STATIC ${hierarchical_kv_src}) # 
# target_compile_features(hierarchical_kv PUBLIC cxx_std_14)
# target_link_libraries(hierarchical_kv PUBLIC ...)

# Benchmarks: standalone executables, not linked against gtest_main.
add_executable(merlin_hashtable_benchmark benchmark/merlin_hashtable_benchmark.cc.cu)
target_compile_features(merlin_hashtable_benchmark PUBLIC cxx_std_14)
set_target_properties(merlin_hashtable_benchmark PROPERTIES CUDA_ARCHITECTURES OFF)

add_executable(find_with_missed_keys_benchmark benchmark/find_with_missed_keys_benchmark.cc.cu)
target_compile_features(find_with_missed_keys_benchmark PUBLIC cxx_std_14)
set_target_properties(find_with_missed_keys_benchmark PROPERTIES CUDA_ARCHITECTURES OFF)

# Unit tests: one executable per test source, each linked against gtest_main.
# CUDA_ARCHITECTURES is OFF because the -gencode flags are already appended to
# CMAKE_CUDA_FLAGS from the `sm` list earlier in this file.
add_executable(merlin_hashtable_test tests/merlin_hashtable_test.cc.cu)
target_compile_features(merlin_hashtable_test PUBLIC cxx_std_14)
set_target_properties(merlin_hashtable_test PROPERTIES CUDA_ARCHITECTURES OFF)
TARGET_LINK_LIBRARIES(merlin_hashtable_test gtest_main)

add_executable(find_or_insert_test tests/find_or_insert_test.cc.cu)
target_compile_features(find_or_insert_test PUBLIC cxx_std_14)
set_target_properties(find_or_insert_test PROPERTIES CUDA_ARCHITECTURES OFF)
TARGET_LINK_LIBRARIES(find_or_insert_test gtest_main)

add_executable(merlin_memory_pool_test tests/memory_pool_test.cc.cu)
target_compile_features(merlin_memory_pool_test PUBLIC cxx_std_14)
set_target_properties(merlin_memory_pool_test PROPERTIES CUDA_ARCHITECTURES OFF)
TARGET_LINK_LIBRARIES(merlin_memory_pool_test gtest_main)

# NOTE: CMAKE_BUILD_TYPE must not be overridden here. It is a project-wide
# setting (defaulted to "Release" above when the user leaves it unset), and
# forcing "Debug" at this point would silently switch every target to Debug
# while skipping the Debug-only CUDA_ERROR_CHECK / -lineinfo configuration.
# Pass -DCMAKE_BUILD_TYPE=Debug on the command line instead.
add_executable(save_and_load_test tests/save_and_load_test.cc.cu)
target_compile_features(save_and_load_test PUBLIC cxx_std_14)
set_target_properties(save_and_load_test PROPERTIES CUDA_ARCHITECTURES OFF)
TARGET_LINK_LIBRARIES(save_and_load_test gtest_main)

add_executable(insert_and_evict_test tests/insert_and_evict_test.cc.cu)
target_compile_features(insert_and_evict_test PUBLIC cxx_std_14)
set_target_properties(insert_and_evict_test PROPERTIES CUDA_ARCHITECTURES OFF)
TARGET_LINK_LIBRARIES(insert_and_evict_test gtest_main)
# Unit tests (continued): one executable per test source, each linked against
# gtest_main.
add_executable(dynamic_max_capacity_test tests/dynamic_max_capacity_test.cc.cu)
target_compile_features(dynamic_max_capacity_test PUBLIC cxx_std_14)
set_target_properties(dynamic_max_capacity_test PROPERTIES CUDA_ARCHITECTURES OFF)
TARGET_LINK_LIBRARIES(dynamic_max_capacity_test gtest_main)

add_executable(group_lock_test tests/group_lock_test.cc.cu)
target_compile_features(group_lock_test PUBLIC cxx_std_14)
set_target_properties(group_lock_test PROPERTIES CUDA_ARCHITECTURES OFF)
TARGET_LINK_LIBRARIES(group_lock_test gtest_main)

add_executable(find_or_insert_ptr_test tests/find_or_insert_ptr_test.cc.cu)
target_compile_features(find_or_insert_ptr_test PUBLIC cxx_std_14)
set_target_properties(find_or_insert_ptr_test PROPERTIES CUDA_ARCHITECTURES OFF)
TARGET_LINK_LIBRARIES(find_or_insert_ptr_test gtest_main)

add_executable(assign_score_test tests/assign_score_test.cc.cu)
target_compile_features(assign_score_test PUBLIC cxx_std_14)
set_target_properties(assign_score_test PROPERTIES CUDA_ARCHITECTURES OFF)
TARGET_LINK_LIBRARIES(assign_score_test gtest_main)

add_executable(uint32_score_test tests/uint32_score_test.cc.cu)
target_compile_features(uint32_score_test PUBLIC cxx_std_14)
set_target_properties(uint32_score_test PROPERTIES CUDA_ARCHITECTURES OFF)
TARGET_LINK_LIBRARIES(uint32_score_test gtest_main)

# The source file is tests/accum_or_assign_test.cc.cu; the ".cu" suffix is
# required both for the path to exist and for CMake to compile it as CUDA.
add_executable(accum_or_assign_test tests/accum_or_assign_test.cc.cu)
target_compile_features(accum_or_assign_test PUBLIC cxx_std_14)
set_target_properties(accum_or_assign_test PROPERTIES CUDA_ARCHITECTURES OFF)
TARGET_LINK_LIBRARIES(accum_or_assign_test gtest_main)

add_executable(assign_values_test tests/assign_values_test.cc.cu)
target_compile_features(assign_values_test PUBLIC cxx_std_14)
set_target_properties(assign_values_test PROPERTIES CUDA_ARCHITECTURES OFF)
TARGET_LINK_LIBRARIES(assign_values_test gtest_main)

add_executable(find_with_missed_keys_test tests/find_with_missed_keys_test.cc.cu)
target_compile_features(find_with_missed_keys_test PUBLIC cxx_std_14)
set_target_properties(find_with_missed_keys_test PROPERTIES CUDA_ARCHITECTURES OFF)
TARGET_LINK_LIBRARIES(find_with_missed_keys_test gtest_main)

add_executable(reserved_keys_test tests/reserved_keys_test.cc.cu)
target_compile_features(reserved_keys_test PUBLIC cxx_std_14)
set_target_properties(reserved_keys_test PROPERTIES CUDA_ARCHITECTURES OFF)
TARGET_LINK_LIBRARIES(reserved_keys_test gtest_main)

# Linked against gtest_main like every other target built from tests/.
add_executable(export_batch_if_test tests/export_batch_if_test.cc.cu)
target_compile_features(export_batch_if_test PUBLIC cxx_std_14)
set_target_properties(export_batch_if_test PROPERTIES CUDA_ARCHITECTURES OFF)
TARGET_LINK_LIBRARIES(export_batch_if_test gtest_main)

add_executable(find_or_insert_ptr_lock_test tests/find_or_insert_ptr_lock_test.cc.cu)
target_compile_features(find_or_insert_ptr_lock_test PUBLIC cxx_std_14)
set_target_properties(find_or_insert_ptr_lock_test PROPERTIES CUDA_ARCHITECTURES OFF)
TARGET_LINK_LIBRARIES(find_or_insert_ptr_lock_test gtest_main)

add_executable(lock_unlock_test tests/lock_unlock_test.cc.cu)
target_compile_features(lock_unlock_test PUBLIC cxx_std_14)
set_target_properties(lock_unlock_test PROPERTIES CUDA_ARCHITECTURES OFF)
TARGET_LINK_LIBRARIES(lock_unlock_test gtest_main)

add_executable(dual_bucket_test tests/dual_bucket_test.cc.cu)
target_compile_features(dual_bucket_test PUBLIC cxx_std_14)
set_target_properties(dual_bucket_test PROPERTIES CUDA_ARCHITECTURES OFF)
TARGET_LINK_LIBRARIES(dual_bucket_test gtest_main)

# Benchmark: standalone executable, not linked against gtest_main.
add_executable(dual_bucket_benchmark benchmark/dual_bucket_benchmark.cc.cu)
target_compile_features(dual_bucket_benchmark PUBLIC cxx_std_14)
set_target_properties(dual_bucket_benchmark PROPERTIES CUDA_ARCHITECTURES OFF)

================================================ FILE: CONTRIBUTING.md ================================================ # Contributing ## About HierarchicalKV HierarchicalKV is a part of NVIDIA Merlin and provides hierarchical key-value storage to meet RecSys requirements. 
The key capability of HierarchicalKV is to store key-value (feature-embedding) on high-bandwidth memory (HBM) of GPUs and in host memory. You can also use the library for generic key-value storage. ## Maintainership HierarchicalKV is co-maintained by [NVIDIA Merlin Team](https://github.com/NVIDIA-Merlin) and NVIDIA product end-users, and is also open to public contributions, bug fixes, and documentation. This project adheres to NVIDIA's Code of Conduct. ## Contributing We’re grateful for your interest in HierarchicalKV and value your contributions. We welcome contributions via pull requests (PRs). Before sending out a pull request for a significant change to the end-user API, we recommend you open an issue and discuss your proposed change. Some changes may require a design review. All submissions require review by project reviewers. ### Coding Style Refer to the [Style Guide](https://github.com/NVIDIA-Merlin/HierarchicalKV/blob/master/STYLE_GUIDE.md) ### Additional Requirements In addition to the above requirements, contributions also need to meet the following criteria: * The change needs to include unit tests and integration tests if any. * Each PR needs to provide necessary documentation for when and how to use it. ## Community * HierarchicalKV code (https://github.com/NVIDIA-Merlin/HierarchicalKV) ## License Apache License 2.0 ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. 
"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. 
"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the 
Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. 
To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "{}" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright 2022 NVIDIA Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: README.md ================================================ # [NVIDIA HierarchicalKV(Beta)](https://github.com/NVIDIA-Merlin/HierarchicalKV) [![Version](https://img.shields.io/github/v/release/NVIDIA-Merlin/HierarchicalKV?color=orange&include_prereleases)](https://github.com/NVIDIA-Merlin/HierarchicalKV/releases) [![GitHub License](https://img.shields.io/github/license/NVIDIA-Merlin/HierarchicalKV)](https://github.com/NVIDIA-Merlin/HierarchicalKV/blob/master/LICENSE) [![Documentation](https://img.shields.io/badge/documentation-blue.svg)](https://nvidia-merlin.github.io/HierarchicalKV/master/README.html) ## About HierarchicalKV HierarchicalKV is a part of NVIDIA Merlin and provides hierarchical key-value storage to meet RecSys requirements. The key capability of HierarchicalKV is to store key-value (feature-embedding) on high-bandwidth memory (HBM) of GPUs and in host memory. 
You can also use the library for generic key-value storage. ## Benefits When building large recommender systems, machine learning (ML) engineers face the following challenges: - GPUs are needed, but HBM on a single GPU is too small for the large DLRMs that scale to several terabytes. - Improving communication performance is getting more difficult in larger and larger CPU clusters. - It is difficult to efficiently control consumption growth of limited HBM with customized strategies. - Most generic key-value libraries provide low HBM and host memory utilization. HierarchicalKV alleviates these challenges and helps the machine learning engineers in RecSys with the following benefits: - Supports training large RecSys models on **HBM and host memory** at the same time. - Provides better performance by **fully bypassing CPUs** and reducing the communication workload. - Implements table-size restraint strategies that are based on **LRU or customized strategies**. The strategies are implemented by CUDA kernels. - Operates at a high working-status load factor that is close to 1.0. ## Key ideas - Buckets are locally ordered - Store keys and values separately - Store all the keys in HBM - Built-in and customizable eviction strategy HierarchicalKV makes NVIDIA GPUs more suitable for training large and super-large models of ***search, recommendations, and advertising***. The library simplifies the common challenges to building, evaluating, and serving sophisticated recommender models. 
## API Documentation The main classes and structs are below, but reading the comments in the source code is recommended: - [`class HashTable`](https://github.com/NVIDIA-Merlin/HierarchicalKV/blob/master/include/merlin_hashtable.cuh#L151) - [`class EvictStrategy`](https://github.com/NVIDIA-Merlin/HierarchicalKV/blob/master/include/merlin_hashtable.cuh#L52) - [`struct HashTableOptions`](https://github.com/NVIDIA-Merlin/HierarchicalKV/blob/master/include/merlin_hashtable.cuh#L60) For regular API doc, please refer to [API Docs](https://nvidia-merlin.github.io/HierarchicalKV/master/api/index.html) ### API Maturity Matrix `industry-validated` means the API has been well-tested and verified in at least one real-world scenario. | Name | Description | Function | |:---------------------|:-------------------------------------------------------------------------------------------------------------------------|:-------------------| | __insert_or_assign__ | Insert or assign for the specified keys.
Overwrite one key with minimum score when bucket is full. | industry-validated | | __insert_and_evict__ | Insert new keys, and evict keys with minimum score when bucket is full. | industry-validated | | __find_or_insert__ | Search for the specified keys, and insert them when missed. | well-tested | | __assign__ | Update for each key and bypass when missed. | well-tested | | __accum_or_assign__ | Search and update for each key. If found, add value as a delta to the original value.
If missed, update it directly. | well-tested | | __find_or_insert\*__ | Search for the specified keys and return the pointers of values. Insert them first when missing. | well-tested | | __find__ | Search for the specified keys. | industry-validated | | __find\*__ | Search and return the pointers of values, thread-unsafe but with high performance. | well-tested | | __export_batch__ | Exports a certain number of the key-value-score tuples. | industry-validated | | __export_batch_if__ | Exports a certain number of the key-value-score tuples which match specific conditions. | industry-validated | | __warmup__ | Move the hot key-values from HMEM to HBM | June 15, 2023 | ### Evict Strategy The `score` defines the importance of each key: the larger the score, the more important the key and the less likely it is to be evicted. Eviction only happens when a bucket is full. The `score_type` must be `uint64_t`. For more detail, please refer to [`class EvictStrategy`](https://github.com/NVIDIA-Merlin/HierarchicalKV/blob/master/include/merlin_hashtable.cuh#L52). | Name | Definition of `Score` | |:---------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | __Lru__ | Device clock in nanoseconds, which could differ slightly from host clock. | | __Lfu__ | Frequency increment provided by caller via the input parameter of `scores` of `insert-like` APIs as the increment of frequency. | | __EpochLru__ | The high 32bits is the global epoch provided via the input parameter of `global_epoch`,
the low 32bits is equal to `(device_clock >> 20) & 0xffffffff` with granularity close to 1 ms. | | __EpochLfu__ | The high 32bits is the global epoch provided via the input parameter of `global_epoch`,
the low 32bits is the frequency,
the frequency will keep constant after reaching the max value of `0xffffffff`. | | __Customized__ | Fully provided by the caller via the input parameter of `scores` of `insert-like` APIs. | * __Note__: - The `insert-like` APIs mean the APIs of `insert_or_assign`, `insert_and_evict`, `find_or_insert`, `accum_or_assign`, and `find_or_insert*`. - The `global_epoch` should be maintained by the caller and passed in as the input parameter of `insert-like` APIs. ### Configuration Options It's recommended to keep the default configuration for the options ending with `*`. | Name | Type | Default | Description | |:---------------------------|:-------|:--------|:------------------------------------------------------| | __init_capacity__ | size_t | 0 | The initial capacity of the hash table. | | __max_capacity__ | size_t | 0 | The maximum capacity of the hash table. | | __max_hbm_for_vectors__ | size_t | 0 | The maximum HBM for vectors, in bytes. | | __dim__ | size_t | 64 | The dimension of the value vectors. | | __max_bucket_size*__ | size_t | 128 | The length of each bucket. | | __max_load_factor*__ | float | 0.5f | The max load factor before rehashing. | | __block_size*__ | int | 128 | The default block size for CUDA kernels. | | __io_block_size*__ | int | 1024 | The block size for IO CUDA kernels. | | __device_id*__ | int | -1 | The ID of device. Managed internally when set to `-1` | | __io_by_cpu*__ | bool | false | The flag indicating if the CPU handles IO. | | __reserved_key_start_bit__ | int | 0 | The start bit offset of the reserved keys in the 64-bit key | - For more details, refer to [`struct HashTableOptions`](https://github.com/NVIDIA-Merlin/HierarchicalKV/blob/master/include/merlin_hashtable.cuh#L60). #### Reserved Keys - By default, the keys of `0xFFFFFFFFFFFFFFFD`, `0xFFFFFFFFFFFFFFFE`, and `0xFFFFFFFFFFFFFFFF` are reserved for internal use. Change `options.reserved_key_start_bit` if you want to use the above keys. `reserved_key_start_bit` has a valid range from 0 to 62. 
The default value is 0, which yields the default reserved keys listed above. When `reserved_key_start_bit` is set to any value other than 0, the least significant bit (bit 0) is always `0` for any reserved key. - Setting `reserved_key_start_bit = 1`: - This setting reserves bits 1 and 2 for the reserved keys. - In binary, the last four bits range from `1000` to `1110`. Here, the least significant bit (bit 0) is always `0`, and bits from 3 to 63 are set to `1`. - The new reserved keys in hexadecimal representation are as follows: - `0xFFFFFFFFFFFFFFFE` - `0xFFFFFFFFFFFFFFFC` - `0xFFFFFFFFFFFFFFF8` - `0xFFFFFFFFFFFFFFFA` - Setting `reserved_key_start_bit = 2`: - This configuration reserves bits 2 and 3 as reserved keys. - The binary representation for the last five bits ranges from `10010` to `11110`, with the least significant bit (bit 0) always set to `0`, and bits from 4 to 63 are set to `1`. - If you change `reserved_key_start_bit`, you should use the same value for save/load. For more detail, please refer to [`init_reserved_keys`](https://github.com/search?q=repo%3ANVIDIA-Merlin%2FHierarchicalKV%20init_reserved_keys&type=code) ### How to use: ```cpp #include "merlin_hashtable.cuh" using TableOptions = nv::merlin::HashTableOptions; using EvictStrategy = nv::merlin::EvictStrategy; int main(int argc, char *argv[]) { using K = uint64_t; using V = float; using S = uint64_t; // 1. Define the table and use LRU eviction strategy. using HKVTable = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>; std::unique_ptr<HKVTable> table = std::make_unique<HKVTable>(); // 2. Define the configuration options. TableOptions options; options.init_capacity = 16 * 1024 * 1024; options.max_capacity = options.init_capacity; options.dim = 16; options.max_hbm_for_vectors = nv::merlin::GB(16); // 3. Initialize the table memory resource. table->init(options); // 4. Use table to do something. return 0; } ``` ### Usage restrictions - The `key_type` must be `int64_t` or `uint64_t`. - The `score_type` must be `uint64_t`. 
## Contributors HierarchicalKV is co-maintained by [NVIDIA Merlin Team](https://github.com/NVIDIA-Merlin) and NVIDIA product end-users, and is also open for public contributions, bug fixes, and documentation. [[Contribute](CONTRIBUTING.md)] ## How to build Basically, HierarchicalKV is a header-only library; the commands below only create binaries for benchmark and unit testing. Your environment must meet the following requirements: - CUDA version >= 11.2 - NVIDIA GPU with compute capability 8.0, 8.6, 8.7 or 9.0 - GCC that supports the `C++17` standard or later. - Bazel version >= 3.7.2 (Bazel compile only) ### with cmake ```shell git clone --recursive https://github.com/NVIDIA-Merlin/HierarchicalKV.git cd HierarchicalKV && mkdir -p build && cd build cmake -DCMAKE_BUILD_TYPE=Release -Dsm=80 .. && make -j ``` For Debug: ```shell cmake -DCMAKE_BUILD_TYPE=Debug -Dsm=80 .. && make -j ``` For Benchmark: ```shell ./merlin_hashtable_benchmark ``` For Unit Test: ```shell ./merlin_hashtable_test ``` ### with bazel - DON'T use the option of `--recursive` for `git clone`. - Please modify the environment variables in the `.bazelrc` file in advance if using the customized docker images. - The docker images maintained on `nvcr.io/nvidia/tensorflow` are highly recommended. Pull the docker image: ```shell docker pull nvcr.io/nvidia/tensorflow:22.09-tf2-py3 docker run --gpus all -it --rm nvcr.io/nvidia/tensorflow:22.09-tf2-py3 ``` Compile in docker container: ```shell git clone https://github.com/NVIDIA-Merlin/HierarchicalKV.git cd HierarchicalKV && bash bazel_build.sh ``` For Benchmark: ```shell ./benchmark_util ``` ## Benchmark & Performance(W.I.P) * GPU: 1 x NVIDIA A100 80GB PCIe: 8.0 * Key Type = uint64_t * Value Type = float32 * {dim} * Key-Values per OP = 1048576 * Evict strategy: LRU * `λ`: load factor * `find*` means the `find` API that directly returns the addresses of values. * `find_or_insert*` means the `find_or_insert` API that directly returns the addresses of values. 
* ***Throughput Unit: Billion-KV/second*** ### On pure HBM mode: * dim = 8, capacity = 128 Million-KV, HBM = 4 GB, HMEM = 0 GB | λ | insert_or_assign | find | find_or_insert | assign | find* | find_or_insert* | insert_and_evict | |-----:|-----------------:|-------:|---------------:|-------:|-------:|----------------:|-----------------:| | 0.50 | 1.093 | 2.470 | 1.478 | 1.770 | 3.726 | 1.447 | 1.075 | | 0.75 | 1.045 | 2.452 | 1.335 | 1.807 | 3.374 | 1.309 | 1.013 | | 1.00 | 0.655 | 2.481 | 0.612 | 1.815 | 1.865 | 0.619 | 0.511 | | λ | export_batch | export_batch_if | contains | |-----:|-------------:|----------------:|---------:| | 0.50 | 2.087 | 12.258 | 3.121 | | 0.75 | 2.045 | 12.447 | 3.094 | | 1.00 | 1.950 | 2.657 | 3.096 | * dim = 32, capacity = 128 Million-KV, HBM = 16 GB, HMEM = 0 GB | λ | insert_or_assign | find | find_or_insert | assign | find* | find_or_insert* | insert_and_evict | |-----:|-----------------:|-------:|---------------:|-------:|-------:|----------------:|-----------------:| | 0.50 | 0.961 | 2.272 | 1.278 | 1.706 | 3.718 | 1.435 | 0.931 | | 0.75 | 0.930 | 2.238 | 1.177 | 1.693 | 3.369 | 1.316 | 0.866 | | 1.00 | 0.646 | 2.321 | 0.572 | 1.783 | 1.873 | 0.618 | 0.469 | | λ | export_batch | export_batch_if | contains | |-----:|-------------:|----------------:|---------:| | 0.50 | 0.692 | 10.784 | 3.100 | | 0.75 | 0.569 | 10.240 | 3.075 | | 1.00 | 0.551 | 0.765 | 3.096 | * dim = 64, capacity = 64 Million-KV, HBM = 16 GB, HMEM = 0 GB | λ | insert_or_assign | find | find_or_insert | assign | find* | find_or_insert* | insert_and_evict | |-----:|-----------------:|-------:|---------------:|-------:|-------:|----------------:|-----------------:| | 0.50 | 0.834 | 1.982 | 1.113 | 1.499 | 3.950 | 1.502 | 0.805 | | 0.75 | 0.801 | 1.951 | 1.033 | 1.493 | 3.545 | 1.359 | 0.773 | | 1.00 | 0.621 | 2.021 | 0.608 | 1.541 | 1.965 | 0.613 | 0.481 | | λ | export_batch | export_batch_if | contains | |-----:|-------------:|----------------:|---------:| | 0.50 | 
0.316 | 8.199 | 3.239 | | 0.75 | 0.296 | 8.549 | 3.198 | | 1.00 | 0.288 | 0.395 | 3.225 | ### On HBM+HMEM hybrid mode: * dim = 64, capacity = 128 Million-KV, HBM = 16 GB, HMEM = 16 GB | λ | insert_or_assign | find | find_or_insert | assign | find* | find_or_insert* | |-----:|-----------------:|-------:|---------------:|-------:|-------:|----------------:| | 0.50 | 0.083 | 0.124 | 0.109 | 0.131 | 3.705 | 1.435 | | 0.75 | 0.083 | 0.122 | 0.111 | 0.129 | 3.221 | 1.274 | | 1.00 | 0.073 | 0.123 | 0.095 | 0.126 | 1.854 | 0.617 | | λ | export_batch | export_batch_if | contains | |-----:|-------------:|----------------:|---------:| | 0.50 | 0.318 | 8.086 | 3.122 | | 0.75 | 0.294 | 5.549 | 3.111 | | 1.00 | 0.287 | 0.393 | 3.075 | * dim = 64, capacity = 512 Million-KV, HBM = 32 GB, HMEM = 96 GB | λ | insert_or_assign | find | find_or_insert | assign | find* | find_or_insert* | |-----:|-----------------:|-------:|---------------:|-------:|-------:|----------------:| | 0.50 | 0.049 | 0.069 | 0.049 | 0.069 | 3.484 | 1.370 | | 0.75 | 0.049 | 0.069 | 0.049 | 0.069 | 3.116 | 1.242 | | 1.00 | 0.047 | 0.072 | 0.047 | 0.070 | 1.771 | 0.607 | | λ | export_batch | export_batch_if | contains | |-----:|-------------:|----------------:|---------:| | 0.50 | 0.316 | 8.181 | 3.073 | | 0.75 | 0.293 | 8.950 | 3.052 | | 1.00 | 0.292 | 0.394 | 3.026 | ### Support and Feedback: If you encounter any issues or have questions, go to [https://github.com/NVIDIA-Merlin/HierarchicalKV/issues](https://github.com/NVIDIA-Merlin/HierarchicalKV/issues) and submit an issue so that we can provide you with the necessary resolutions and answers. ### Acknowledgment We are very grateful to external initial contributors [@Zhangyafei](https://github.com/zhangyafeikimi) and [@Lifan](https://github.com/Lifann) for their design, coding, and review work. 
### License Apache License 2.0 ================================================ FILE: STYLE_GUIDE.md ================================================ #### C++ C++ code should conform to [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html). HierarchicalKV uses [clang-format](https://clang.llvm.org/docs/ClangFormat.html) to check your C/C++ changes. Sometimes you have some manually formatted code that you don’t want clang-format to touch. You can disable formatting like this: ```cpp int formatted_code; // clang-format off void unformatted_code ; // clang-format on void formatted_code_again; ``` Install Clang-format (the version 18.1.3 is required) for Ubuntu: ```bash sudo apt install clang-format-18 ``` format all with: ```bash find ./ \( -path ./tests/googletest -prune \) -o \( -iname *.h -o -iname *.cpp -o -iname *.cc -o -iname *.cu -o -iname *.cuh -o -iname *.hpp \) -print | xargs clang-format-18 -i --style=file ``` ================================================ FILE: WORKSPACE ================================================ workspace(name = "HierarchicalKV") load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") load("//build_deps/gpus:configure.bzl", "cuda_configure") http_archive( name = "bazel_skylib", sha256 = "1dde365491125a3db70731e25658dfdd3bc5dbdfd11b840b3e987ecf043c7ca0", urls = [ "https://github.com/bazelbuild/bazel-skylib/releases/download/0.9.0/bazel_skylib-0.9.0.tar.gz", ], ) cuda_configure(name = "local_config_cuda") ================================================ FILE: bazel_build.sh ================================================ #!/bin/bash # Usage : `./bazel_build.sh` or `bash bazel_build.sh` set -e export $(cat .bazeliskrc | xargs) bazel build --config=cuda //... 
================================================ FILE: benchmark/BUILD ================================================ load("@local_config_cuda//cuda:build_defs.bzl", "cuda_cc_library") cc_binary( name = "benchmark_util", deps = [ ":benchmark_lib", ], ) cuda_cc_library( name = "benchmark_lib", srcs = [ "merlin_hashtable_benchmark.cc.cu", ], hdrs = [ "benchmark_util.cuh", ], copts = ["-Iinclude/"], linkopts = ["-pthread"], deps = [ "//include:merlin_hashtable", "@local_config_cuda//cuda", ], ) ================================================ FILE: benchmark/benchmark_util.cuh ================================================ /* * Copyright (c) 2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ #pragma once #include #include #include #include "merlin/utils.cuh" namespace benchmark { enum class TimeUnit { Second = 0, MilliSecond = 3, MicroSecond = 6, NanoSecond = 9, }; enum class API_Select { find = 0, insert_or_assign = 1, find_or_insert = 2, assign = 3, insert_and_evict = 4, find_ptr = 5, find_or_insert_ptr = 6, export_batch = 7, export_batch_if = 8, contains = 9, }; enum class Hit_Mode { random = 0, last_insert = 1, }; template struct Timer { explicit Timer(TimeUnit tu = TimeUnit::Second) : tu_(tu) {} void start() { startRecord = std::chrono::steady_clock::now(); } void end() { endRecord = std::chrono::steady_clock::now(); } Rep getResult() { auto duration_ = std::chrono::duration_cast( endRecord - startRecord); auto pow_ = static_cast(tu_) - static_cast(TimeUnit::NanoSecond); auto factor = static_cast(std::pow(10, pow_)); return static_cast(duration_.count()) * factor; } private: TimeUnit tu_; std::chrono::time_point startRecord{}; std::chrono::time_point endRecord{}; }; // RAII Timer using CUDA Event template struct KernelTimer { explicit KernelTimer(TimeUnit tu = TimeUnit::Second) : tu_(tu) { CUDA_CHECK(cudaEventCreate(&start_)); CUDA_CHECK(cudaEventCreate(&end_)); } ~KernelTimer() { CUDA_CHECK(cudaEventDestroy(start_)); CUDA_CHECK(cudaEventDestroy(end_)); } void start() { CUDA_CHECK(cudaEventRecord(start_)); } void end() { CUDA_CHECK(cudaEventRecord(end_)); CUDA_CHECK(cudaEventSynchronize(end_)); CUDA_CHECK(cudaEventElapsedTime(&time, start_, end_)); } Rep getResult() { auto pow_ = static_cast(tu_) - static_cast(TimeUnit::MilliSecond); auto factor = static_cast(std::pow(10, pow_)); return static_cast(time * factor); } private: TimeUnit tu_; float time{-1.0f}; cudaEvent_t start_; cudaEvent_t end_; }; inline uint64_t getTimestamp() { return std::chrono::duration_cast( std::chrono::system_clock::now().time_since_epoch()) .count(); } template void create_continuous_keys(K* h_keys, S* h_scores, const int key_num_per_op, const K start = 0, int 
freq_range = 1000) { for (K i = 0; i < key_num_per_op; i++) { h_keys[i] = start + static_cast(i); if (h_scores != nullptr) h_scores[i] = h_keys[i] % freq_range; } } template void create_random_keys(K* h_keys, S* h_scores, const int key_num_per_op) { std::unordered_set numbers; std::random_device rd; std::mt19937_64 eng(rd()); std::uniform_int_distribution distr; int i = 0; while (numbers.size() < key_num_per_op) { numbers.insert(distr(eng)); } for (const K num : numbers) { h_keys[i] = num; if (h_scores != nullptr) h_scores[i] = getTimestamp(); i++; } } template void create_keys_for_hitrate(K* h_keys, S* h_scores, const int key_num_per_op, const float hitrate = 0.6f, const Hit_Mode hit_mode = Hit_Mode::last_insert, const K end = 0, const bool reset = false, int freq_range = 1000) { int divide = static_cast(key_num_per_op * hitrate); if (Hit_Mode::random == hit_mode) { std::random_device rd; std::mt19937_64 eng(rd()); K existed_max = end == 0 ? 1 : (end - 1); std::uniform_int_distribution distr(0, existed_max); if (existed_max < divide) { std::cout << "# Can not generate enough keys for hit!"; exit(-1); } std::unordered_set numbers; while (numbers.size() < divide) { numbers.insert(distr(eng)); } int i = 0; for (auto existed_value : numbers) { h_keys[i] = existed_value; if (h_scores != nullptr) h_scores[i] = h_keys[i] % freq_range; i++; } } else { // else keep its original value, but update scores for (int i = 0; i < divide; i++) { if (h_scores != nullptr) h_scores[i] = getTimestamp() % freq_range; } } static K new_value = std::numeric_limits::max(); if (reset) { new_value = std::numeric_limits::max(); } for (int i = divide; i < key_num_per_op; i++) { h_keys[i] = new_value--; if (h_scores != nullptr) h_scores[i] = getTimestamp() % freq_range; } } template void refresh_scores(S* h_scores, const int key_num_per_op) { for (int i = 0; i < key_num_per_op; i++) { h_scores[i] = getTimestamp(); } } template void init_value_using_key(K* h_keys, V* h_vectors, const int 
key_num_per_op, int dim) { for (size_t i = 0; i < key_num_per_op; i++) { for (size_t j = 0; j < dim; j++) { h_vectors[i * dim + j] = static_cast(h_keys[i] * 0.00001); } } } template __global__ void read_from_ptr_kernel(const V* const* __restrict src, V* __restrict dst, const size_t dim, size_t N) { size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) { int vec_index = int(t / dim); int dim_index = t % dim; if (src[vec_index]) { dst[vec_index * dim + dim_index] = src[vec_index][dim_index]; } } } template void read_from_ptr(const V* const* __restrict src, V* __restrict dst, const size_t dim, size_t n, cudaStream_t stream) { const size_t block_size = 1024; const size_t N = n * dim; const size_t grid_size = nv::merlin::SAFE_GET_GRID_SIZE(N, block_size); read_from_ptr_kernel <<>>(src, dst, dim, N); } template __global__ void array2ptr_kernel(V** ptr, V* __restrict array, const size_t dim, size_t N) { size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) { int vec_index = int(t); ptr[vec_index] = array + vec_index * dim; } } template void array2ptr(V** ptr, V* __restrict array, const size_t dim, size_t n, cudaStream_t stream) { const size_t block_size = 1024; const size_t N = n; const size_t grid_size = nv::merlin::SAFE_GET_GRID_SIZE(N, block_size); array2ptr_kernel<<>>(ptr, array, dim, N); } template __global__ void host_nano_kernel(S* d_clk) { S mclk; asm volatile("mov.u64 %0,%%globaltimer;" : "=l"(mclk)); *d_clk = mclk; } template S host_nano(cudaStream_t stream = 0) { S h_clk = 0; S* d_clk; CUDA_CHECK(cudaMalloc((void**)&(d_clk), sizeof(S))); host_nano_kernel<<<1, 1, 0, stream>>>(d_clk); CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaMemcpy(&h_clk, d_clk, sizeof(S), cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaFree(d_clk)); return h_clk; } template struct ExportIfPredFunctor { __forceinline__ __device__ bool operator()(const K& key, S& score, 
const K& pattern, const S& threshold) { return score > threshold; } }; } // namespace benchmark ================================================ FILE: benchmark/dual_bucket_benchmark.cc.cu ================================================ /* * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include #include #include #include #include #include #include "merlin_hashtable.cuh" using K = uint64_t; using V = float; using S = uint64_t; using TableOptions = nv::merlin::HashTableOptions; using TableMode = nv::merlin::TableMode; using EvictStrategy = nv::merlin::EvictStrategy; template double benchmark_insert(Table& table, size_t n, K* d_keys, V* d_values, S* d_scores, cudaStream_t stream) { CUDA_CHECK(cudaStreamSynchronize(stream)); auto start = std::chrono::high_resolution_clock::now(); table.insert_or_assign(n, d_keys, d_values, d_scores, stream, true); CUDA_CHECK(cudaStreamSynchronize(stream)); auto end = std::chrono::high_resolution_clock::now(); double ms = std::chrono::duration_cast(end - start) .count() / 1000.0; return static_cast(n) / ms / 1000.0; // Mops/s } template double benchmark_find(Table& table, size_t n, K* d_keys, V* d_values, bool* d_founds, cudaStream_t stream) { CUDA_CHECK(cudaStreamSynchronize(stream)); auto start = std::chrono::high_resolution_clock::now(); table.find(n, d_keys, d_values, d_founds, nullptr, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); auto end = std::chrono::high_resolution_clock::now(); double ms = 
std::chrono::duration_cast(end - start) .count() / 1000.0; return static_cast(n) / ms / 1000.0; // Mops/s } void run_benchmark(size_t capacity, size_t dim, TableMode mode, const char* mode_name) { using Table = nv::merlin::HashTable; Table table; TableOptions options; options.init_capacity = capacity; options.max_capacity = capacity; options.max_hbm_for_vectors = 0; options.dim = dim; options.max_bucket_size = 128; options.table_mode = mode; table.init(options); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); // Generate keys. size_t max_n = capacity; std::vector h_keys(max_n); std::vector h_values(max_n * dim, 1.0f); std::vector h_scores(max_n); std::iota(h_keys.begin(), h_keys.end(), 1); for (size_t i = 0; i < max_n; i++) h_scores[i] = i + 1; K* d_keys; V* d_values; S* d_scores; bool* d_founds; V* d_found_values; CUDA_CHECK(cudaMalloc(&d_keys, max_n * sizeof(K))); CUDA_CHECK(cudaMalloc(&d_values, max_n * dim * sizeof(V))); CUDA_CHECK(cudaMalloc(&d_scores, max_n * sizeof(S))); CUDA_CHECK(cudaMalloc(&d_founds, max_n * sizeof(bool))); CUDA_CHECK(cudaMalloc(&d_found_values, max_n * dim * sizeof(V))); CUDA_CHECK(cudaMemcpy(d_keys, h_keys.data(), max_n * sizeof(K), cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(d_values, h_values.data(), max_n * dim * sizeof(V), cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(d_scores, h_scores.data(), max_n * sizeof(S), cudaMemcpyHostToDevice)); printf("--- %s (capacity=%zuK, dim=%zu) ---\n", mode_name, capacity / 1024, dim); printf(" %-12s %-18s %-18s\n", "Load Factor", "Insert (Mops/s)", "Find (Mops/s)"); float load_factors[] = {0.25f, 0.50f, 0.75f, 0.90f, 0.95f, 1.00f}; size_t prev_n = 0; for (float lf : load_factors) { size_t target_n = static_cast(capacity * lf); if (target_n > max_n) break; size_t batch_n = target_n - prev_n; if (batch_n == 0) continue; // Insert to reach target load factor. 
double insert_mops = benchmark_insert(table, batch_n, d_keys + prev_n, d_values + prev_n * dim, d_scores + prev_n, stream); // Find all inserted keys. double find_mops = benchmark_find(table, target_n, d_keys, d_found_values, d_founds, stream); printf(" %-12.2f %-18.1f %-18.1f\n", lf, insert_mops, find_mops); prev_n = target_n; } // Memory efficiency: first eviction LF. // (Already covered in test, report here too.) size_t table_size = table.size(stream); CUDA_CHECK(cudaStreamSynchronize(stream)); printf(" Final size: %zu / %zu (LF=%.4f)\n", table_size, capacity, static_cast(table_size) / capacity); CUDA_CHECK(cudaFree(d_keys)); CUDA_CHECK(cudaFree(d_values)); CUDA_CHECK(cudaFree(d_scores)); CUDA_CHECK(cudaFree(d_founds)); CUDA_CHECK(cudaFree(d_found_values)); CUDA_CHECK(cudaStreamDestroy(stream)); } int main(int argc, char** argv) { printf("=== Dual-Bucket Benchmark Results ===\n\n"); // Default: 1M capacity, dim=64. size_t capacity = 128 * 1024 * 8; // ~1M size_t dim = 64; if (argc > 1) capacity = static_cast(atol(argv[1])); if (argc > 2) dim = static_cast(atol(argv[2])); run_benchmark(capacity, dim, TableMode::kThroughput, "THROUGHPUT_MODE"); printf("\n"); run_benchmark(capacity, dim, TableMode::kMemory, "MEMORY_MODE"); printf("\n"); printf("=== Benchmark Complete ===\n"); return 0; } ================================================ FILE: benchmark/find_with_missed_keys_benchmark.cc.cu ================================================ /* * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "benchmark_util.cuh" #include "merlin_hashtable.cuh" using K = uint64_t; using V = float; using S = uint64_t; using EvictStrategy = nv::merlin::EvictStrategy; using TableOptions = nv::merlin::HashTableOptions; using Table = nv::merlin::HashTable; void print_tile() { std::cout << std::endl << "| \u03BB " << "| capacity " << "| max_hbm_for_vectors " << "| max_bucket_size " << "| dim " << "| missed_ratio " << "| througput(BillionKV/secs) "; std::cout << "|\n"; //<< "| load_factor " std::cout << "|------" //<< "| capacity " << "|----------" //<< "| max_hbm_for_vectors " << "|---------------------" //<< "| max_bucket_size " << "|-----------------" //<< "| dim " << "|-----" //<< "| missed_ratio " << "|--------------" //<< "| througput(BillionKV/secs) " << "|---------------------------"; std::cout << "|\n"; } template void print_w(const T& t, size_t width) { std::cout << "|" << std::setw(width) << t; } void print_result(double load_factor, size_t capacity, size_t max_hbm_for_vectors, size_t max_bucket_size, size_t dim, double missed_ratio, float througput) { print_w(load_factor, 6); print_w(capacity, 10); print_w(max_hbm_for_vectors, 21); print_w(max_bucket_size, 17); print_w(dim, 5); print_w(missed_ratio, 14); print_w(througput, 27); std::cout << "|\n"; } void test_find(size_t capacity, size_t dim, size_t max_hbm_for_vectors, double load_factor, size_t max_bucket_size, double missed_ratio) { MERLIN_CHECK(load_factor >= 0.0 && load_factor <= 1.0, "Invalid `load_factor`"); K* h_keys; S* h_scores; V* h_vectors; TableOptions options; options.init_capacity = capacity; options.max_capacity = capacity; options.dim = dim; options.max_hbm_for_vectors = nv::merlin::MB(max_hbm_for_vectors); options.max_bucket_size = max_bucket_size; size_t 
key_num = capacity; CUDA_CHECK(cudaMallocHost(&h_keys, key_num * sizeof(K))); CUDA_CHECK(cudaMallocHost(&h_scores, key_num * sizeof(S))); CUDA_CHECK(cudaMallocHost(&h_vectors, key_num * options.dim * sizeof(V))); K* d_keys; S* d_scores; V* d_vectors; K* d_missed_keys; int* d_missed_indices; int* d_missed_size; CUDA_CHECK(cudaMalloc(&d_keys, key_num * sizeof(K))); CUDA_CHECK(cudaMalloc(&d_scores, key_num * sizeof(S))); CUDA_CHECK(cudaMalloc(&d_vectors, key_num * sizeof(V) * options.dim)); CUDA_CHECK(cudaMalloc(&d_missed_keys, key_num * sizeof(K))); CUDA_CHECK(cudaMalloc(&d_missed_indices, key_num * sizeof(int))); CUDA_CHECK(cudaMalloc(&d_missed_size, sizeof(int))); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); // insert key-value size_t insert_num = (double)key_num * load_factor; benchmark::create_continuous_keys(h_keys, h_scores, insert_num, 0 /*start*/); benchmark::init_value_using_key(h_keys, h_vectors, insert_num, options.dim); CUDA_CHECK(cudaMemcpy(d_keys, h_keys, insert_num * sizeof(K), cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(d_scores, h_scores, insert_num * sizeof(S), cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors, insert_num * sizeof(V) * options.dim, cudaMemcpyHostToDevice)); Table table; table.init(options); table.insert_or_assign(insert_num, d_keys, d_vectors, d_scores, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); // find key-value size_t find_num = (double)insert_num * (1.0 - missed_ratio); benchmark::create_continuous_keys(h_keys, nullptr, find_num, 0 /*start*/); benchmark::create_continuous_keys( h_keys + find_num, nullptr, insert_num - find_num, insert_num /*start*/); CUDA_CHECK(cudaMemcpy(d_keys, h_keys, insert_num * sizeof(K), cudaMemcpyHostToDevice)); auto timer = benchmark::Timer(); timer.start(); table.find(insert_num, d_keys, d_vectors, d_missed_keys, d_missed_indices, d_missed_size, d_scores, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); timer.end(); CUDA_CHECK(cudaFreeHost(h_keys)); 
CUDA_CHECK(cudaFreeHost(h_scores)); CUDA_CHECK(cudaFreeHost(h_vectors)); CUDA_CHECK(cudaFree(d_keys)); CUDA_CHECK(cudaFree(d_scores)); CUDA_CHECK(cudaFree(d_vectors)); CUDA_CHECK(cudaFree(d_missed_keys)); CUDA_CHECK(cudaFree(d_missed_indices)); CUDA_CHECK(cudaFree(d_missed_size)); CudaCheckError(); float througput = insert_num / timer.getResult() / (1024 * 1024 * 1024.0f); print_result(load_factor, capacity, max_hbm_for_vectors, max_bucket_size, dim, missed_ratio, througput); } void test_main(double load_factor, double missed_ratio) { constexpr size_t CAPACITY = 100000000UL; print_tile(); // pure HBM test_find(CAPACITY, 8, 8 * 1024UL, load_factor, 256, missed_ratio); test_find(CAPACITY, 8, 8 * 1024UL, load_factor, 128, missed_ratio); // hybrid test_find(CAPACITY, 8, 1 * 1024UL, load_factor, 256, missed_ratio); test_find(CAPACITY, 8, 1 * 1024UL, load_factor, 128, missed_ratio); // pure HMEM test_find(CAPACITY, 8, 0, load_factor, 256, missed_ratio); test_find(CAPACITY, 8, 0, load_factor, 128, missed_ratio); } int main() { test_main(0.2, 0); test_main(0.2, 0.5); test_main(0.2, 1.0); test_main(0.5, 0); test_main(0.5, 0.5); test_main(0.5, 1.0); test_main(1.0, 0); test_main(1.0, 0.5); test_main(1.0, 1.0); return 0; } ================================================ FILE: benchmark/merlin_hashtable_benchmark.cc.cu ================================================ /* * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include "benchmark_util.cuh" #include "merlin_hashtable.cuh" using std::cerr; using std::cout; using std::endl; using std::fixed; using std::setfill; using std::setprecision; using std::setw; using namespace nv::merlin; using namespace benchmark; enum class Test_Mode { pure_hbm = 0, hybrid = 1, }; const float EPSILON = 0.001f; std::string rep(int n) { return std::string(n, ' '); } using K = uint64_t; using S = uint64_t; using V = float; using EvictStrategy = nv::merlin::EvictStrategy; using TableOptions = nv::merlin::HashTableOptions; template float test_one_api(std::shared_ptr& table, const API_Select api, const size_t dim, const size_t init_capacity, const size_t key_num_per_op, const float load_factor, const float hitrate = 0.6f) { K* h_keys; S* h_scores; V* h_vectors; bool* h_found; CUDA_CHECK(cudaMallocHost(&h_keys, key_num_per_op * sizeof(K))); CUDA_CHECK(cudaMallocHost(&h_scores, key_num_per_op * sizeof(S))); CUDA_CHECK(cudaMallocHost(&h_vectors, key_num_per_op * sizeof(V) * dim)); CUDA_CHECK(cudaMallocHost(&h_found, key_num_per_op * sizeof(bool))); CUDA_CHECK(cudaMemset(h_vectors, 0, key_num_per_op * sizeof(V) * dim)); bool need_scores = (Table::evict_strategy == EvictStrategy::kLfu || Table::evict_strategy == EvictStrategy::kEpochLfu || Table::evict_strategy == EvictStrategy::kCustomized); K* d_keys; S* d_scores_real; S* d_scores; V* d_vectors; V* d_def_val; V** d_vectors_ptr; bool* d_found; K* d_keys_out; K* d_evict_keys; S* d_evict_scores; CUDA_CHECK(cudaMalloc(&d_keys, key_num_per_op * sizeof(K))); CUDA_CHECK(cudaMalloc(&d_scores_real, key_num_per_op * sizeof(S))); CUDA_CHECK(cudaMalloc(&d_vectors, key_num_per_op * sizeof(V) * dim)); CUDA_CHECK(cudaMalloc(&d_def_val, key_num_per_op * sizeof(V) * dim)); CUDA_CHECK(cudaMalloc(&d_vectors_ptr, key_num_per_op * sizeof(V*))); CUDA_CHECK(cudaMalloc(&d_found, key_num_per_op * sizeof(bool))); 
CUDA_CHECK(cudaMalloc(&d_keys_out, key_num_per_op * sizeof(K))); CUDA_CHECK(cudaMalloc(&d_evict_keys, key_num_per_op * sizeof(K))); CUDA_CHECK(cudaMalloc(&d_evict_scores, key_num_per_op * sizeof(S))); CUDA_CHECK(cudaMemset(d_vectors, 1, key_num_per_op * sizeof(V) * dim)); CUDA_CHECK(cudaMemset(d_def_val, 2, key_num_per_op * sizeof(V) * dim)); CUDA_CHECK(cudaMemset(d_vectors_ptr, 0, key_num_per_op * sizeof(V*))); CUDA_CHECK(cudaMemset(d_found, 0, key_num_per_op * sizeof(bool))); d_scores = need_scores ? d_scores_real : nullptr; cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); // initialize insert // step 1, no need to load load_factor uint64_t key_num_init = static_cast(init_capacity * load_factor); const float target_load_factor = key_num_init * 1.0f / init_capacity; uint64_t key_num_remain = key_num_init % key_num_per_op == 0 ? key_num_per_op : key_num_init % key_num_per_op; int32_t loop_num_init = (key_num_init + key_num_per_op - 1) / key_num_per_op; K start = 0UL; S threshold = benchmark::host_nano(); int global_epoch = 0; for (; global_epoch < loop_num_init; global_epoch++) { table->set_global_epoch(global_epoch); uint64_t key_num_cur_insert = global_epoch == loop_num_init - 1 ? 
key_num_remain : key_num_per_op; create_continuous_keys(h_keys, h_scores, key_num_cur_insert, start); CUDA_CHECK(cudaMemcpy(d_keys, h_keys, key_num_cur_insert * sizeof(K), cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(d_scores_real, h_scores, key_num_cur_insert * sizeof(S), cudaMemcpyHostToDevice)); table->find_or_insert(key_num_cur_insert, d_keys, d_vectors_ptr, d_found, d_scores, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); start += key_num_cur_insert; } // step 2 float real_load_factor = table->load_factor(stream); CUDA_CHECK(cudaStreamSynchronize(stream)); while (target_load_factor - real_load_factor > EPSILON) { auto key_num_append = static_cast( (target_load_factor - real_load_factor) * init_capacity); if (key_num_append <= 0) break; key_num_append = std::min(static_cast(key_num_per_op), key_num_append); create_continuous_keys(h_keys, h_scores, key_num_append, start); CUDA_CHECK(cudaMemcpy(d_keys, h_keys, key_num_append * sizeof(K), cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(d_scores_real, h_scores, key_num_append * sizeof(S), cudaMemcpyHostToDevice)); table->insert_or_assign(key_num_append, d_keys, d_vectors, d_scores, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); start += key_num_append; real_load_factor = table->load_factor(stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } // For trigger the kernel selection in advance. 
int key_num_per_op_warmup = 1; for (int i = 0; i < 9; i++, global_epoch++) { table->set_global_epoch(global_epoch); switch (api) { case API_Select::find: { table->find(key_num_per_op_warmup, d_keys, d_vectors, d_found, d_scores, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); break; } case API_Select::insert_or_assign: { table->insert_or_assign(key_num_per_op_warmup, d_keys, d_vectors, d_scores, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); break; } case API_Select::find_or_insert: { table->find_or_insert(key_num_per_op_warmup, d_keys, d_vectors, d_scores, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); break; } case API_Select::assign: { table->assign(key_num_per_op_warmup, d_keys, d_def_val, d_scores, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); break; } case API_Select::insert_and_evict: { table->insert_and_evict(key_num_per_op_warmup, d_keys, d_vectors, d_scores, d_evict_keys, d_def_val, d_evict_scores, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); break; } case API_Select::find_ptr: { V** d_vectors_ptr = nullptr; CUDA_CHECK( cudaMalloc(&d_vectors_ptr, key_num_per_op_warmup * sizeof(V*))); benchmark::array2ptr(d_vectors_ptr, d_vectors, dim, key_num_per_op_warmup, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); table->find(1, d_keys, d_vectors_ptr, d_found, d_scores, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); benchmark::read_from_ptr(d_vectors_ptr, d_vectors, dim, key_num_per_op_warmup, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(d_vectors_ptr)); break; } case API_Select::find_or_insert_ptr: { V** d_vectors_ptr = nullptr; bool* d_found; CUDA_CHECK(cudaMalloc(&d_found, key_num_per_op_warmup * sizeof(bool))); CUDA_CHECK( cudaMalloc(&d_vectors_ptr, key_num_per_op_warmup * sizeof(V*))); benchmark::array2ptr(d_vectors_ptr, d_vectors, dim, key_num_per_op_warmup, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); table->find_or_insert(key_num_per_op_warmup, d_keys, d_vectors_ptr, d_found, d_scores, 
stream); CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(d_vectors_ptr)); CUDA_CHECK(cudaFree(d_found)); break; } case API_Select::export_batch: { size_t* d_dump_counter = nullptr; CUDA_CHECK(cudaMalloc(&d_dump_counter, sizeof(size_t))); CUDA_CHECK(cudaMemset(d_dump_counter, 0, sizeof(size_t))); table->export_batch(key_num_per_op_warmup, 0, d_dump_counter, d_keys, d_vectors, d_scores, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(d_dump_counter)); break; } case API_Select::export_batch_if: { size_t* d_dump_counter = nullptr; CUDA_CHECK(cudaMalloc(&d_dump_counter, sizeof(size_t))); CUDA_CHECK(cudaMemset(d_dump_counter, 0, sizeof(size_t))); K pattern = 0; table->template export_batch_if( pattern, threshold, key_num_per_op_warmup, 0, d_dump_counter, d_keys, d_vectors, d_scores, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(d_dump_counter)); break; } case API_Select::contains: { table->contains(1, d_keys, d_found, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); break; } default: { std::cout << "[Unsupport API]\n"; } } } create_keys_for_hitrate(h_keys, h_scores, key_num_per_op, hitrate, Hit_Mode::last_insert, start, true /*reset*/); CUDA_CHECK(cudaMemcpy(d_keys, h_keys, key_num_per_op * sizeof(K), cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(d_scores_real, h_scores, key_num_per_op * sizeof(K), cudaMemcpyHostToDevice)); auto timer = benchmark::Timer(); global_epoch++; table->set_global_epoch(global_epoch); switch (api) { case API_Select::find: { timer.start(); table->find(key_num_per_op, d_keys, d_vectors, d_found, d_scores, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); timer.end(); break; } case API_Select::insert_or_assign: { timer.start(); table->insert_or_assign(key_num_per_op, d_keys, d_vectors, d_scores, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); timer.end(); break; } case API_Select::find_or_insert: { timer.start(); table->find_or_insert(key_num_per_op, d_keys, d_vectors, 
d_scores, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); timer.end(); break; } case API_Select::assign: { timer.start(); table->assign(key_num_per_op, d_keys, d_def_val, d_scores, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); timer.end(); break; } case API_Select::insert_and_evict: { timer.start(); table->insert_and_evict(key_num_per_op, d_keys, d_vectors, d_scores, d_evict_keys, d_def_val, d_evict_scores, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); timer.end(); break; } case API_Select::find_ptr: { V** d_vectors_ptr = nullptr; CUDA_CHECK(cudaMalloc(&d_vectors_ptr, key_num_per_op * sizeof(V*))); benchmark::array2ptr(d_vectors_ptr, d_vectors, dim, key_num_per_op, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); timer.start(); table->find(key_num_per_op, d_keys, d_vectors_ptr, d_found, d_scores, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); timer.end(); benchmark::read_from_ptr(d_vectors_ptr, d_vectors, dim, key_num_per_op, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(d_vectors_ptr)); break; } case API_Select::find_or_insert_ptr: { V** d_vectors_ptr = nullptr; bool* d_found; CUDA_CHECK(cudaMalloc(&d_found, key_num_per_op * sizeof(bool))); CUDA_CHECK(cudaMalloc(&d_vectors_ptr, key_num_per_op * sizeof(V*))); benchmark::array2ptr(d_vectors_ptr, d_vectors, dim, key_num_per_op, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); timer.start(); table->find_or_insert(key_num_per_op, d_keys, d_vectors_ptr, d_found, d_scores, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); timer.end(); CUDA_CHECK(cudaFree(d_vectors_ptr)); CUDA_CHECK(cudaFree(d_found)); break; } case API_Select::export_batch: { size_t* d_dump_counter; // Try to export close to but less than `key_num_per_op` data. // It's normal to happen `illegal memory access` error occasionally. 
float safe_ratio = 0.995; CUDA_CHECK(cudaMalloc(&d_dump_counter, sizeof(size_t))); CUDA_CHECK(cudaMemset(d_dump_counter, 0, sizeof(size_t))); timer.start(); table->export_batch(key_num_per_op / target_load_factor * safe_ratio, 0, d_dump_counter, d_keys, d_vectors, d_scores, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); timer.end(); CUDA_CHECK(cudaFree(d_dump_counter)); break; } case API_Select::export_batch_if: { size_t* d_dump_counter; // Try to export close to but less than `key_num_per_op` data. // It's normal to happen `illegal memory access` error occasionally. float safe_ratio = 0.995; CUDA_CHECK(cudaMalloc(&d_dump_counter, sizeof(size_t))); CUDA_CHECK(cudaMemset(d_dump_counter, 0, sizeof(size_t))); timer.start(); K pattern = 0; table->template export_batch_if( pattern, threshold, key_num_per_op / target_load_factor * safe_ratio, 0, d_dump_counter, d_keys, d_vectors, d_scores, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); timer.end(); CUDA_CHECK(cudaFree(d_dump_counter)); break; } case API_Select::contains: { timer.start(); table->contains(key_num_per_op, d_keys, d_found, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); timer.end(); break; } default: { std::cout << "[Unsupport API]\n"; } } CUDA_CHECK(cudaStreamDestroy(stream)); CUDA_CHECK(cudaFreeHost(h_keys)); CUDA_CHECK(cudaFreeHost(h_scores)); CUDA_CHECK(cudaFreeHost(h_found)); CUDA_CHECK(cudaFree(d_keys)); CUDA_CHECK(cudaFree(d_scores_real)); CUDA_CHECK(cudaFree(d_vectors)); CUDA_CHECK(cudaFree(d_def_val)); CUDA_CHECK(cudaFree(d_vectors_ptr)); CUDA_CHECK(cudaFree(d_found)); CUDA_CHECK(cudaFree(d_evict_keys)); CUDA_CHECK(cudaFree(d_evict_scores)); CUDA_CHECK(cudaDeviceSynchronize()); CudaCheckError(); float througput = key_num_per_op / timer.getResult() / (1024 * 1024 * 1024.0f); return througput; } static Test_Mode test_mode = Test_Mode::pure_hbm; void print_title_a() { cout << endl << "| \u03BB " << "| insert_or_assign " << "| find " << "| find_or_insert " << "| assign " << "| find* " << 
"| find_or_insert* "; if (Test_Mode::pure_hbm == test_mode) { cout << "| insert_and_evict "; } cout << "|\n"; //<< "| load_factor " cout << "|-----:" //<< "| insert_or_assign " << "|-----------------:" //<< "| find " << "|-------:" //<< "| find_or_insert " << "|---------------:" //<< "| assign " << "|-------:" //<< "| find* " << "|-------:" //<< "| find_or_insert* " << "|----------------:"; if (Test_Mode::pure_hbm == test_mode) { //<< "| insert_and_evict " cout << "|-----------------:"; } cout << "|\n"; } void print_title_b() { cout << endl << "| \u03BB " << "| export_batch " << "| export_batch_if " << "| contains "; cout << "|\n"; //<< "| load_factor " cout << "|-----:" //<< "| export_batch " << "|-------------:" //<< "| export_batch_if " << "|----------------:" //<< "| contains " << "|----------:"; cout << "|\n"; } void test_main(std::vector& apis, const size_t dim, const size_t init_capacity = 64 * 1024 * 1024UL, const size_t key_num_per_op = 1 * 1024 * 1024UL, const size_t hbm4values = 16, const float load_factor = 1.0f, const bool io_by_cpu = false, const std::vector load_factors = {0.50f, 0.75f, 1.00f}) { size_t free, total; CUDA_CHECK(cudaSetDevice(0)); CUDA_CHECK(cudaMemGetInfo(&free, &total)); if (free / (1 << 30) < hbm4values) { std::cout << "free HBM is not enough, ignore current benchmark!" << std::endl; return; } TableOptions options; options.init_capacity = init_capacity; options.max_capacity = init_capacity; options.dim = dim; options.max_hbm_for_vectors = nv::merlin::GB(hbm4values); options.io_by_cpu = io_by_cpu; using Table = nv::merlin::HashTable; std::shared_ptr
table = std::make_shared
(); table->init(options); for (float load_factor : load_factors) { std::cout << "|" << rep(1) << fixed << setprecision(2) << load_factor << " "; for (auto api : apis) { table->clear(); CUDA_CHECK(cudaDeviceSynchronize()); // There is a sampling of load_factor after several times call to target // API. Two consecutive calls can avoid the impact of sampling. auto res1 = test_one_api
(table, api, dim, init_capacity, key_num_per_op, load_factor); auto res2 = test_one_api
(table, api, dim, init_capacity, key_num_per_op, load_factor); auto res = std::max(res1, res2); std::cout << "|"; switch (api) { case API_Select::find: { std::cout << rep(1); break; } case API_Select::insert_or_assign: { std::cout << rep(11); break; } case API_Select::find_or_insert: { std::cout << rep(9); break; } case API_Select::assign: { std::cout << rep(1); break; } case API_Select::insert_and_evict: { std::cout << rep(11); break; } case API_Select::find_ptr: { std::cout << rep(1); break; } case API_Select::find_or_insert_ptr: { std::cout << rep(10); break; } case API_Select::export_batch: { std::cout << rep(7); break; } case API_Select::export_batch_if: { std::cout << rep(10); break; } case API_Select::contains: { std::cout << rep(4); break; } default: { std::cout << "[Unsupport API]"; } } std::cout << fixed << setprecision(3) << setw(6) << setfill(' ') << res << " "; } std::cout << "|\n"; } } int main() { size_t key_num_per_op = 1 * 1024 * 1024UL; cudaDeviceProp props; CUDA_CHECK(cudaGetDeviceProperties(&props, 0)); cout << endl << "## Benchmark" << endl << endl << "* GPU: 1 x " << props.name << ": " << props.major << "." << props.minor << endl << "* Key Type = uint64_t" << endl << "* Value Type = float32 * {dim}" << endl << "* Key-Values per OP = " << key_num_per_op << endl << "* Evict strategy: LRU" << endl << "* `\u03BB`" << ": load factor" << endl << "* `find*` means the `find` API that directly returns the addresses " "of values." << endl << "* `find_or_insert*` means the `find_or_insert` API that directly " "returns the addresses of values." << endl << "* ***Throughput Unit: Billion-KV/second***" << endl << endl; auto print_configuration = [](const size_t dim, const size_t init_capacity, const size_t hbm4values) { using V = float; int32_t capacity = static_cast(init_capacity / (1024 * 1024)); size_t hmem4values = init_capacity * dim * sizeof(V) / (1024 * 1024 * 1024); hmem4values = hmem4values < hbm4values ? 
0 : (hmem4values - hbm4values); cout << "\n* dim = " << dim << ", " << "capacity = " << capacity << " Million-KV, " << "HBM = " << hbm4values << " GB, " << "HMEM = " << hmem4values << " GB\n"; }; try { { std::vector apis_a{ API_Select::insert_or_assign, API_Select::find, API_Select::find_or_insert, API_Select::assign, API_Select::find_ptr, API_Select::find_or_insert_ptr, API_Select::insert_and_evict}; std::vector apis_b{API_Select::export_batch, API_Select::export_batch_if, API_Select::contains}; test_mode = Test_Mode::pure_hbm; cout << "### On pure HBM mode: " << endl; print_configuration(8, 128 * 1024 * 1024UL, 4); print_title_a(); test_main(apis_a, 8, 128 * 1024 * 1024UL, key_num_per_op, 4); print_title_b(); test_main(apis_b, 8, 128 * 1024 * 1024UL, key_num_per_op, 4); print_configuration(32, 128 * 1024 * 1024UL, 16); print_title_a(); test_main(apis_a, 32, 128 * 1024 * 1024UL, key_num_per_op, 16); print_title_b(); test_main(apis_b, 32, 128 * 1024 * 1024UL, key_num_per_op, 16); print_configuration(64, 64 * 1024 * 1024UL, 16); print_title_a(); test_main(apis_a, 64, 64 * 1024 * 1024UL, key_num_per_op, 16); print_title_b(); test_main(apis_b, 64, 64 * 1024 * 1024UL, key_num_per_op, 16); cout << endl; } { std::vector apis_a{ API_Select::insert_or_assign, API_Select::find, API_Select::find_or_insert, API_Select::assign, API_Select::find_ptr, API_Select::find_or_insert_ptr}; std::vector apis_b{API_Select::export_batch, API_Select::export_batch_if, API_Select::contains}; cout << "### On HBM+HMEM hybrid mode: " << endl; test_mode = Test_Mode::hybrid; print_configuration(64, 128 * 1024 * 1024UL, 16); print_title_a(); test_main(apis_a, 64, 128 * 1024 * 1024UL, key_num_per_op, 16); print_title_b(); test_main(apis_b, 64, 128 * 1024 * 1024UL, key_num_per_op, 16); print_configuration(64, 512 * 1024 * 1024UL, 32); print_title_a(); test_main(apis_a, 64, 512 * 1024 * 1024UL, key_num_per_op, 32); print_title_b(); test_main(apis_b, 64, 512 * 1024 * 1024UL, key_num_per_op, 32); cout 
<< endl; } CUDA_CHECK(cudaDeviceSynchronize()); } catch (const nv::merlin::CudaException& e) { cerr << e.what() << endl; } CUDA_CHECK(cudaDeviceSynchronize()); return 0; } ================================================ FILE: build_deps/gpus/BUILD ================================================ ================================================ FILE: build_deps/gpus/check_cuda_libs.py ================================================ # Copyright (c) 2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # """Verifies that a list of libraries is installed on the system. Takes a list of arguments with every two subsequent arguments being a logical tuple of (path, check_soname). The path to the library and either True or False to indicate whether to check the soname field on the shared library. Example Usage: ./check_cuda_libs.py /path/to/lib1.so True /path/to/lib2.so False """ import os import os.path import platform import subprocess import sys # pylint: disable=g-import-not-at-top,g-importing-member try: from shutil import which except ImportError: from distutils.spawn import find_executable as which # pylint: enable=g-import-not-at-top,g-importing-member class ConfigError(Exception): pass def check_cuda_lib(path, check_soname=True): """Tests if a library exists on disk and whether its soname matches the filename. Args: path: the path to the library. check_soname: whether to check the soname as well. 
Raises: ConfigError: If the library does not exist or if its soname does not match the filename. """ if not os.path.isfile(path): raise ConfigError("No library found under: " + path) objdump = which("objdump") if check_soname and objdump is not None: # Decode is necessary as in py3 the return type changed from str to bytes output = subprocess.check_output([objdump, "-p", path]).decode("utf-8") output = [line for line in output.splitlines() if "SONAME" in line] sonames = [line.strip().split(" ")[-1] for line in output] if not any(soname == os.path.basename(path) for soname in sonames): raise ConfigError("None of the libraries match their SONAME: " + path) def main(): try: args = [argv for argv in sys.argv[1:]] if len(args) % 2 == 1: raise ConfigError("Expected even number of arguments") checked_paths = [] for i in range(0, len(args), 2): path = args[i] check_cuda_lib(path, check_soname=args[i + 1] == "True") checked_paths.append(path) # pylint: disable=superfluous-parens print(os.linesep.join(checked_paths)) # pylint: enable=superfluous-parens except ConfigError as e: sys.stderr.write(str(e)) sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: build_deps/gpus/configure.bzl ================================================ """Repository rule for CUDA autoconfiguration. `cuda_configure` depends on the following environment variables: * `NEED_CUDA`: Whether to enable building with CUDA. * `GCC_HOST_COMPILER_PATH`: The GCC host compiler path * `SYSROOT`: The sysroot to use when compiling. * `CUDA_PATHS`: The base paths to look for CUDA and cuDNN. Default is `/usr/local/cuda,usr/`. * `CUDA_TOOLKIT_PATH` (deprecated): The path to the CUDA toolkit. Default is `/usr/local/cuda`. * `CUDA_VERSION`: The version of the CUDA toolkit. If this is blank, then use the system default. * `CUDNN_VERSION`: The version of the cuDNN library. * `CUDNN_INSTALL_PATH` (deprecated): The path to the cuDNN library. Default is `/usr/local/cuda`. 
* `CUDA_COMPUTE_CAPABILITIES`: The CUDA compute capabilities. Default is `3.5,5.2`. * `PYTHON_BIN_PATH`: The python binary path """ load( "@bazel_tools//tools/cpp:lib_cc_configure.bzl", "escape_string", "get_env_var", ) load( "//build_deps/remote_config:common.bzl", "config_repo_label", "err_out", "execute", "get_bash_bin", "get_cpu_value", "get_host_environ", "get_python_bin", "raw_exec", "read_dir", "realpath", "which", ) _GCC_HOST_COMPILER_PATH = "GCC_HOST_COMPILER_PATH" _GCC_HOST_COMPILER_PREFIX = "GCC_HOST_COMPILER_PREFIX" _SYSROOT = "SYSROOT" _CUDA_TOOLKIT_PATH = "CUDA_TOOLKIT_PATH" _CUDA_VERSION = "CUDA_VERSION" _CUDNN_VERSION = "CUDNN_VERSION" _CUDNN_INSTALL_PATH = "CUDNN_INSTALL_PATH" _CUDA_COMPUTE_CAPABILITIES = "CUDA_COMPUTE_CAPABILITIES" _CUDA_CONFIG_REPO = "CUDA_CONFIG_REPO" _PYTHON_BIN_PATH = "PYTHON_BIN_PATH" _TENSORRT_VERSION = "TENSORRT_VERSION" _TENSORRT_INSTALL_PATH = "TENSORRT_INSTALL_PATH" _TENSORRT_STATIC_PATH = "TENSORRT_STATIC_PATH" _TENSORRT_LIBS = [ "nvinfer", "nvinfer_plugin", "nvonnxparser", "nvparsers", ] _TENSORRT_HEADERS = [ "NvInfer.h", "NvUtils.h", "NvInferPlugin.h", ] _TENSORRT_HEADERS_V6 = [ "NvInfer.h", "NvUtils.h", "NvInferPlugin.h", "NvInferVersion.h", "NvInferRuntime.h", "NvInferRuntimeCommon.h", "NvInferPluginUtils.h", "NvOnnxParser.h", "NvOnnxConfig.h", ] _TENSORRT_HEADERS_V8 = [ "NvInfer.h", "NvInferLegacyDims.h", "NvInferImpl.h", "NvUtils.h", "NvInferPlugin.h", "NvInferVersion.h", "NvInferRuntime.h", "NvInferRuntimeCommon.h", "NvInferPluginUtils.h", "NvOnnxParser.h", "NvOnnxConfig.h", ] def _at_least_version(actual_version, required_version): actual = [int(v) for v in actual_version.split(".")] required = [int(v) for v in required_version.split(".")] return actual >= required def _get_tensorrt_headers(tensorrt_version): if _at_least_version(tensorrt_version, "8"): return _TENSORRT_HEADERS_V8 if _at_least_version(tensorrt_version, "6"): return _TENSORRT_HEADERS_V6 return _TENSORRT_HEADERS def to_list_of_strings(elements): 
"""Convert the list of ["a", "b", "c"] into '"a", "b", "c"'. This is to be used to put a list of strings into the bzl file templates so it gets interpreted as list of strings in Starlark. Args: elements: list of string elements Returns: single string of elements wrapped in quotes separated by a comma.""" quoted_strings = ["\"" + element + "\"" for element in elements] return ", ".join(quoted_strings) def verify_build_defines(params): """Verify all variables that crosstool/BUILD.tpl expects are substituted. Args: params: dict of variables that will be passed to the BUILD.tpl template. """ missing = [] for param in [ "cxx_builtin_include_directories", "extra_no_canonical_prefixes_flags", "host_compiler_path", "host_compiler_prefix", "host_compiler_warnings", "linker_bin_path", "compiler_deps", "unfiltered_compile_flags", ]: if ("%{" + param + "}") not in params: missing.append(param) if missing: auto_configure_fail( "BUILD.tpl template is missing these variables: " + str(missing) + ".\nWe only got: " + str(params) + ".", ) # TODO(dzc): Once these functions have been factored out of Bazel's # cc_configure.bzl, load them from @bazel_tools instead. # BEGIN cc_configure common functions. def find_cc(repository_ctx): """Find the C++ compiler.""" target_cc_name = "gcc" cc_path_envvar = _GCC_HOST_COMPILER_PATH cc_name = target_cc_name cc_name_from_env = get_host_environ(repository_ctx, cc_path_envvar) if cc_name_from_env: cc_name = cc_name_from_env if cc_name.startswith("/"): # Absolute path, maybe we should make this supported by our which function. return cc_name cc = which(repository_ctx, cc_name) if cc == None: fail(("Cannot find {}, either correct your path or set the {}" + " environment variable").format(target_cc_name, cc_path_envvar)) return cc _INC_DIR_MARKER_BEGIN = "#include <...>" # OSX add " (framework directory)" at the end of line, strip it. 
def _normalize_include_path(repository_ctx, path):
    """Normalizes include paths before writing them to the crosstool.

    If path points inside the 'crosstool' folder of the repository, a relative
    path is returned.
    If path points outside the 'crosstool' folder, an absolute path is returned.

    Args:
      repository_ctx: The repository context, used to resolve both `path` and
        the repository's 'crosstool' folder.
      path: The include path to normalize.

    Returns:
      The path relative to the 'crosstool' folder when it lies inside it
      (an empty string for the folder itself), otherwise the resolved path.
    """
    path = str(repository_ctx.path(path))
    crosstool_folder = str(repository_ctx.path(".").get_child("crosstool"))
    prefix_len = len(crosstool_folder)

    # A bare prefix test would also match sibling folders such as
    # "$REPO/crosstool_extra", so require the prefix to be followed by a path
    # separator (or to be the whole path).
    if path.startswith(crosstool_folder) and (
        len(path) == prefix_len or path[prefix_len] in ("/", "\\")
    ):
        # We drop the path to "$REPO/crosstool" and a trailing path separator.
        return path[prefix_len + 1:]
    return path
"-print-resource-dir", ) if print_resource_dir_supported: resource_dir = repository_ctx.execute( [cc, "-print-resource-dir"], ).stdout.strip() + "/share" inc_dirs += "\n" + resource_dir return [ _normalize_include_path(repository_ctx, _cxx_inc_convert(p)) for p in inc_dirs.split("\n") ] def get_cxx_inc_directories(repository_ctx, cc, tf_sysroot): """Compute the list of default C and C++ include directories.""" includes_cpp = _get_cxx_inc_directories_impl( repository_ctx, cc, True, tf_sysroot, ) includes_c = _get_cxx_inc_directories_impl( repository_ctx, cc, False, tf_sysroot, ) return includes_cpp + [ inc for inc in includes_c if inc not in includes_cpp ] def auto_configure_fail(msg): """Output failure message when cuda configuration fails.""" red = "\033[0;31m" no_color = "\033[0m" fail("\n%sCuda Configuration Error:%s %s\n" % (red, no_color, msg)) # END cc_configure common functions (see TODO above). def _cuda_include_path(repository_ctx, cuda_config): """Generates the Starlark string with cuda include directories. Args: repository_ctx: The repository context. cc: The path to the gcc host compiler. Returns: A list of the gcc host compiler include directories. """ nvcc_path = repository_ctx.path("%s/bin/nvcc%s" % ( cuda_config.cuda_toolkit_path, ".exe" if cuda_config.cpu_value == "Windows" else "", )) # The expected exit code of this command is non-zero. Bazel remote execution # only caches commands with zero exit code. So force a zero exit code. cmd = "%s -v /dev/null -o /dev/null ; [ $? 
def compute_capabilities(repository_ctx):
    """Returns a list of strings representing cuda compute capabilities.

    The raw value is read through get_host_environ using the
    CUDA_COMPUTE_CAPABILITIES variable name, with "compute_35,compute_52"
    passed as the fallback. Entries are comma separated; whitespace around an
    entry is ignored. Entries may use the old 'x.y' form or the
    'compute_NN' / 'sm_NN' form, where NN is two or three digits.

    Args:
      repository_ctx: the repo rule's context.

    Returns:
      list of cuda architectures to compile for. 'compute_xy' refers to
      both PTX and SASS, 'sm_xy' refers to SASS only.
    """
    raw_value = get_host_environ(
        repository_ctx,
        _CUDA_COMPUTE_CAPABILITIES,
        "compute_35,compute_52",
    )

    # Tolerate whitespace around the separators, e.g. "7.5, 8.0".
    capabilities = [entry.strip() for entry in raw_value.split(",")]

    # Map old 'x.y' capabilities to 'compute_xy'.
    if len(capabilities) > 0 and all(
        [len(x.split(".")) == 2 for x in capabilities],
    ):
        # If all capabilities are in 'x.y' format, only include PTX for the
        # highest capability. Ordering by (length, text) sorts digit strings
        # numerically; a plain string sort would place "100" before "75" and
        # pick the wrong capability as the highest one.
        ordered = sorted([
            (len(digits), digits)
            for digits in [x.replace(".", "") for x in capabilities]
        ])
        cc_list = [digits for _, digits in ordered]
        capabilities = [
            "sm_%s" % x
            for x in cc_list[:-1]
        ] + ["compute_%s" % cc_list[-1]]

    # Mixed input: convert any remaining 'x.y' entry individually.
    for i, capability in enumerate(capabilities):
        parts = capability.split(".")
        if len(parts) != 2:
            continue
        capabilities[i] = "compute_%s%s" % (parts[0], parts[1])

    # Make list unique
    capabilities = dict(zip(capabilities, capabilities)).keys()

    # Validate capabilities.
    for capability in capabilities:
        if not capability.startswith(("compute_", "sm_")):
            auto_configure_fail("Invalid compute capability: %s" % capability)
        for prefix in ["compute_", "sm_"]:
            if not capability.startswith(prefix):
                continue
            suffix = capability[len(prefix):]

            # Accept two digits (e.g. sm_80) as well as three (e.g. sm_100).
            if len(suffix) in (2, 3) and suffix.isdigit():
                continue
            auto_configure_fail("Invalid compute capability: %s" % capability)

    return capabilities
def _should_check_soname(version, static):
    """Tells whether the SONAME of a library should be verified.

    The result is always a real boolean. It is later rendered with str() and
    passed on the command line of the library check script, which treats only
    the literal "True" as enabling the check and requires an even number of
    arguments; returning an empty `version` unchanged would render as an empty
    string and drop an argument.

    Args:
      version: The library version string; may be None or empty.
      static: True if the library is static.

    Returns:
      True only for a versioned, non-static library; False otherwise.
    """
    return bool(version) and not static
cuda_config: The CUDA config as returned by _get_cuda_config Returns: Map of library names to structs of filename and path. """ cpu_value = cuda_config.cpu_value stub_dir = "/stubs" check_cuda_libs_params = { "cuda": _check_cuda_lib_params( "cuda", cpu_value, cuda_config.config["cuda_library_dir"] + stub_dir, version = None, static = False, ), "cudart": _check_cuda_lib_params( "cudart", cpu_value, cuda_config.config["cuda_library_dir"], cuda_config.cudart_version, static = False, ), "cudart_static": _check_cuda_lib_params( "cudart_static", cpu_value, cuda_config.config["cuda_library_dir"], cuda_config.cudart_version, static = True, ), "cublas": _check_cuda_lib_params( "cublas", cpu_value, cuda_config.config["cublas_library_dir"], cuda_config.cublas_version, static = False, ), "cublasLt": _check_cuda_lib_params( "cublasLt", cpu_value, cuda_config.config["cublas_library_dir"], cuda_config.cublas_version, static = False, ), "cusolver": _check_cuda_lib_params( "cusolver", cpu_value, cuda_config.config["cusolver_library_dir"], cuda_config.cusolver_version, static = False, ), "curand": _check_cuda_lib_params( "curand", cpu_value, cuda_config.config["curand_library_dir"], cuda_config.curand_version, static = False, ), "cufft": _check_cuda_lib_params( "cufft", cpu_value, cuda_config.config["cufft_library_dir"], cuda_config.cufft_version, static = False, ), "cudnn": _check_cuda_lib_params( "cudnn", cpu_value, cuda_config.config["cudnn_library_dir"], cuda_config.cudnn_version, static = False, ), "cupti": _check_cuda_lib_params( "cupti", cpu_value, cuda_config.config["cupti_library_dir"], cuda_config.cupti_version, static = False, ), "cusparse": _check_cuda_lib_params( "cusparse", cpu_value, cuda_config.config["cusparse_library_dir"], cuda_config.cusparse_version, static = False, ), } # Verify that the libs actually exist at their locations. 
def find_cuda_config(repository_ctx, script_path, cuda_libraries):
    """Returns CUDA config dictionary from running find_cuda_config.py.

    Args:
      repository_ctx: The repository context.
      script_path: Path of the find_cuda_config.py script to run.
      cuda_libraries: List of library names forwarded to the script.

    Returns:
      A dict built from the script's stdout, which is expected to hold one
      "key: value" entry per line. If a key is repeated, the last value wins.
    """
    exec_result = _exec_find_cuda_config(
        repository_ctx,
        script_path,
        cuda_libraries,
    )
    if exec_result.return_code:
        auto_configure_fail("Failed to run find_cuda_config.py: %s" % err_out(exec_result))

    # Parse the dict from stdout.
    config = {}
    for line in exec_result.stdout.splitlines():
        if not line.strip():
            continue

        # Split on the first separator only, so a value that itself contains
        # ": " stays intact instead of producing a three-element entry.
        parts = line.split(": ", 1)
        if len(parts) != 2:
            auto_configure_fail(
                "Unexpected find_cuda_config.py output line: %s" % line,
            )
        config[parts[0]] = parts[1]
    return config
""" config = find_cuda_config( repository_ctx, find_cuda_config_script, ["cuda", "cudnn"], ) cpu_value = get_cpu_value(repository_ctx) toolkit_path = config["cuda_toolkit_path"] cuda_version = config["cuda_version"].split(".") cuda_major = cuda_version[0] cuda_minor = cuda_version[1] cuda_version = "%s.%s" % (cuda_major, cuda_minor) cudnn_version = "%s" % config["cudnn_version"] if int(cuda_major) >= 11: # The libcudart soname in CUDA 11.x is versioned as 11.0 for backward compatability. if int(cuda_major) == 11: cudart_version = "11.0" cupti_version = cuda_version else: cudart_version = ("%s") % cuda_major cupti_version = cudart_version cublas_version = ("%s") % config["cublas_version"].split(".")[0] cusolver_version = ("%s") % config["cusolver_version"].split(".")[0] curand_version = ("%s") % config["curand_version"].split(".")[0] cufft_version = ("%s") % config["cufft_version"].split(".")[0] cusparse_version = ("%s") % config["cusparse_version"].split(".")[0] elif (int(cuda_major), int(cuda_minor)) >= (10, 1): # cuda_lib_version is for libraries like cuBLAS, cuFFT, cuSOLVER, etc. # It changed from 'x.y' to just 'x' in CUDA 10.1. 
def _tpl(repository_ctx, tpl, substitutions = {}, out = None):
    """Expands the template `//build_deps/gpus/<tpl>.tpl` into the repository.

    Args:
      repository_ctx: The repository context.
      tpl: Template name; it selects the `.tpl` label and, with ":" replaced
        by "/", provides the default output path.
      substitutions: Dict of substitutions handed to repository_ctx.template.
      out: Output path. When falsy, the path derived from `tpl` is used.
    """
    template_label = Label("//build_deps/gpus/%s.tpl" % tpl)
    destination = out if out else tpl.replace(":", "/")
    repository_ctx.template(destination, template_label, substitutions)
def _norm_path(path):
    """Returns a path with '/' separators and without a trailing slash.

    Args:
      path: The path string to normalize; may be empty.

    Returns:
      `path` with every backslash replaced by '/' and one trailing '/'
      removed, if present. An empty string is returned unchanged.
    """
    path = path.replace("\\", "/")

    # endswith() is safe on an empty string, unlike indexing path[-1].
    if path.endswith("/"):
        path = path[:-1]
    return path
"%s/" %s\""", )""" % (name, "\n".join(outs), src_dir, out_dir, post_cmd) def _flag_enabled(repository_ctx, flag_name): return get_host_environ(repository_ctx, flag_name) == "1" def _tf_sysroot(repository_ctx): return get_host_environ(repository_ctx, _SYSROOT, "") def _compute_cuda_extra_copts(repository_ctx, compute_capabilities): copts = [] for capability in compute_capabilities: if capability.startswith("compute_"): capability = capability.replace("compute_", "sm_") copts.append("--cuda-include-ptx=%s" % capability) copts.append("--cuda-gpu-arch=%s" % capability) return str(copts) def _tpl_path(repository_ctx, filename): return repository_ctx.path(Label("//build_deps/gpus/%s.tpl" % filename)) def _basename(repository_ctx, path_str): """Returns the basename of a path of type string. """ num_chars = len(path_str) for i in range(num_chars): r_i = num_chars - 1 - i if path_str[r_i] == "/": return path_str[r_i + 1:] return path_str def _create_local_cuda_repository(repository_ctx): """Creates the repository containing files set up to build with CUDA.""" tpl_paths = { filename: _tpl_path(repository_ctx, filename) for filename in [ "cuda:build_defs.bzl", "crosstool:crosstool_compiler_wrapper", "crosstool:BUILD", "crosstool:cc_toolchain_config.bzl", "cuda:cuda_config.h", "cuda:cuda_config.py", ] } tpl_paths["cuda:BUILD"] = _tpl_path(repository_ctx, "cuda:BUILD") find_cuda_config_script = repository_ctx.path( Label("//build_deps/gpus:find_cuda_config.py"), ) cuda_config = _get_cuda_config(repository_ctx, find_cuda_config_script) cuda_include_path = cuda_config.config["cuda_include_dir"] cublas_include_path = cuda_config.config["cublas_include_dir"] cudnn_header_dir = cuda_config.config["cudnn_include_dir"] cupti_header_dir = cuda_config.config["cupti_include_dir"] nvvm_libdevice_dir = cuda_config.config["nvvm_library_dir"] # Create genrule to copy files from the installed CUDA toolkit into execroot. 
copy_rules = [ make_copy_dir_rule( repository_ctx, name = "cuda-include", src_dir = cuda_include_path, out_dir = "cuda/include", ), make_copy_dir_rule( repository_ctx, name = "cuda-nvvm", src_dir = nvvm_libdevice_dir, out_dir = "cuda/nvvm/libdevice", ), make_copy_dir_rule( repository_ctx, name = "cuda-extras", src_dir = cupti_header_dir, out_dir = "cuda/extras/CUPTI/include", ), ] copy_rules.append( make_copy_files_rule( repository_ctx, name = "cublas-include", srcs = [ cublas_include_path + "/cublas.h", cublas_include_path + "/cublas_v2.h", cublas_include_path + "/cublas_api.h", cublas_include_path + "/cublasLt.h", ], outs = [ "cublas/include/cublas.h", "cublas/include/cublas_v2.h", "cublas/include/cublas_api.h", "cublas/include/cublasLt.h", ], ), ) cusolver_include_path = cuda_config.config["cusolver_include_dir"] copy_rules.append( make_copy_files_rule( repository_ctx, name = "cusolver-include", srcs = [ cusolver_include_path + "/cusolver_common.h", cusolver_include_path + "/cusolverDn.h", ], outs = [ "cusolver/include/cusolver_common.h", "cusolver/include/cusolverDn.h", ], ), ) cufft_include_path = cuda_config.config["cufft_include_dir"] copy_rules.append( make_copy_files_rule( repository_ctx, name = "cufft-include", srcs = [ cufft_include_path + "/cufft.h", ], outs = [ "cufft/include/cufft.h", ], ), ) cusparse_include_path = cuda_config.config["cusparse_include_dir"] copy_rules.append( make_copy_files_rule( repository_ctx, name = "cusparse-include", srcs = [ cusparse_include_path + "/cusparse.h", ], outs = [ "cusparse/include/cusparse.h", ], ), ) curand_include_path = cuda_config.config["curand_include_dir"] copy_rules.append( make_copy_files_rule( repository_ctx, name = "curand-include", srcs = [ curand_include_path + "/curand.h", ], outs = [ "curand/include/curand.h", ], ), ) check_cuda_libs_script = repository_ctx.path( Label("//build_deps/gpus:check_cuda_libs.py"), ) cuda_libs = _find_libs(repository_ctx, check_cuda_libs_script, cuda_config) cuda_lib_srcs 
= [] cuda_lib_outs = [] for path in cuda_libs.values(): cuda_lib_srcs.append(path) cuda_lib_outs.append("cuda/lib/" + _basename(repository_ctx, path)) copy_rules.append( make_copy_files_rule( repository_ctx, name = "cuda-lib", srcs = cuda_lib_srcs, outs = cuda_lib_outs, ), ) file_ext = "" bin_files = ( ["crt/link.stub"] + [f + file_ext for f in ["bin2c", "fatbinary", "nvlink", "nvprune"]] ) copy_rules.append( make_copy_files_rule( repository_ctx, name = "cuda-bin", srcs = [ cuda_config.cuda_toolkit_path + "/bin/" + f for f in bin_files ], outs = ["cuda/bin/" + f for f in bin_files], ), ) # Select the headers based on the cuDNN version (strip '64_' for Windows). cudnn_headers = ["cudnn.h"] if cuda_config.cudnn_version.rsplit("_", 1)[-1] >= "8": cudnn_headers += [ "cudnn_backend.h", "cudnn_adv_infer.h", "cudnn_adv_train.h", "cudnn_cnn_infer.h", "cudnn_cnn_train.h", "cudnn_ops_infer.h", "cudnn_ops_train.h", "cudnn_version.h", ] cudnn_srcs = [] cudnn_outs = [] for header in cudnn_headers: cudnn_srcs.append(cudnn_header_dir + "/" + header) cudnn_outs.append("cudnn/include/" + header) copy_rules.append( make_copy_files_rule( repository_ctx, name = "cudnn-include", srcs = cudnn_srcs, outs = cudnn_outs, ), ) # Set up BUILD file for cuda/ repository_ctx.template( "cuda/build_defs.bzl", tpl_paths["cuda:build_defs.bzl"], { "%{cuda_is_configured}": "True", "%{cuda_extra_copts}": _compute_cuda_extra_copts( repository_ctx, cuda_config.compute_capabilities, ), "%{cuda_gpu_architectures}": str(cuda_config.compute_capabilities), }, ) cub_actual = "@cub_archive//:cub" if int(cuda_config.cuda_version_major) >= 11: cub_actual = ":cuda_headers" repository_ctx.template( "cuda/BUILD", tpl_paths["cuda:BUILD"], { "%{cuda_driver_lib}": _basename(repository_ctx, cuda_libs["cuda"]), "%{cudart_static_lib}": _basename(repository_ctx, cuda_libs["cudart_static"]), "%{cudart_static_linkopt}": _cudart_static_linkopt(cuda_config.cpu_value), "%{cudart_lib}": _basename(repository_ctx, 
cuda_libs["cudart"]), "%{cublas_lib}": _basename(repository_ctx, cuda_libs["cublas"]), "%{cublasLt_lib}": _basename(repository_ctx, cuda_libs["cublasLt"]), "%{cusolver_lib}": _basename(repository_ctx, cuda_libs["cusolver"]), "%{cudnn_lib}": _basename(repository_ctx, cuda_libs["cudnn"]), "%{cufft_lib}": _basename(repository_ctx, cuda_libs["cufft"]), "%{curand_lib}": _basename(repository_ctx, cuda_libs["curand"]), "%{cupti_lib}": _basename(repository_ctx, cuda_libs["cupti"]), "%{cusparse_lib}": _basename(repository_ctx, cuda_libs["cusparse"]), "%{cub_actual}": cub_actual, "%{copy_rules}": "\n".join(copy_rules), }, ) tf_sysroot = _tf_sysroot(repository_ctx) # Set up crosstool/ cc = find_cc(repository_ctx) cc_fullpath = cc host_compiler_includes = get_cxx_inc_directories( repository_ctx, cc_fullpath, tf_sysroot, ) cuda_defines = {} cuda_defines["%{builtin_sysroot}"] = tf_sysroot cuda_defines["%{cuda_toolkit_path}"] = "" cuda_defines["%{compiler}"] = "unknown" host_compiler_prefix = get_host_environ( repository_ctx, _GCC_HOST_COMPILER_PREFIX, ) if not host_compiler_prefix: host_compiler_prefix = "/usr/bin" cuda_defines["%{host_compiler_prefix}"] = host_compiler_prefix cuda_defines["%{linker_bin_path}"] = host_compiler_prefix cuda_defines["%{extra_no_canonical_prefixes_flags}"] = "" cuda_defines["%{unfiltered_compile_flags}"] = "" cuda_defines["%{host_compiler_path}"] = "crosstool_compiler_wrapper" cuda_defines["%{host_compiler_warnings}"] = "" # nvcc has the system include paths built in and will automatically # search them; we cannot work around that, so we add the relevant cuda # system paths to the allowed compiler specific include paths. 
cuda_defines["%{cxx_builtin_include_directories}"] = to_list_of_strings( host_compiler_includes + _cuda_include_path( repository_ctx, cuda_config, ) + [cupti_header_dir, cudnn_header_dir], ) # For gcc, do not canonicalize system header paths; some versions of gcc # pick the shortest possible path for system includes when creating the # .d file - given that includes that are prefixed with "../" multiple # time quickly grow longer than the root of the tree, this can lead to # bazel's header check failing. cuda_defines["%{extra_no_canonical_prefixes_flags}"] = "\"-fno-canonical-system-headers\"" file_ext = "" nvcc_path = "%s/nvcc%s" % (cuda_config.config["cuda_binary_dir"], file_ext) cuda_defines["%{compiler_deps}"] = ":crosstool_compiler" wrapper_defines = { "%{cpu_compiler}": str(cc), "%{cuda_version}": cuda_config.cuda_version, "%{nvcc_path}": nvcc_path, "%{gcc_host_compiler_path}": str(cc), } repository_ctx.template( "crosstool/crosstool_compiler_wrapper", tpl_paths["crosstool:crosstool_compiler_wrapper"], wrapper_defines, ) verify_build_defines(cuda_defines) # Only expand template variables in the BUILD file repository_ctx.template( "crosstool/BUILD", tpl_paths["crosstool:BUILD"], cuda_defines, ) # No templating of cc_toolchain_config - use attributes and templatize the # BUILD file. 
def _get_tensorrt_static_path(repository_ctx):
    """Looks up the TENSORRT_STATIC_PATH setting in the host environment.

    Args:
      repository_ctx: The repository context.

    Returns:
      Whatever get_host_environ yields for TENSORRT_STATIC_PATH, with None
      passed as the fallback (presumably returned when the variable is unset;
      confirm against get_host_environ in remote_config/common.bzl).
    """
    return get_host_environ(repository_ctx, _TENSORRT_STATIC_PATH, None)
"tensorrt_lib", srcs = [library_dir + library for library in libraries], outs = ["tensorrt/lib/" + library for library in libraries], ), make_copy_files_rule( repository_ctx, name = "tensorrt_include", srcs = [include_dir + header for header in headers], outs = ["tensorrt/include/" + header for header in headers], ), ] tensorrt_static_path = _get_tensorrt_static_path(repository_ctx) if tensorrt_static_path: tensorrt_static_path = tensorrt_static_path + "/" if _at_least_version(tensorrt_version, "8"): raw_static_library_names = _TENSORRT_LIBS else: raw_static_library_names = _TENSORRT_LIBS + [ "nvrtc", "myelin_compiler", "myelin_executor", "myelin_pattern_library", "myelin_pattern_runtime", ] static_library_names = [ "%s_static" % name for name in raw_static_library_names ] static_libraries = [ lib_name(lib, cpu_value, tensorrt_version, static = True) for lib in static_library_names ] copy_rules = copy_rules + [ make_copy_files_rule( repository_ctx, name = "tensorrt_static_lib", srcs = [ tensorrt_static_path + library for library in static_libraries ], outs = [ "tensorrt/lib/" + library for library in static_libraries ], ), ] tpl_paths = { "tensorrt/build_defs.bzl": _tpl_path(repository_ctx, "tensorrt:build_defs.bzl"), "tensorrt/BUILD": _tpl_path(repository_ctx, "tensorrt:BUILD"), "tensorrt/tensorrt_config.h": _tpl_path(repository_ctx, "tensorrt:tensorrt_config.h"), "tensorrt/tensorrt_config.py": _tpl_path(repository_ctx, "tensorrt:tensorrt_config.py"), } # Set up config file. repository_ctx.template( "tensorrt/build_defs.bzl", tpl_paths["tensorrt/build_defs.bzl"], {"%{if_tensorrt}": "if_true"}, ) # Set up BUILD file. repository_ctx.template( "tensorrt/BUILD", tpl_paths["tensorrt/BUILD"], { "%{copy_rules}": "\n".join(copy_rules), }, ) # Set up tensorrt_config.h, which is used by # tensorflow/stream_executor/dso_loader.cc. 
def _py_tmpl_dict(d):
    """Builds the substitution dict for the generated *_config.py templates.

    Args:
      d: The value to embed; it is rendered with str().

    Returns:
      A dict with the single key "%{cuda_config}" mapped to str(d).
    """
    rendered = str(d)
    substitutions = {}
    substitutions["%{cuda_config}"] = rendered
    return substitutions
load(":cc_toolchain_config.bzl", "cc_toolchain_config")

licenses(["restricted"])

package(default_visibility = ["//visibility:public"])

# Registers ":cc-compiler-local" as the C++ toolchain for Linux x86_64
# execution and target platforms.
toolchain(
    name = "toolchain-linux-x86_64",
    exec_compatible_with = [
        "@platforms//os:linux",
        "@platforms//cpu:x86_64",
    ],
    target_compatible_with = [
        "@platforms//os:linux",
        "@platforms//cpu:x86_64",
    ],
    toolchain = ":cc-compiler-local",
    toolchain_type = "@bazel_tools//tools/cpp:toolchain_type",
)

# Maps cpu (and "cpu|compiler") keys to one of the two cc_toolchain targets
# below; every key except the darwin ones selects ":cc-compiler-local".
cc_toolchain_suite(
    name = "toolchain",
    toolchains = {
        "local|compiler": ":cc-compiler-local",
        "darwin|compiler": ":cc-compiler-darwin",
        "arm": ":cc-compiler-local",
        "aarch64": ":cc-compiler-local",
        "k8": ":cc-compiler-local",
        "piii": ":cc-compiler-local",
        "ppc": ":cc-compiler-local",
        "darwin": ":cc-compiler-darwin",
    },
)

cc_toolchain(
    name = "cc-compiler-local",
    all_files = "%{compiler_deps}",
    compiler_files = "%{compiler_deps}",
    ar_files = "%{compiler_deps}",
    as_files = "%{compiler_deps}",
    dwp_files = ":empty",
    linker_files = "%{compiler_deps}",
    objcopy_files = ":empty",
    strip_files = ":empty",
    # To support linker flags that need to go to the start of command line
    # we need the toolchain to support parameter files. Parameter files are
    # last on the command line and contain all shared libraries to link, so all
    # regular options will be left of them.
    supports_param_files = 1,
    toolchain_identifier = "local_linux",
    toolchain_config = ":cc-compiler-local-config",
)

# Toolchain configuration for cpu = "local"; all %{...} values are filled in
# when this template is expanded.
cc_toolchain_config(
    name = "cc-compiler-local-config",
    cpu = "local",
    builtin_include_directories = [%{cxx_builtin_include_directories}],
    extra_no_canonical_prefixes_flags = [%{extra_no_canonical_prefixes_flags}],
    host_compiler_path = "%{host_compiler_path}",
    host_compiler_prefix = "%{host_compiler_prefix}",
    host_compiler_warnings = [%{host_compiler_warnings}],
    host_unfiltered_compile_flags = [%{unfiltered_compile_flags}],
    linker_bin_path = "%{linker_bin_path}",
    builtin_sysroot = "%{builtin_sysroot}",
    cuda_path = "%{cuda_toolkit_path}",
    compiler = "%{compiler}",
)

cc_toolchain(
    name = "cc-compiler-darwin",
    all_files = "%{compiler_deps}",
    compiler_files = "%{compiler_deps}",
    ar_files = "%{compiler_deps}",
    as_files = "%{compiler_deps}",
    dwp_files = ":empty",
    linker_files = "%{compiler_deps}",
    objcopy_files = ":empty",
    strip_files = ":empty",
    supports_param_files = 0,
    toolchain_identifier = "local_darwin",
    toolchain_config = ":cc-compiler-local-darwin",
)

# Toolchain configuration for cpu = "darwin". Unlike the "local" config it
# does not set builtin_sysroot, cuda_path or compiler.
cc_toolchain_config(
    name = "cc-compiler-local-darwin",
    cpu = "darwin",
    builtin_include_directories = [%{cxx_builtin_include_directories}],
    extra_no_canonical_prefixes_flags = [%{extra_no_canonical_prefixes_flags}],
    host_compiler_path = "%{host_compiler_path}",
    host_compiler_prefix = "%{host_compiler_prefix}",
    host_compiler_warnings = [%{host_compiler_warnings}],
    host_unfiltered_compile_flags = [%{unfiltered_compile_flags}],
    linker_bin_path = "%{linker_bin_path}",
)

# Empty file set used for the toolchain file groups that need no inputs.
filegroup(
    name = "empty",
    srcs = [],
)

filegroup(
    name = "crosstool_compiler",
    srcs = ["crosstool_compiler_wrapper"],
)

================================================
FILE: build_deps/gpus/crosstool/cc_toolchain_config.bzl.tpl
================================================
"""cc_toolchain_config rule for configuring CUDA toolchains on Linux, Mac, and Windows."""

load(
    "@bazel_tools//tools/cpp:cc_toolchain_config_lib.bzl",
    "action_config",
    "artifact_name_pattern",
    "env_entry",
    "env_set",
    "feature",
    "feature_set",
    "flag_group",
    "flag_set",
    "tool",
    "tool_path",
    "variable_with_value",
    "with_feature_set",
)
load("@bazel_tools//tools/build_defs/cc:action_names.bzl", "ACTION_NAMES")

# The helpers below return fixed lists of ACTION_NAMES used to scope flag sets
# and action configs.

# Assemble and preprocess-assemble actions.
def all_assembly_actions():
    return [
        ACTION_NAMES.assemble,
        ACTION_NAMES.preprocess_assemble,
    ]

# Every compile-like action: assembly, C, C++ (including header parsing and
# modules) and linkstamp compilation.
def all_compile_actions():
    return [
        ACTION_NAMES.assemble,
        ACTION_NAMES.c_compile,
        ACTION_NAMES.cpp_compile,
        ACTION_NAMES.cpp_header_parsing,
        ACTION_NAMES.cpp_module_codegen,
        ACTION_NAMES.cpp_module_compile,
        ACTION_NAMES.linkstamp_compile,
        ACTION_NAMES.preprocess_assemble,
    ]

# The C compile action only.
def all_c_compile_actions():
    return [
        ACTION_NAMES.c_compile,
    ]

# C++ compile actions, including header parsing, modules and linkstamps.
def all_cpp_compile_actions():
    return [
        ACTION_NAMES.cpp_compile,
        ACTION_NAMES.cpp_header_parsing,
        ACTION_NAMES.cpp_module_codegen,
        ACTION_NAMES.cpp_module_compile,
        ACTION_NAMES.linkstamp_compile,
    ]

# all_compile_actions() without the plain `assemble` action; used for the
# flag set that carries defines and include paths.
def all_preprocessed_actions():
    return [
        ACTION_NAMES.c_compile,
        ACTION_NAMES.cpp_compile,
        ACTION_NAMES.cpp_header_parsing,
        ACTION_NAMES.cpp_module_codegen,
        ACTION_NAMES.cpp_module_compile,
        ACTION_NAMES.linkstamp_compile,
        ACTION_NAMES.preprocess_assemble,
    ]

# Executable and dynamic-library link actions.
def all_link_actions():
    return [
        ACTION_NAMES.cpp_link_executable,
        ACTION_NAMES.cpp_link_dynamic_library,
        ACTION_NAMES.cpp_link_nodeps_dynamic_library,
    ]

def all_executable_link_actions():
    return [
        ACTION_NAMES.cpp_link_executable,
    ]

def all_shared_library_link_actions():
    return [
        ACTION_NAMES.cpp_link_dynamic_library,
        ACTION_NAMES.cpp_link_nodeps_dynamic_library,
    ]

def all_archive_actions():
    return [ACTION_NAMES.cpp_link_static_library]

def all_strip_actions():
    return [ACTION_NAMES.strip]

# Returns a flag_group that emits `flag_prefix` + %{libraries_to_link.<field>}
# for entries whose `libraries_to_link.type` equals `value`. `<field>` is
# `iterate` when given (and the group then iterates over that field),
# otherwise "name".
def _library_to_link(flag_prefix, value, iterate = None):
    return flag_group(
        flags = [
            "{}%{{libraries_to_link.{}}}".format(
                flag_prefix,
                iterate if iterate else "name",
            ),
        ],
        iterate_over = ("libraries_to_link." + iterate if iterate else None),
        expand_if_equal = variable_with_value(
            name = "libraries_to_link.type",
            value = value,
        ),
    )

# Flag groups for a static library: wrapped in `prefix`/`suffix` when
# `libraries_to_link.is_whole_archive` is true, bare name otherwise.
def _surround_static_library(prefix, suffix):
    return [
        flag_group(
            flags = [prefix, "%{libraries_to_link.name}", suffix],
            expand_if_true = "libraries_to_link.is_whole_archive",
        ),
        flag_group(
            flags = ["%{libraries_to_link.name}"],
            expand_if_false = "libraries_to_link.is_whole_archive",
        ),
    ]

# Flag groups for a static library: `prefix` is glued onto the name when
# `libraries_to_link.is_whole_archive` is true, bare name otherwise.
def _prefix_static_library(prefix):
    return [
        flag_group(
            flags = ["%{libraries_to_link.name}"],
            expand_if_false = "libraries_to_link.is_whole_archive",
        ),
        flag_group(
            flags = [prefix + "%{libraries_to_link.name}"],
            expand_if_true = "libraries_to_link.is_whole_archive",
        ),
    ]

# Returns the flag_group for `static_library` entries. With a suffix the
# library is surrounded by prefix/suffix flags; without one the prefix is
# prepended to the library name.
def _static_library_to_link(alwayslink_prefix, alwayslink_suffix = None):
    if alwayslink_suffix:
        flag_groups = _surround_static_library(alwayslink_prefix, alwayslink_suffix)
    else:
        flag_groups = _prefix_static_library(alwayslink_prefix)
    return flag_group(
        flag_groups = flag_groups,
        expand_if_equal = variable_with_value(
            name = "libraries_to_link.type",
            value = "static_library",
        ),
    )

# Returns a flag_group that iterates over the build variable `iterate_over`
# and is only expanded when that variable is available.
def _iterate_flag_group(iterate_over, flags = [], flag_groups = []):
    return flag_group(
        iterate_over = iterate_over,
        expand_if_available = iterate_over,
        flag_groups = flag_groups,
        flags = flags,
    )

# Returns the flag group that expands `libraries_to_link` for the given
# flavour ("linux" or "darwin"). Any other flavour falls through and
# returns None.
def _libraries_to_link_group(flavour):
    if flavour == "linux":
        return _iterate_flag_group(
            iterate_over = "libraries_to_link",
            flag_groups = [
                # Object file groups are bracketed by --start-lib/--end-lib.
                flag_group(
                    flags = ["-Wl,--start-lib"],
                    expand_if_equal = variable_with_value(
                        name = "libraries_to_link.type",
                        value = "object_file_group",
                    ),
                ),
                _library_to_link("", "object_file_group", "object_files"),
                flag_group(
                    flags = ["-Wl,--end-lib"],
                    expand_if_equal = variable_with_value(
                        name = "libraries_to_link.type",
                        value = "object_file_group",
                    ),
                ),
                _library_to_link("", "object_file"),
                _library_to_link("", "interface_library"),
                _static_library_to_link("-Wl,-whole-archive", "-Wl,-no-whole-archive"),
                _library_to_link("-l", "dynamic_library"),
                _library_to_link("-l:", "versioned_dynamic_library"),
            ],
        )
    elif flavour == "darwin":
        return _iterate_flag_group(
            iterate_over = "libraries_to_link",
            flag_groups = [
                _library_to_link("", "object_file_group", "object_files"),
                _library_to_link("", "object_file"),
                _library_to_link("", "interface_library"),
                _static_library_to_link("-Wl,-force_load,"),
                _library_to_link("-l", "dynamic_library"),
                _library_to_link("-l:", "versioned_dynamic_library"),
            ],
        )

# Returns one enabled action_config per action name, all using the tool at
# `path`.
def _action_configs_with_tool(path, actions):
    return [
        action_config(
            action_name = name,
            enabled = True,
            tools = [tool(path = path)],
        )
        for name in actions
    ]

# Concatenates the action configs for assembly, C compile, C++ compile,
# archive, link and strip actions, each bound to its own tool path.
def _action_configs(assembly_path, c_compiler_path, cc_compiler_path, archiver_path, linker_path, strip_path):
    return _action_configs_with_tool(
        assembly_path,
        all_assembly_actions(),
    ) + _action_configs_with_tool(
        c_compiler_path,
        all_c_compile_actions(),
    ) + _action_configs_with_tool(
        cc_compiler_path,
        all_cpp_compile_actions(),
    ) + _action_configs_with_tool(
        archiver_path,
        all_archive_actions(),
    ) + _action_configs_with_tool(
        linker_path,
        all_link_actions(),
    ) + _action_configs_with_tool(
        strip_path,
        all_strip_actions(),
    )

# Returns the tool_path list for `cpu`. "gcc" is the host compiler path; every
# other tool is looked up under host_compiler_prefix. The archiver is "ar" for
# cpu "local" and "libtool" otherwise. Fails for any other cpu.
def _tool_paths(cpu, ctx):
    if cpu in ["local", "darwin"]:
        return [
            tool_path(name = "gcc", path = ctx.attr.host_compiler_path),
            tool_path(name = "ar", path = ctx.attr.host_compiler_prefix + (
                "/ar" if cpu == "local" else "/libtool"
            )),
            tool_path(name = "compat-ld", path = ctx.attr.host_compiler_prefix + "/ld"),
            tool_path(name = "cpp", path = ctx.attr.host_compiler_prefix + "/cpp"),
            tool_path(name = "dwp", path = ctx.attr.host_compiler_prefix + "/dwp"),
            tool_path(name = "gcov", path = ctx.attr.host_compiler_prefix + "/gcov"),
            tool_path(name = "ld", path = ctx.attr.host_compiler_prefix + "/ld"),
            tool_path(name = "nm", path = ctx.attr.host_compiler_prefix + "/nm"),
            tool_path(name = "objcopy", path = ctx.attr.host_compiler_prefix + "/objcopy"),
            tool_path(name = "objdump", path = ctx.attr.host_compiler_prefix + "/objdump"),
            tool_path(name = "strip", path = ctx.attr.host_compiler_prefix + "/strip"),
        ]
    else:
        fail("Unreachable")

# Flag group adding --sysroot=<sysroot> when the `sysroot` variable is set.
def _sysroot_group():
    return flag_group(
        flags = ["--sysroot=%{sysroot}"],
        expand_if_available = "sysroot",
    )

# Flag group with -no-canonical-prefixes followed by `extra_flags`.
def _no_canonical_prefixes_group(extra_flags):
    return flag_group(
        flags = [
            "-no-canonical-prefixes",
        ] + extra_flags,
    )

# Returns a one-element list with a flag_set passing --cuda-path=<cuda_path>
# to `actions`, or an empty list when `cuda_path` is empty.
def _cuda_set(cuda_path, actions):
    if cuda_path:
        return [flag_set(
            actions = actions,
            flag_groups = [
                flag_group(
                    flags = ["--cuda-path=" + cuda_path],
                ),
            ],
        )]
    else:
        return []

# Flag group containing only "/nologo". Not referenced elsewhere in this file.
def _nologo():
    return flag_group(flags = ["/nologo"])

# Returns the full feature list for cpu "local" or "darwin"; fails otherwise.
# Legacy features are disabled, so the compile, archive and link command lines
# are spelled out in the three enabled "all_*_flags" features. The `compiler`
# argument is not read in the body.
def _features(cpu, compiler, ctx):
    if cpu in ["local", "darwin"]:
        return [
            feature(name = "no_legacy_features"),
            feature(
                name = "all_compile_flags",
                enabled = True,
                flag_sets = [
                    # Dependency file and split-DWARF flags.
                    flag_set(
                        actions = all_compile_actions(),
                        flag_groups = [
                            flag_group(
                                flags = ["-MD", "-MF", "%{dependency_file}"],
                                expand_if_available = "dependency_file",
                            ),
                            flag_group(
                                flags = ["-gsplit-dwarf"],
                                expand_if_available = "per_object_debug_info_file",
                            ),
                        ],
                    ),
                    # Defines and include paths for actions that run the
                    # preprocessor.
                    flag_set(
                        actions = all_preprocessed_actions(),
                        flag_groups = [
                            flag_group(
                                flags = ["-frandom-seed=%{output_file}"],
                                expand_if_available = "output_file",
                            ),
                            _iterate_flag_group(
                                flags = ["-D%{preprocessor_defines}"],
                                iterate_over = "preprocessor_defines",
                            ),
                            _iterate_flag_group(
                                flags = ["-include", "%{includes}"],
                                iterate_over = "includes",
                            ),
                            _iterate_flag_group(
                                flags = ["-iquote", "%{quote_include_paths}"],
                                iterate_over = "quote_include_paths",
                            ),
                            _iterate_flag_group(
                                flags = ["-I%{include_paths}"],
                                iterate_over = "include_paths",
                            ),
                            _iterate_flag_group(
                                flags = ["-isystem", "%{system_include_paths}"],
                                iterate_over = "system_include_paths",
                            ),
                            _iterate_flag_group(
                                flags = ["-F", "%{framework_include_paths}"],
                                iterate_over = "framework_include_paths",
                            ),
                        ],
                    ),
                    # Intentionally empty here: no C++-only flags are added.
                    flag_set(
                        actions = all_cpp_compile_actions(),
                        flag_groups = [],
                    ),
                    # Redacted date/time macros, PIC/PIE selection, hardening
                    # and warning flags.
                    flag_set(
                        actions = all_compile_actions(),
                        flag_groups = [
                            flag_group(
                                flags = [
                                    "-Wno-builtin-macro-redefined",
                                    "-D__DATE__=\"redacted\"",
                                    "-D__TIMESTAMP__=\"redacted\"",
                                    "-D__TIME__=\"redacted\"",
                                ],
                            ),
                            flag_group(
                                flags = ["-fPIC"],
                                expand_if_available = "pic",
                            ),
                            flag_group(
                                flags = ["-fPIE"],
                                expand_if_not_available = "pic",
                            ),
                            flag_group(
                                flags = [
                                    "-U_FORTIFY_SOURCE",
                                    "-D_FORTIFY_SOURCE=1",
                                    "-fstack-protector",
                                    "-Wall",
                                ] + ctx.attr.host_compiler_warnings + [
                                    "-fno-omit-frame-pointer",
                                ],
                            ),
                            _no_canonical_prefixes_group(
                                ctx.attr.extra_no_canonical_prefixes_flags,
                            ),
                        ],
                    ),
                    flag_set(
                        actions = all_compile_actions(),
                        flag_groups = [flag_group(flags = ["-DNDEBUG"])],
                        with_features = [with_feature_set(features = ["disable-assertions"])],
                    ),
                    flag_set(
                        actions = all_compile_actions(),
                        flag_groups = [
                            flag_group(
                                flags = [
                                    "-g0",
                                    "-O2",
                                    "-ffunction-sections",
                                    "-fdata-sections",
                                ],
                            ),
                        ],
                        with_features = [with_feature_set(features = ["opt"])],
                    ),
                    flag_set(
                        actions = all_compile_actions(),
                        flag_groups = [flag_group(flags = ["-g"])],
                        with_features = [with_feature_set(features = ["dbg"])],
                    ),
                ] + _cuda_set(
                    ctx.attr.cuda_path,
                    all_compile_actions(),
                ) + [
                    # User flags, sysroot and the source/output arguments are
                    # appended after the toolchain's own flags.
                    flag_set(
                        actions = all_compile_actions(),
                        flag_groups = [
                            _iterate_flag_group(
                                flags = ["%{user_compile_flags}"],
                                iterate_over = "user_compile_flags",
                            ),
                            _sysroot_group(),
                            flag_group(
                                expand_if_available = "source_file",
                                flags = ["-c", "%{source_file}"],
                            ),
                            flag_group(
                                expand_if_available = "output_assembly_file",
                                flags = ["-S"],
                            ),
                            flag_group(
                                expand_if_available = "output_preprocess_file",
                                flags = ["-E"],
                            ),
                            flag_group(
                                expand_if_available = "output_file",
                                flags = ["-o", "%{output_file}"],
                            ),
                        ],
                    ),
                ],
            ),
            feature(
                name = "all_archive_flags",
                enabled = True,
                flag_sets = [
                    flag_set(
                        actions = all_archive_actions(),
                        flag_groups = [
                            flag_group(
                                expand_if_available = "linker_param_file",
                                flags = ["@%{linker_param_file}"],
                            ),
                            flag_group(flags = ["rcsD"]),
                            flag_group(
                                flags = ["%{output_execpath}"],
                                expand_if_available = "output_execpath",
                            ),
                            flag_group(
                                iterate_over = "libraries_to_link",
                                flag_groups = [
                                    flag_group(
                                        flags = ["%{libraries_to_link.name}"],
                                        expand_if_equal = variable_with_value(
                                            name = "libraries_to_link.type",
                                            value = "object_file",
                                        ),
                                    ),
                                    flag_group(
                                        flags = ["%{libraries_to_link.object_files}"],
                                        iterate_over = "libraries_to_link.object_files",
                                        expand_if_equal = variable_with_value(
                                            name = "libraries_to_link.type",
                                            value = "object_file_group",
                                        ),
                                    ),
                                ],
                                expand_if_available = "libraries_to_link",
                            ),
                        ],
                    ),
                ],
            ),
            feature(
                name = "all_link_flags",
                enabled = True,
                flag_sets = [
                    flag_set(
                        actions = all_shared_library_link_actions(),
                        flag_groups = [flag_group(flags = ["-shared"])],
                    ),
                    flag_set(
                        actions = all_link_actions(),
                        # -Wl,-no-as-needed is only added for cpu "local";
                        # -B<linker_bin_path> only when that attribute is set.
                        flag_groups = ([
                            flag_group(flags = ["-Wl,-no-as-needed"])
                        ] if cpu == "local" else []) + ([
                            flag_group(flags = ["-B" + ctx.attr.linker_bin_path])
                        ] if ctx.attr.linker_bin_path else []) + [
                            flag_group(
                                flags = ["@%{linker_param_file}"],
                                expand_if_available = "linker_param_file",
                            ),
                            _iterate_flag_group(
                                flags = ["%{linkstamp_paths}"],
                                iterate_over = "linkstamp_paths",
                            ),
                            flag_group(
                                flags = ["-o", "%{output_execpath}"],
                                expand_if_available = "output_execpath",
                            ),
                            _iterate_flag_group(
                                flags = ["-L%{library_search_directories}"],
                                iterate_over = "library_search_directories",
                            ),
                            _iterate_flag_group(
                                iterate_over = "runtime_library_search_directories",
                                flags = [
                                    "-Wl,-rpath,$ORIGIN/%{runtime_library_search_directories}",
                                ] if cpu == "local" else [
                                    "-Wl,-rpath,@loader_path/%{runtime_library_search_directories}",
                                ],
                            ),
                            _libraries_to_link_group("darwin" if cpu == "darwin" else "linux"),
                            _iterate_flag_group(
                                flags = ["%{user_link_flags}"],
                                iterate_over = "user_link_flags",
                            ),
                            flag_group(
                                flags = ["-Wl,--gdb-index"],
                                expand_if_available = "is_using_fission",
                            ),
                            flag_group(
                                flags = ["-Wl,-S"],
                                expand_if_available = "strip_debug_symbols",
                            ),
                            flag_group(flags = ["-lc++" if cpu == "darwin" else "-lstdc++"]),
                            _no_canonical_prefixes_group(
                                ctx.attr.extra_no_canonical_prefixes_flags,
                            ),
                        ],
                    ),
                    flag_set(
                        actions = all_executable_link_actions(),
                        flag_groups = [flag_group(flags = ["-pie"])],
                    ),
                ] + ([
                    flag_set(
                        actions = all_link_actions(),
                        flag_groups = [flag_group(flags = [
                            "-Wl,-z,relro,-z,now",
                        ])],
                    ),
                ] if cpu == "local" else []) + ([
                    flag_set(
                        actions = all_link_actions(),
                        flag_groups = [
                            flag_group(flags = ["-Wl,--gc-sections"]),
                            flag_group(
                                flags = ["-Wl,--build-id=md5", "-Wl,--hash-style=gnu"],
                            ),
                        ],
                    ),
                ] if cpu == "local" else []) + ([
                    flag_set(
                        actions = all_link_actions(),
                        flag_groups = [flag_group(flags = ["-undefined", "dynamic_lookup"])],
                    ),
                ] if cpu == "darwin" else []) + _cuda_set(
                    ctx.attr.cuda_path,
                    all_link_actions(),
                ) + [
                    flag_set(
                        actions = all_link_actions(),
                        flag_groups = [
                            _sysroot_group(),
                        ],
                    ),
                ],
            ),
            feature(name = "disable-assertions"),
            feature(
                name = "opt",
                implies = ["disable-assertions"],
            ),
            feature(name = "fastbuild"),
            feature(name = "dbg"),
            feature(name = "supports_dynamic_linker", enabled = True),
            feature(name = "pic", enabled = True),
            feature(name = "supports_pic", enabled = True),
            feature(name = "has_configured_linker_path", enabled = True),
        ]
    else:
        fail("Unreachable")

# Rule implementation: picks identifiers and action configs by `cpu`
# ("darwin" or "local", anything else fails), writes a placeholder output
# file so the rule can be declared executable, and returns the
# CcToolchainConfigInfo together with DefaultInfo for that file.
def _impl(ctx):
    cpu = ctx.attr.cpu
    compiler = ctx.attr.compiler

    if (cpu == "darwin"):
        toolchain_identifier = "local_darwin"
        target_cpu = "darwin"
        target_libc = "macosx"

        # On darwin the `compiler` attribute is overridden with a fixed value.
        compiler = "compiler"
        action_configs = _action_configs(
            assembly_path = ctx.attr.host_compiler_path,
            c_compiler_path = ctx.attr.host_compiler_path,
            cc_compiler_path = ctx.attr.host_compiler_path,
            archiver_path = ctx.attr.host_compiler_prefix + "/libtool",
            linker_path = ctx.attr.host_compiler_path,
            strip_path = ctx.attr.host_compiler_prefix + "/strip",
        )
        artifact_name_patterns = []
    elif (cpu == "local"):
        toolchain_identifier = "local_linux"
        target_cpu = "local"
        target_libc = "local"
        action_configs = _action_configs(
            assembly_path = ctx.attr.host_compiler_path,
            c_compiler_path = ctx.attr.host_compiler_path,
            cc_compiler_path = ctx.attr.host_compiler_path,
            archiver_path = ctx.attr.host_compiler_prefix + "/ar",
            linker_path = ctx.attr.host_compiler_path,
            strip_path = ctx.attr.host_compiler_prefix + "/strip",
        )
        artifact_name_patterns = []
    else:
        fail("Unreachable")

    out = ctx.actions.declare_file(ctx.label.name)
    ctx.actions.write(out, "Fake executable")
    return [
        cc_common.create_cc_toolchain_config_info(
            ctx = ctx,
            features = _features(cpu, compiler, ctx),
            action_configs = action_configs,
            artifact_name_patterns = artifact_name_patterns,
            cxx_builtin_include_directories = ctx.attr.builtin_include_directories,
            toolchain_identifier = toolchain_identifier,
            host_system_name = "local",
            target_system_name = "local",
            target_cpu = target_cpu,
            target_libc = target_libc,
            compiler = compiler,
            abi_version = "local",
            abi_libc_version = "local",
            tool_paths = _tool_paths(cpu, ctx),
            make_variables = [],
            builtin_sysroot = ctx.attr.builtin_sysroot,
            cc_target_os = None,
        ),
        DefaultInfo(
            executable = out,
        ),
    ]

# Public rule. `cpu` is mandatory and restricted to "darwin" or "local".
# NOTE(review): `compiler` only accepts "unknown", and
# `host_unfiltered_compile_flags` is declared but not read by `_impl` in this
# file -- confirm both are intended.
cc_toolchain_config = rule(
    implementation = _impl,
    attrs = {
        "cpu": attr.string(mandatory = True, values = ["darwin", "local"]),
        "compiler": attr.string(values = ["unknown"], default = "unknown"),
        "builtin_include_directories": attr.string_list(),
        "extra_no_canonical_prefixes_flags": attr.string_list(),
        "host_compiler_path": attr.string(),
        "host_compiler_prefix": attr.string(),
        "host_compiler_warnings": attr.string_list(),
        "host_unfiltered_compile_flags": attr.string_list(),
        "linker_bin_path": attr.string(),
        "builtin_sysroot": attr.string(),
        "cuda_path": attr.string(),
    },
    provides = [CcToolchainConfigInfo],
    executable = True,
)

================================================
FILE: build_deps/gpus/crosstool/crosstool_compiler_wrapper.tpl
================================================
#!/usr/bin/env python
# Copyright (c) 2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # """Crosstool wrapper for compiling CUDA programs. SYNOPSIS: crosstool_compiler_wrapper [options passed in by cc_library() or cc_binary() rule] DESCRIPTION: This script is expected to be called by the cc_library() or cc_binary() bazel rules. When the option "-x cuda" is present in the list of arguments passed to this script, it invokes the nvcc CUDA compiler. Most arguments are passed as is as a string to --compiler-options of nvcc. When "-x cuda" is not present, this wrapper invokes hybrid_driver_is_not_gcc with the input arguments as is. """ __author__ = 'keveman@google.com (Manjunath Kudlur)' import os import pipes import re import subprocess import sys from argparse import ArgumentParser # Template values set by cuda_autoconf. CPU_COMPILER = ('%{cpu_compiler}') GCC_HOST_COMPILER_PATH = ('%{gcc_host_compiler_path}') NVCC_PATH = '%{nvcc_path}' PREFIX_DIR = os.path.dirname(GCC_HOST_COMPILER_PATH) NVCC_VERSION = '%{cuda_version}' def Log(s): print('gpus/crosstool: {0}'.format(s)) def GetOptionValue(argv, option): """Extract the list of values for option from the argv list. Args: argv: A list of strings, possibly the argv passed to main(). option: The option whose value to extract, with the leading '-'. Returns: A list of values, either directly following the option, (eg., -opt val1 val2) or values collected from multiple occurrences of the option (eg., -opt val1 -opt val2). 
""" parser = ArgumentParser() parser.add_argument(option, nargs='*', action='append') option = option.lstrip('-').replace('-', '_') args, _ = parser.parse_known_args(argv) if not args or not vars(args)[option]: return [] else: return sum(vars(args)[option], []) def GetHostCompilerOptions(argv): """Collect the -isystem, -iquote, and --sysroot option values from argv. Args: argv: A list of strings, possibly the argv passed to main(). Returns: The string that can be used as the --compiler-options to nvcc. """ parser = ArgumentParser() parser.add_argument('-isystem', nargs='*', action='append') parser.add_argument('-iquote', nargs='*', action='append') parser.add_argument('--sysroot', nargs=1) parser.add_argument('-g', nargs='*', action='append') parser.add_argument('-fno-canonical-system-headers', action='store_true') parser.add_argument('-no-canonical-prefixes', action='store_true') args, _ = parser.parse_known_args(argv) opts = '' if args.isystem: opts += ' -isystem ' + ' -isystem '.join(sum(args.isystem, [])) if args.iquote: opts += ' -iquote ' + ' -iquote '.join(sum(args.iquote, [])) if args.g: opts += ' -g' + ' -g'.join(sum(args.g, [])) if args.fno_canonical_system_headers: opts += ' -fno-canonical-system-headers' if args.no_canonical_prefixes: opts += ' -no-canonical-prefixes' if args.sysroot: opts += ' --sysroot ' + args.sysroot[0] return opts def _update_options(nvcc_options): if NVCC_VERSION in ("7.0", ): return nvcc_options update_options = {"relaxed-constexpr": "expt-relaxed-constexpr"} return [ update_options[opt] if opt in update_options else opt for opt in nvcc_options ] def GetNvccOptions(argv): """Collect the -nvcc_options values from argv. Args: argv: A list of strings, possibly the argv passed to main(). Returns: The string that can be passed directly to nvcc. 
""" parser = ArgumentParser() parser.add_argument('-nvcc_options', nargs='*', action='append') args, _ = parser.parse_known_args(argv) if args.nvcc_options: options = _update_options(sum(args.nvcc_options, [])) return ' '.join(['--' + a for a in options]) return '' def system(cmd): """Invokes cmd with os.system(). Args: cmd: The command. Returns: The exit code if the process exited with exit() or -signal if the process was terminated by a signal. """ retv = os.system(cmd) if os.WIFEXITED(retv): return os.WEXITSTATUS(retv) else: return -os.WTERMSIG(retv) def InvokeNvcc(argv, log=False): """Call nvcc with arguments assembled from argv. Args: argv: A list of strings, possibly the argv passed to main(). log: True if logging is requested. Returns: The return value of calling system('nvcc ' + args) """ host_compiler_options = GetHostCompilerOptions(argv) nvcc_compiler_options = GetNvccOptions(argv) opt_option = GetOptionValue(argv, '-O') m_options = GetOptionValue(argv, '-m') m_options = ''.join([' -m' + m for m in m_options if m in ['32', '64']]) include_options = GetOptionValue(argv, '-I') out_file = GetOptionValue(argv, '-o') depfiles = GetOptionValue(argv, '-MF') defines = GetOptionValue(argv, '-D') defines = ''.join([' -D' + define for define in defines]) undefines = GetOptionValue(argv, '-U') undefines = ''.join([' -U' + define for define in undefines]) std_options = GetOptionValue(argv, '-std') nvcc_allowed_std_options = ["c++03", "c++11", "c++14"] nvcc_std_map = {} if int(NVCC_VERSION.split('.')[0]) >= 11: nvcc_std_map["c++1z"] = "c++17" nvcc_allowed_std_options += ["c++17", "c++1z"] std_options = ''.join([ ' -std=' + (nvcc_std_map[define] if define in nvcc_std_map else define) for define in std_options if define in nvcc_allowed_std_options ][-1:]) fatbin_options = ''.join([ ' --fatbin-options=' + option for option in GetOptionValue(argv, '-Xcuda-fatbinary') ]) # The list of source files get passed after the -c option. 
I don't know of # any other reliable way to just get the list of source files to be compiled. src_files = GetOptionValue(argv, '-c') # Pass -w through from host to nvcc, but don't do anything fancier with # warnings-related flags, since they're not necessarily the same across # compilers. warning_options = ' -w' if '-w' in argv else '' if len(src_files) == 0: return 1 if len(out_file) != 1: return 1 opt = (' -O2' if (len(opt_option) > 0 and int(opt_option[0]) > 0) else ' -g') includes = (' -I ' + ' -I '.join(include_options) if len(include_options) > 0 else '') # Unfortunately, there are other options that have -c prefix too. # So allowing only those look like C/C++ files. src_files = [ f for f in src_files if re.search('\.cpp$|\.cc$|\.c$|\.cxx$|\.C|\.cu|\.cuh$', f) ] srcs = ' '.join(src_files) out = ' -o ' + out_file[0] nvccopts = '-D_FORCE_INLINES ' capabilities_sm = set(GetOptionValue(argv, "--cuda-gpu-arch")) capabilities_compute = set(GetOptionValue(argv, '--cuda-include-ptx')) # When both "code=sm_xy" and "code=compute_xy" are requested for a single # arch, they can be combined using "code=xy,compute_xy" which avoids a # redundant PTX generation during compilation. 
capabilities_both = capabilities_sm.intersection(capabilities_compute) for capability in capabilities_both: capability = capability[len('sm_'):] nvccopts += r'-gencode=arch=compute_%s,code=\"sm_%s,compute_%s\" ' % ( capability, capability, capability) for capability in capabilities_sm - capabilities_both: capability = capability[len('sm_'):] nvccopts += r'-gencode=arch=compute_%s,\"code=sm_%s\" ' % (capability, capability) for capability in capabilities_compute - capabilities_both: capability = capability[len('sm_'):] nvccopts += r'-gencode=arch=compute_%s,\"code=compute_%s\" ' % ( capability, capability) nvccopts += nvcc_compiler_options nvccopts += undefines nvccopts += defines nvccopts += std_options nvccopts += m_options nvccopts += warning_options # Force C++17 dialect (note, everything in just one string!) nvccopts += ' --std c++17 ' nvccopts += fatbin_options if depfiles: # Generate the dependency file depfile = depfiles[0] cmd = (NVCC_PATH + ' ' + nvccopts + ' --compiler-options "' + host_compiler_options + '"' + ' --compiler-bindir=' + GCC_HOST_COMPILER_PATH + ' -I .' + ' -x cu ' + opt + includes + ' ' + srcs + ' -M -o ' + depfile) if log: Log(cmd) exit_status = system(cmd) if exit_status != 0: return exit_status cmd = (NVCC_PATH + ' ' + nvccopts + ' --compiler-options "' + host_compiler_options + ' -fPIC"' + ' --compiler-bindir=' + GCC_HOST_COMPILER_PATH + ' -I .' + ' -x cu ' + opt + includes + ' -c ' + srcs + out) # TODO(zhengxq): for some reason, 'gcc' needs this help to find 'as'. # Need to investigate and fix. 
cmd = 'PATH=' + PREFIX_DIR + ':$PATH ' + cmd if log: Log(cmd) return system(cmd) def main(): parser = ArgumentParser() parser.add_argument('-x', nargs=1) parser.add_argument('--cuda_log', action='store_true') args, leftover = parser.parse_known_args(sys.argv[1:]) if args.x and args.x[0] == 'cuda': if args.cuda_log: Log('-x cuda') leftover = [pipes.quote(s) for s in leftover] if args.cuda_log: Log('using nvcc') return InvokeNvcc(leftover, log=args.cuda_log) # Strip our flags before passing through to the CPU compiler for files which # are not -x cuda. We can't just pass 'leftover' because it also strips -x. # We not only want to pass -x to the CPU compiler, but also keep it in its # relative location in the argv list (the compiler is actually sensitive to # this). cpu_compiler_flags = [ flag for flag in sys.argv[1:] if not flag.startswith(('--cuda_log')) ] return subprocess.call([CPU_COMPILER] + cpu_compiler_flags) if __name__ == '__main__': sys.exit(main()) ================================================ FILE: build_deps/gpus/cuda/BUILD ================================================ ================================================ FILE: build_deps/gpus/cuda/BUILD.tpl ================================================ load(":build_defs.bzl", "cuda_header_library") load("@bazel_skylib//:bzl_library.bzl", "bzl_library") load("@bazel_skylib//lib:selects.bzl", "selects") load("@bazel_skylib//rules:common_settings.bzl", "bool_flag") licenses(["restricted"]) # MPL2, portions GPL v3, LGPL v3, BSD-like package(default_visibility = ["//visibility:public"]) bool_flag( name = "enable_cuda", build_setting_default = False, ) config_setting( name = "is_cuda_enabled", flag_values = {":enable_cuda": "True"}, ) # Config setting whether built with CUDA support using nvcc. # # TODO(b/174244321), DEPRECATED: this target will be removed when all users # have been converted to :is_cuda_enabled (most) or :is_cuda_compiler_nvcc. 
selects.config_setting_group(
    name = "using_nvcc",
    # NOTE(review): these labels point at the repository root package, while
    # `is_cuda_enabled` is declared in this file and `is_cuda_compiler_nvcc`
    # is not declared here at all -- confirm both targets exist at `//:`.
    match_all = [
        "//:is_cuda_enabled",
        "//:is_cuda_compiler_nvcc",
    ],
)

# Private setting that matches when compilation_mode is "opt".
config_setting(
    name = "_opt",
    values = {"compilation_mode": "opt"},
    visibility = ["//visibility:private"],
)

# Provides CUDA headers for '#include "third_party/gpus/cuda/include/cuda.h"'
# All clients including TensorFlow should use these directives.
cuda_header_library(
    name = "cuda_headers",
    hdrs = [
        "cuda/cuda_config.h",
        ":cuda-include",
    ],
    include_prefix = "third_party/gpus",
    includes = [
        ".",  # required to include cuda/cuda/cuda_config.h as cuda/config.h
        "cuda/include",
    ],
)

# Library targets below wrap files under cuda/lib/; the file names are
# substituted for the %{..._lib} placeholders when the template is expanded.
cc_library(
    name = "cudart_static",
    srcs = ["cuda/lib/%{cudart_static_lib}"],
    linkopts = [
        "-ldl",
        "-lpthread",
        %{cudart_static_linkopt}
    ],
)

cc_library(
    name = "cuda_driver",
    srcs = ["cuda/lib/%{cuda_driver_lib}"],
)

cc_library(
    name = "cudart",
    srcs = ["cuda/lib/%{cudart_lib}"],
    data = ["cuda/lib/%{cudart_lib}"],
    linkstatic = 1,
)

# Per-library header targets. Each strips its own "<lib>/include" prefix and
# re-exposes the headers under third_party/gpus/cuda/include.
cuda_header_library(
    name = "cublas_headers",
    hdrs = [":cublas-include"],
    include_prefix = "third_party/gpus/cuda/include",
    includes = ["cublas/include"],
    strip_include_prefix = "cublas/include",
    deps = [":cuda_headers"],
)

cuda_header_library(
    name = "cusolver_headers",
    hdrs = [":cusolver-include"],
    include_prefix = "third_party/gpus/cuda/include",
    includes = ["cusolver/include"],
    strip_include_prefix = "cusolver/include",
    deps = [":cuda_headers"],
)

cuda_header_library(
    name = "cufft_headers",
    hdrs = [":cufft-include"],
    include_prefix = "third_party/gpus/cuda/include",
    includes = ["cufft/include"],
    strip_include_prefix = "cufft/include",
    deps = [":cuda_headers"],
)

cuda_header_library(
    name = "cusparse_headers",
    hdrs = [":cusparse-include"],
    include_prefix = "third_party/gpus/cuda/include",
    includes = ["cusparse/include"],
    strip_include_prefix = "cusparse/include",
    deps = [":cuda_headers"],
)

cuda_header_library(
    name = "curand_headers",
    hdrs = [":curand-include"],
    include_prefix = "third_party/gpus/cuda/include",
    includes = ["curand/include"],
    strip_include_prefix = "curand/include",
    deps = [":cuda_headers"],
)

cc_library(
    name = "cublas",
    srcs = ["cuda/lib/%{cublas_lib}"],
    data = ["cuda/lib/%{cublas_lib}"],
    linkstatic = 1,
)

cc_library(
    name = "cublasLt",
    srcs = ["cuda/lib/%{cublasLt_lib}"],
    data = ["cuda/lib/%{cublasLt_lib}"],
    linkstatic = 1,
)

cc_library(
    name = "cusolver",
    srcs = ["cuda/lib/%{cusolver_lib}"],
    data = ["cuda/lib/%{cusolver_lib}"],
    linkopts = ["-lgomp"],
    linkstatic = 1,
)

cc_library(
    name = "cudnn",
    srcs = ["cuda/lib/%{cudnn_lib}"],
    data = ["cuda/lib/%{cudnn_lib}"],
    linkstatic = 1,
)

cc_library(
    name = "cudnn_header",
    hdrs = [":cudnn-include"],
    include_prefix = "third_party/gpus/cudnn",
    strip_include_prefix = "cudnn/include",
    deps = [":cuda_headers"],
)

cc_library(
    name = "cufft",
    srcs = ["cuda/lib/%{cufft_lib}"],
    data = ["cuda/lib/%{cufft_lib}"],
    linkstatic = 1,
)

cc_library(
    name = "curand",
    srcs = ["cuda/lib/%{curand_lib}"],
    data = ["cuda/lib/%{curand_lib}"],
    linkstatic = 1,
)

# Aggregate target: headers plus cublas, cublasLt, cudart, cudnn, cufft and
# curand. cusolver and cusparse are not part of it.
cc_library(
    name = "cuda",
    deps = [
        ":cublas",
        ":cublasLt",
        ":cuda_headers",
        ":cudart",
        ":cudnn",
        ":cufft",
        ":curand",
    ],
)

alias(
    name = "cub_headers",
    actual = "%{cub_actual}",
)

cuda_header_library(
    name = "cupti_headers",
    hdrs = [":cuda-extras"],
    include_prefix = "third_party/gpus",
    includes = ["cuda/extras/CUPTI/include/"],
    deps = [":cuda_headers"],
)

# Data-only target: the CUPTI library is listed under `data`, not `srcs`.
cc_library(
    name = "cupti_dsos",
    data = ["cuda/lib/%{cupti_lib}"],
)

cc_library(
    name = "cusparse",
    srcs = ["cuda/lib/%{cusparse_lib}"],
    data = ["cuda/lib/%{cusparse_lib}"],
    linkopts = ["-lgomp"],
    linkstatic = 1,
)

cc_library(
    name = "libdevice_root",
    data = [":cuda-nvvm"],
)

bzl_library(
    name = "build_defs_bzl",
    srcs = ["build_defs.bzl"],
    deps = [
        "@bazel_skylib//lib:selects",
    ],
)

py_library(
    name = "cuda_config_py",
    srcs = ["cuda/cuda_config.py"],
)

# Placeholder replaced with generated rules when the template is expanded.
%{copy_rules}

================================================
FILE: build_deps/gpus/cuda/build_defs.bzl.tpl
================================================
# Macros for building CUDA code.
def cuda_cc_test(copts=[], **kwargs):
    """Wrapper over native.cc_test that forwards its arguments unchanged.

    Unlike cuda_cc_library() and cuda_cc_binary() in this file, this macro
    does NOT prepend cuda_default_copts(): only the caller-supplied `copts`
    reach native.cc_test.

    NOTE(review): confirm whether leaving out the default CUDA options is
    intentional for tests; the code below is the source of truth.

    Args:
      copts: compiler options, passed to native.cc_test as-is.
      **kwargs: every other cc_test attribute, forwarded verbatim.
    """
    native.cc_test(copts=copts, **kwargs)
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef CUDA_CUDA_CONFIG_H_ #define CUDA_CUDA_CONFIG_H_ #define CUDA_VERSION "%{cuda_version}" #define CUDART_VERSION "%{cudart_version}" #define CUPTI_VERSION "%{cupti_version}" #define CUBLAS_VERSION "%{cublas_version}" #define CUSOLVER_VERSION "%{cusolver_version}" #define CURAND_VERSION "%{curand_version}" #define CUFFT_VERSION "%{cufft_version}" #define CUSPARSE_VERSION "%{cusparse_version}" #define CUDNN_VERSION "%{cudnn_version}" #define CUDA_TOOLKIT_PATH "%{cuda_toolkit_path}" #define CUDA_COMPUTE_CAPABILITIES %{cuda_compute_capabilities} #endif // CUDA_CUDA_CONFIG_H_ ================================================ FILE: build_deps/gpus/cuda/cuda_config.py.tpl ================================================ # Copyright (c) 2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # config = %{cuda_config} ================================================ FILE: build_deps/gpus/find_cuda_config.py ================================================ # Copyright (c) 2023, NVIDIA CORPORATION. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # """Prints CUDA library and header directories and versions found on the system. The script searches for CUDA library and header files on the system, inspects them to determine their version and prints the configuration to stdout. The paths to inspect and the required versions are specified through environment variables. If no valid configuration is found, the script prints to stderr and returns an error code. The list of libraries to find is specified as arguments. Supported libraries are CUDA (includes cuBLAS), cuDNN, NCCL, and TensorRT. The script takes a list of base directories specified by the CUDA_PATHS environment variable as a comma-separated glob list. The script looks for headers and library files in a hard-coded set of subdirectories from these base paths. If CUDA_PATHS is not specified, an OS-specific default is used: Linux: /usr/local/cuda, /usr, and paths from 'ldconfig -p'. Windows: CUDA_PATH environment variable, or C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\* For backwards compatibility, some libraries also use alternative base directories from other environment variables if they are specified.
List of library-specific environment variables: Library Version env variable Additional base directories ---------------------------------------------------------------- CUDA CUDA_VERSION CUDA_TOOLKIT_PATH cuBLAS CUBLAS_VERSION CUDA_TOOLKIT_PATH cuDNN CUDNN_VERSION CUDNN_INSTALL_PATH NCCL NCCL_VERSION NCCL_INSTALL_PATH, NCCL_HDR_PATH TensorRT TENSORRT_VERSION TENSORRT_INSTALL_PATH Versions environment variables can be of the form 'x' or 'x.y' to request a specific version, empty or unspecified to accept any version. The output of a found library is of the form: tf__version: x.y.z tf__header_dir: ... tf__library_dir: ... """ import glob import io import os import platform import re import subprocess import sys # pylint: disable=g-import-not-at-top try: from shutil import which except ImportError: from distutils.spawn import find_executable as which # pylint: enable=g-import-not-at-top class ConfigError(Exception): pass def _is_linux(): return platform.system() == "Linux" def _is_macos(): return platform.system() == "Darwin" def _matches_version(actual_version, required_version): """Checks whether some version meets the requirements. All elements of the required_version need to be present in the actual_version. required_version actual_version result ----------------------------------------- 1 1.1 True 1.2 1 False 1.2 1.3 False 1 True Args: required_version: The version specified by the user. actual_version: The version detected from the CUDA installation. Returns: Whether the actual version matches the required one. """ if actual_version is None: return False # Strip spaces from the versions. 
actual_version = actual_version.strip() required_version = required_version.strip() return actual_version.startswith(required_version) def _at_least_version(actual_version, required_version): actual = [int(v) for v in actual_version.split(".")] required = [int(v) for v in required_version.split(".")] return actual >= required def _get_header_version(path, name): """Returns preprocessor defines in C header file.""" for line in io.open(path, "r", encoding="utf-8").readlines(): match = re.match("\s*#\s*define %s\s+(\d+)" % name, line) if match: return match.group(1) return "" def _cartesian_product(first, second): """Returns all path combinations of first and second.""" return [os.path.join(f, s) for f in first for s in second] def _get_ld_config_paths(): """Returns all directories from 'ldconfig -p'.""" if not _is_linux(): return [] ldconfig_path = which("ldconfig") or "/sbin/ldconfig" output = subprocess.check_output([ldconfig_path, "-p"]) pattern = re.compile(".* => (.*)") result = set() for line in output.splitlines(): try: match = pattern.match(line.decode("ascii")) except UnicodeDecodeError: match = False if match: result.add(os.path.dirname(match.group(1))) return sorted(list(result)) def _get_default_cuda_paths(cuda_version): if not cuda_version: cuda_version = "*" elif not "." 
def _find_library(base_paths, library_name, required_version):
    """Returns the first file matching the shared-library glob for a library.

    Only the first dot-separated component of required_version is put into
    the glob pattern. The search itself is delegated to _find_file() over the
    relative directories from _library_paths().

    Args:
      base_paths: base directories to search.
      library_name: library name without the "lib" prefix, e.g. "cudart".
      required_version: version string; may be empty.

    Returns:
      Whatever _find_file() returns for the computed pattern.
    """
    leading_component = required_version.split(".")[:1]
    stem = "lib" + library_name
    if _is_macos():
        # Shape: lib<name>.<major>*.dylib
        filepattern = ".".join([stem] + leading_component) + "*.dylib"
    else:
        # Shape: lib<name>.so.<major>*
        filepattern = ".".join([stem, "so"] + leading_component) + "*"
    return _find_file(base_paths, _library_paths(), filepattern)
def _find_cuda_config(base_paths, required_version):
    """Locates the CUDA toolkit pieces and returns their directories.

    Looks up cuda.h (whose CUDA_VERSION define determines the toolkit
    version), the cudart library, nvcc, libdevice and the CUPTI header and
    library under base_paths.

    Args:
      base_paths: base directories to search.
      required_version: CUDA version requested by the user; may be empty.

    Returns:
      A dict with the keys cuda_version, cuda_include_dir, cuda_library_dir,
      cuda_binary_dir, nvvm_library_dir, cupti_include_dir,
      cupti_library_dir and cuda_toolkit_path.

    Raises:
      ConfigError: if a component cannot be found, or if the toolkit root
        derived from nvcc differs from the one derived from libdevice.
    """

    def get_header_version(path):
        # _get_header_version() yields "" when the define is absent. Feeding
        # that to int() would raise ValueError and abort the whole search
        # instead of simply skipping this header candidate.
        raw_version = _get_header_version(path, "CUDA_VERSION")
        if not raw_version:
            return None
        version = int(raw_version)
        if not version:
            return None
        return "%d.%d" % (version // 1000, version % 1000 // 10)

    cuda_header_path, header_version = _find_header(base_paths, "cuda.h",
                                                    required_version,
                                                    get_header_version)
    cuda_version = header_version  # x.y, see above.

    cuda_library_path = _find_library(base_paths, "cudart", cuda_version)

    def get_nvcc_version(path):
        # Raw string: "\d" inside a normal literal is an invalid escape.
        pattern = r"Cuda compilation tools, release \d+\.\d+, V(\d+\.\d+\.\d+)"
        for line in subprocess.check_output([path, "--version"]).splitlines():
            match = re.match(pattern, line.decode("ascii"))
            if match:
                return match.group(1)
        return None

    nvcc_name = "nvcc"
    # Only the path is needed; the matched nvcc version is discarded.
    nvcc_path, _ = _find_versioned_file(base_paths, [
        "",
        "bin",
        "local/cuda/bin",
    ], nvcc_name, cuda_version, get_nvcc_version)

    nvvm_path = _find_file(base_paths, [
        "nvvm/libdevice",
        "share/cuda",
        "lib/nvidia-cuda-toolkit/libdevice",
        "local/cuda/nvvm/libdevice",
    ], "libdevice*.10.bc")

    cupti_header_path = _find_file(base_paths, _header_paths(), "cupti.h")
    cupti_library_path = _find_library(base_paths, "cupti", required_version)

    cuda_binary_dir = os.path.dirname(nvcc_path)
    nvvm_library_dir = os.path.dirname(nvvm_path)

    # XLA requires the toolkit path to find ptxas and libdevice.
    # TODO(csigg): pass in both directories instead.
    cuda_toolkit_paths = (
        os.path.normpath(os.path.join(cuda_binary_dir, "..")),
        os.path.normpath(os.path.join(nvvm_library_dir, "../..")),
    )
    if cuda_toolkit_paths[0] != cuda_toolkit_paths[1]:
        raise ConfigError("Inconsistent CUDA toolkit path: %s vs %s" %
                          cuda_toolkit_paths)

    return {
        "cuda_version": cuda_version,
        "cuda_include_dir": os.path.dirname(cuda_header_path),
        "cuda_library_dir": os.path.dirname(cuda_library_path),
        "cuda_binary_dir": cuda_binary_dir,
        "nvvm_library_dir": nvvm_library_dir,
        "cupti_include_dir": os.path.dirname(cupti_header_path),
        "cupti_library_dir": os.path.dirname(cupti_library_path),
        "cuda_toolkit_path": cuda_toolkit_paths[0],
    }
def _find_cusolver_config(base_paths, required_version, cuda_version):
    """Finds the cuSOLVER header and library directories.

    When cuda_version is at least 11.0, the version is read from the
    CUSOLVER_VER_* defines in cusolver_common.h and its first component
    selects the library. Otherwise the header is located without a version
    check, cuda_version is reported as the header version and
    required_version selects the library.

    Args:
      base_paths: base directories to search.
      required_version: cuSOLVER version requested by the user; may be empty.
      cuda_version: detected CUDA version.

    Returns:
      A dict with cusolver_version, cusolver_include_dir and
      cusolver_library_dir.
    """
    header_name = "cusolver_common.h"

    def get_header_version(path):
        # Join whichever of the four defines are present in the header.
        components = []
        for define in ("CUSOLVER_VER_MAJOR", "CUSOLVER_VER_MINOR",
                       "CUSOLVER_VER_PATCH", "CUSOLVER_VER_BUILD"):
            value = _get_header_version(path, define)
            if value != "":
                components.append(value)
        return ".".join(components)

    if not _at_least_version(cuda_version, "11.0"):
        header_version = cuda_version
        header_path = _find_file(base_paths, _header_paths(), header_name)
        library_version = required_version
    else:
        header_path, header_version = _find_header(base_paths, header_name,
                                                    required_version,
                                                    get_header_version)
        library_version = header_version.split(".")[0]

    library_path = _find_library(base_paths, "cusolver", library_version)

    config = {}
    config["cusolver_version"] = header_version
    config["cusolver_include_dir"] = os.path.dirname(header_path)
    config["cusolver_library_dir"] = os.path.dirname(library_path)
    return config
def _find_cudnn_config(base_paths, required_version):
    """Finds the cuDNN header and library directories.

    The version is read from the CUDNN_MAJOR / CUDNN_MINOR / CUDNN_PATCHLEVEL
    defines of either cudnn.h or cudnn_version.h. A header whose CUDNN_MAJOR
    is missing is reported as having no version.

    Args:
      base_paths: base directories to search.
      required_version: cuDNN version requested by the user; may be empty.

    Returns:
      A dict with cudnn_version (first version component only),
      cudnn_include_dir and cudnn_library_dir.
    """

    def get_header_version(path):
        major, minor, patch = [
            _get_header_version(path, define)
            for define in ("CUDNN_MAJOR", "CUDNN_MINOR", "CUDNN_PATCHLEVEL")
        ]
        if not major:
            return None
        return ".".join([major, minor, patch])

    candidate_headers = ("cudnn.h", "cudnn_version.h")
    header_path, header_version = _find_header(base_paths, candidate_headers,
                                                required_version,
                                                get_header_version)
    major_version = header_version.split(".")[0]
    library_path = _find_library(base_paths, "cudnn", major_version)

    include_dir = os.path.dirname(header_path)
    library_dir = os.path.dirname(library_path)
    return {
        "cudnn_version": major_version,
        "cudnn_include_dir": include_dir,
        "cudnn_library_dir": library_dir,
    }
"cusparse.h") cusparse_version = required_version library_path = _find_library(base_paths, "cusparse", cusparse_version) return { "cusparse_version": header_version, "cusparse_include_dir": os.path.dirname(header_path), "cusparse_library_dir": os.path.dirname(library_path), } def _find_nccl_config(base_paths, required_version): def get_header_version(path): version = (_get_header_version(path, name) for name in ("NCCL_MAJOR", "NCCL_MINOR", "NCCL_PATCH")) return ".".join(version) header_path, header_version = _find_header(base_paths, "nccl.h", required_version, get_header_version) nccl_version = header_version.split(".")[0] library_path = _find_library(base_paths, "nccl", nccl_version) return { "nccl_version": nccl_version, "nccl_include_dir": os.path.dirname(header_path), "nccl_library_dir": os.path.dirname(library_path), } def _find_tensorrt_config(base_paths, required_version): def get_header_version(path): version = (_get_header_version(path, name) for name in ("NV_TENSORRT_MAJOR", "NV_TENSORRT_MINOR", "NV_TENSORRT_PATCH")) # `version` is a generator object, so we convert it to a list before using # it (muitiple times below). version = list(version) if not all(version): # Versions not found, make _matches_version returns False. return None return ".".join(version) header_path, header_version = _find_header(base_paths, "NvInferVersion.h", required_version, get_header_version) tensorrt_version = header_version.split(".")[0] library_path = _find_library(base_paths, "nvinfer", tensorrt_version) return { "tensorrt_version": tensorrt_version, "tensorrt_include_dir": os.path.dirname(header_path), "tensorrt_library_dir": os.path.dirname(library_path), } def _list_from_env(env_name, default=[]): """Returns comma-separated list from environment variable.""" if env_name in os.environ: return os.environ[env_name].split(",") return default def _get_legacy_path(env_name, default=[]): """Returns a path specified by a legacy environment variable. 
CUDNN_INSTALL_PATH, NCCL_INSTALL_PATH, TENSORRT_INSTALL_PATH set to '/usr/lib/x86_64-linux-gnu' would previously find both library and header paths. Detect those and return '/usr', otherwise forward to _list_from_env(). """ if env_name in os.environ: match = re.match("^(/[^/ ]*)+/lib/\w+-linux-gnu/?$", os.environ[env_name]) if match: return [match.group(1)] return _list_from_env(env_name, default) def _normalize_path(path): """Returns normalized path, with forward slashes on Windows.""" return os.path.realpath(path) def find_cuda_config(): """Returns a dictionary of CUDA library and header file paths.""" libraries = [argv.lower() for argv in sys.argv[1:]] cuda_version = os.environ.get("CUDA_VERSION", "") base_paths = _list_from_env("CUDA_PATHS", _get_default_cuda_paths(cuda_version)) base_paths = [path for path in base_paths if os.path.exists(path)] result = {} if "cuda" in libraries: cuda_paths = _list_from_env("CUDA_TOOLKIT_PATH", base_paths) result.update(_find_cuda_config(cuda_paths, cuda_version)) cuda_version = result["cuda_version"] cublas_paths = base_paths if tuple(int(v) for v in cuda_version.split(".")) < (10, 1): # Before CUDA 10.1, cuBLAS was in the same directory as the toolkit. 
def main():
    """Prints the detected configuration as sorted 'key: value' lines.

    If find_cuda_config() raises ConfigError, the message is written to
    stderr and the process exits with status 1.
    """
    try:
        config = find_cuda_config()
    except ConfigError as error:
        sys.stderr.write(str(error) + '\n')
        sys.exit(1)

    # Keys are unique, so ordering by key equals ordering the items.
    for key in sorted(config):
        print("%s: %s" % (key, config[key]))
def get_python_bin(repository_ctx):
    """Gets the python bin path.

    The PYTHON_BIN_PATH host environment value wins when it is set; otherwise
    "python3" and then "python" are looked up with which(). Configuration
    fails if none of them yields a path.

    Args:
      repository_ctx: the repository_ctx

    Returns:
      The python bin path.
    """
    configured = get_host_environ(repository_ctx, PYTHON_BIN_PATH)
    if configured:
        return configured

    # Prefer an explicit "python3"; some systems just call python3 "python".
    found = None
    for candidate in ("python3", "python"):
        found = which(repository_ctx, candidate, True)
        if found:
            return found

    path_hint = "%s='/something/else'.\nPATH=%s" % (
        PYTHON_BIN_PATH,
        get_environ(repository_ctx, "PATH"),
    )
    auto_config_fail(
        "Cannot find python in PATH, please make sure " +
        "python is installed and add its directory in PATH, or --define " +
        path_hint)
    return found  # unreachable
def get_host_environ(repository_ctx, name, default_value=None):
    """Returns the value of an environment variable on the host platform.

    The host platform is the machine that Bazel runs on. The value is looked
    up in repository_ctx.os.environ first and then, if the rule has an
    "environ" attribute, in repository_ctx.attr.environ.

    Args:
      repository_ctx: the repository_ctx
      name: the name of environment variable
      default_value: returned when neither source defines 'name'

    Returns:
      The stripped value of the environment variable 'name' on the host
      platform, or 'default_value' if it is not set.
    """
    host_env = repository_ctx.os.environ
    if name in host_env:
        return host_env.get(name).strip()

    rule_attrs = repository_ctx.attr
    if hasattr(rule_attrs, "environ"):
        rule_env = rule_attrs.environ
        if name in rule_env:
            return rule_env.get(name).strip()

    return default_value
def files_exist(repository_ctx, paths, bash_bin=None):
    """Checks which files in paths exists.

    All paths are probed with a single bash invocation that prints one
    "True"/"False" line per path.

    Args:
      repository_ctx: the repository_ctx
      paths: a list of paths
      bash_bin: path to the bash interpreter; looked up with get_bash_bin()
        when omitted

    Returns:
      Returns a list of Bool. True means that the path at the same position
      in the paths list exists. An empty 'paths' yields an empty list.
    """
    if not paths:
        # An empty command prints nothing, and execute() treats empty stdout
        # as a failure unless allow_failure is set, so answer directly.
        return []

    if bash_bin == None:
        bash_bin = get_bash_bin(repository_ctx)

    cmd_tpl = "[ -e \"%s\" ] && echo True || echo False"
    cmds = [cmd_tpl % path for path in paths]
    cmd = " ; ".join(cmds)

    stdout = execute(repository_ctx, [bash_bin, "-c", cmd]).stdout.strip()
    return [val == "True" for val in stdout.splitlines()]
def err_out(result):
    """Picks the error text of an exec_result.

    Works around an RBE bug where stderr is delivered as stdout: callers
    should use err_out(result) in place of result.stderr.

    Args:
      result: the exec_result.

    Returns:
      result.stderr when it is non-empty, otherwise result.stdout.
    """
    has_stderr = len(result.stderr) != 0
    return result.stderr if has_stderr else result.stdout
def _remote_platform_configure_impl(repository_ctx):
    """Generates the BUILD file describing the remote execution platform.

    Resolves the platform name ("local" is mapped to "osx" or "linux" from
    the host OS name), derives the CPU from bash's $MACHTYPE (defaulting to
    x86_64), serializes the platform_exec_properties attribute and
    instantiates BUILD.tpl with the three values.

    Args:
      repository_ctx: the repository_ctx
    """
    platform = repository_ctx.attr.platform
    if platform == "local":
        os_name = repository_ctx.os.name.lower()
        platform = "osx" if os_name.startswith("mac os") else "linux"

    machine_type = repository_ctx.execute(["bash", "-c",
                                           "echo $MACHTYPE"]).stdout

    # Ordered (prefix, cpu) pairs; the first matching prefix wins, so "arm64"
    # has to stay ahead of the shorter "arm".
    cpu_by_prefix = [
        ("ppc", "ppc"),
        ("powerpc", "ppc"),
        ("s390x", "s390x"),
        ("aarch64", "aarch64"),
        ("arm64", "aarch64"),
        ("arm", "arm"),
        ("mips64", "mips64"),
        ("riscv64", "riscv64"),
    ]
    cpu = "x86_64"
    for prefix, cpu_name in cpu_by_prefix:
        if machine_type.startswith(prefix):
            cpu = cpu_name
            break

    entries = [
        "\"%s\" : \"%s\"," % (k, v)
        for k, v in repository_ctx.attr.platform_exec_properties.items()
    ]
    serialized_exec_properties = "{" + "".join(entries) + "}"

    substitutions = {
        "%{platform}": platform,
        "%{exec_properties}": serialized_exec_properties,
        "%{cpu}": cpu,
    }
    repository_ctx.template(
        "BUILD",
        Label("//remote_config:BUILD.tpl"),
        substitutions,
    )
# Remove the API folder and the Markdown files that were placed under
# source/, then let Sphinx clean the build directory. $(O) (capital letter O)
# is the shortcut for passing extra options, same as in the other targets.
clean:
	rm -rf source/api source/README.md source/CONTRIBUTING.md
	@$(SPHINXBUILD) -M clean "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
Afterward, open a web browser and access `http://localhost:8000`.
For example--both the `README.md` file and the `CONTRIBUTING.md` file are copied to `docs/source`. Because they are both in the same directory, you could add a link to a heading in the `README.md` file like this:
Refer to the [Style Guide](https://github.com/NVIDIA-Merlin/HierarchicalKV/blob/master/STYLE_GUIDE.md) in the GitHub repository for more details.
echo.If you don't have Sphinx installed, grab it from echo.http://sphinx-doc.org/ exit /b 1 ) %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% goto end :help %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% :end popd ================================================ FILE: docs/requirements-doc.txt ================================================ # packages necessary to run tests and push PRs # assumes requirements for nvtabular logic are already installed wheel # docs Sphinx<3.6 jinja2<3.1 markupsafe==2.0.1 sphinx-multiversion@git+https://github.com/mikemckiernan/sphinx-multiversion.git sphinxcontrib-copydirs@git+https://github.com/mikemckiernan/sphinxcontrib-copydirs.git sphinx-external-toc<0.4 sphinx_rtd_theme natsort<8.2 myst-nb markdown-it-py linkify-it-py # C++ exhale<0.4 ================================================ FILE: docs/source/_static/.gitkeep ================================================ ================================================ FILE: docs/source/_static/css/banner.css ================================================ .wy-nav-content { margin: 0; background: #fcfcfc; padding-top: 40px; } .wy-side-nav-search { display: block; width: 300px; padding: .809em; padding-top: 0.809em; margin-bottom: .809em; z-index: 200; background-color: #2980b9; text-align: center; color: #fcfcfc; padding-top: 40px; } div.banner { position: fixed; top: 10px; left: 20px; margin: 0; z-index: 1000; width: 1050px; text-align: center; } p.banner { border-radius: 4px; color: #004831; background: #76b900; } ================================================ FILE: docs/source/_static/css/custom.css ================================================ dl.cpp > dt > span.pre { padding-right: 2px; } /* dl.cpp > dt > a > span.pre { padding-right: 2px; } */ dl > dt > em > span.pre { padding-right: 0px; padding-left: 2px; } dl > dt > code.sig-name > span.pre { padding-left: 2px; } footer div p { font-size: 80%; } footer div p a { color: 
var(--small-font-color); } footer div p a:hover { color: var(--small-font-color); } ================================================ FILE: docs/source/_templates/footer.html ================================================ {% extends '!footer.html' %} {% block contentinfo %} {{ super() }}

Privacy Policy | Manage My Privacy | Do Not Sell or Share My Data | Terms of Service | Accessibility | Corporate Policies | Product Security | Contact

{% endblock %} ================================================ FILE: docs/source/_templates/versions.html ================================================ {%- if current_version %}
Other Versions v: {{ current_version.name }}
{%- if versions.tags %}
Tags
{%- for item in versions.tags %}
{{ item.name }}
{%- endfor %}
{%- endif %} {%- if versions.branches %}
Branches
{%- for item in versions.branches %}
{{ item.name }}
{%- endfor %}
{%- endif %}
{%- endif %} ================================================ FILE: docs/source/conf.py ================================================ """ Copyright (c) 2021, NVIDIA CORPORATION. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. """ # Configuration file for the Sphinx documentation builder. # # This file only contains a selection of the most common options. For a full # list see the documentation: # https://www.sphinx-doc.org/en/master/usage/configuration.html # -- Path setup -------------------------------------------------------------- # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # import os import subprocess import sys from datetime import datetime from natsort import natsorted sys.path.insert(0, os.path.abspath("../..")) repodir = os.path.abspath(os.path.join(__file__, r"../../..")) gitdir = os.path.join(repodir, r".git") # -- Project information ----------------------------------------------------- year_range = "2022" year_now = str(datetime.now().year) if year_range != year_now: year_range = year_range + chr(8211) + year_now project = 'Merlin Key-Value Storage' copyright = year_range + ", NVIDIA" author = 'NVIDIA' # -- General configuration --------------------------------------------------- # Add any Sphinx extension module names here, as strings. 
They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ "myst_nb", "sphinx_external_toc", "sphinx_rtd_theme", "sphinx.ext.autodoc", "sphinx.ext.autosummary", "sphinx.ext.coverage", "sphinx.ext.githubpages", "sphinx.ext.napoleon", "sphinx.ext.viewcode", "sphinx.ext.intersphinx", "sphinx_multiversion", "sphinxcontrib.copydirs", "breathe", "exhale", ] # MyST configuration settings external_toc_path = "toc.yaml" myst_enable_extensions = [ "deflist", "html_image", "linkify", "replacements", "tasklist", "dollarmath", ] myst_linkify_fuzzy_links = False myst_heading_anchors = 4 nb_execution_mode = "off" # Some documents are RST and include `.. toctree::` directives. suppress_warnings = ["etoc.toctree", "myst.header", "misc.highlighting_failure"] # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. exclude_patterns = [] # -- Options for HTML output ------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # html_theme = "sphinx_rtd_theme" html_theme_options = { "navigation_depth": 2, "analytics_id": "G-NVJ1Y1YJHK", } html_show_sourcelink = False html_show_sphinx = False # Whitelist pattern for tags (set to None to ignore all tags) # Determine if Sphinx is reading conf.py from the checked out # repo (a Git repo) vs SMV reading conf.py from an archive of the repo # at a commit (not a Git repo). if os.path.exists(gitdir): tag_refs = ( subprocess.check_output(["git", "tag", "-l", "v*"]).decode("utf-8").split() ) tag_refs = natsorted(tag_refs)[-6:] smv_tag_whitelist = r"^(" + r"|".join(tag_refs) + r")$" else: # SMV is reading conf.py from a Git archive of the repo at a specific commit. 
smv_tag_whitelist = r"^v.*$" # Only include main branch for now smv_branch_whitelist = "^master$" smv_refs_override_suffix = "-docs" html_sidebars = {"**": ["versions.html"]} html_baseurl = "https://nvidia-merlin.github.io/HierarchicalKV/master" html_static_path = [ '_static' ] html_css_files = [ "css/custom.css", "css/banner.css" ] # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". source_suffix = [".rst", ".md"] breathe_projects = { "HierarchicalKV": "/tmp/doxygen/xml" } breathe_default_project = "HierarchicalKV" exhale_args = { "containmentFolder": "./api", "rootFileName": "index.rst", "doxygenStripFromPath": "../../include", "rootFileTitle": "HierarchicalKV C++ API Documentation", "fullApiSubSectionTitle": "Complete HierarchicalKV API", "createTreeView": False, "exhaleExecutesDoxygen": True, "exhaleDoxygenStdin": """ FILE_PATTERNS = *.h *.cuh RECURSIVE = NO EXTENSION_MAPPING = cuh=C++ HIDE_UNDOC_CLASSES = YES HIDE_FRIEND_COMPOUNDS = YES SORT_MEMBERS_CTORS_1ST = YES SHOW_USED_FILES = NO SHOW_FILES = NO SHOW_NAMESPACES = NO INPUT = ../../include INPUT_ENCODING = UTF-8 """, } copydirs_additional_dirs = [ "../../CONTRIBUTING.md", "../../README.md", ] copydirs_file_rename = { "README.md": "index.md", } ================================================ FILE: docs/source/index.rst ================================================ Merlin Key-Value Storage ======================== Merlin Key-Value Storage is an open source library that provides hierarchical key-value storage using on-GPU high-bandwidth memory (HBM) and host RAM. For more information, see the `Introduction `_. Related Resources ----------------- Merlin Key-Value Storage GitHub Repository ``_ About Merlin Merlin is the overarching project that brings together the Merlin projects. 
See the `documentation `_ or the `repository `_ on GitHub. Developer website for Merlin More information about Merlin is available at our developer website: ``_. ================================================ FILE: docs/source/toc.yaml ================================================ root: index subtrees: - caption: Contents entries: - file: README.md title: Introduction - file: api/index.rst title: API Documentation - file: CONTRIBUTING.md title: Contributing to HierarchicalKV # The multi-modal data example uses several notebooks to demonstrate how to use of multi-modal data (text and images) # to provide movie recommendations based on the MovieLens 25M dataset. # .. toctree:: # :maxdepth: 1 ================================================ FILE: include/BUILD ================================================ load("@local_config_cuda//cuda:build_defs.bzl", "cuda_cc_library") cuda_cc_library( name = "merlin_localfile", hdrs = [ "merlin_localfile.hpp", ], visibility = [ "//visibility:public", ], deps = [ "//include/merlin", "@local_config_cuda//cuda", ], ) cuda_cc_library( name = "merlin_hashtable", hdrs = [ "merlin_hashtable.cuh", ], visibility = [ "//visibility:public", ], deps = [ "//include/merlin", "@local_config_cuda//cuda", ], ) ================================================ FILE: include/merlin/BUILD ================================================ load("@local_config_cuda//cuda:build_defs.bzl", "cuda_cc_library") cuda_cc_library( name = "types_and_utils", srcs = [ ], hdrs = [ "types.cuh", "utils.cuh", ], visibility = [ "//visibility:public", ], deps = [ "@local_config_cuda//cuda", ], ) cuda_cc_library( name = "merlin", srcs = [ ], hdrs = [ "allocator.cuh", "array_kernels.cuh", "core_kernels.cuh", "debug.hpp", "flexible_buffer.cuh", "group_lock.cuh", "memory_pool.cuh", "optimizers.cuh", ], visibility = [ "//visibility:public", ], deps = [ "//include/merlin:types_and_utils", "//include/merlin/core_kernels", "@local_config_cuda//cuda", ], ) 
================================================ FILE: include/merlin/allocator.cuh ================================================ /* * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #pragma once #include #include #include "debug.hpp" #include "utils.cuh" namespace nv { namespace merlin { enum MemoryType { Device, // HBM Pinned, // Pinned Host Memory Host, // Host Memory Managed, // Pageable Host Memory(Not required) }; /* This abstract class defines the allocator APIs required by HKV. Any of the customized allocators should inherit from it. 
*/ class BaseAllocator { public: BaseAllocator(const BaseAllocator&) = delete; BaseAllocator(BaseAllocator&&) = delete; BaseAllocator& operator=(const BaseAllocator&) = delete; BaseAllocator& operator=(BaseAllocator&&) = delete; BaseAllocator() = default; virtual ~BaseAllocator() = default; virtual void alloc(const MemoryType type, void** ptr, size_t size, unsigned int pinned_flags = cudaHostAllocDefault) = 0; virtual void alloc_async(const MemoryType type, void** ptr, size_t size, cudaStream_t stream) = 0; virtual void free(const MemoryType type, void* ptr) = 0; virtual void free_async(const MemoryType type, void* ptr, cudaStream_t stream) = 0; }; class DefaultAllocator : public virtual BaseAllocator { public: DefaultAllocator() {}; ~DefaultAllocator() override {}; void alloc(const MemoryType type, void** ptr, size_t size, unsigned int pinned_flags = cudaHostAllocDefault) override { switch (type) { case MemoryType::Device: CUDA_CHECK(cudaMalloc(ptr, size)); break; case MemoryType::Pinned: CUDA_CHECK(cudaMallocHost(ptr, size, pinned_flags)); break; case MemoryType::Host: *ptr = std::malloc(size); break; } return; } void alloc_async(const MemoryType type, void** ptr, size_t size, cudaStream_t stream) override { if (type == MemoryType::Device) { CUDA_CHECK(cudaMallocAsync(ptr, size, stream)); } else { MERLIN_CHECK(false, "[DefaultAllocator] alloc_async is only support for " "MemoryType::Device!"); } return; } void free(const MemoryType type, void* ptr) override { if (ptr == nullptr) { return; } switch (type) { case MemoryType::Pinned: CUDA_CHECK(cudaFreeHost(ptr)); break; case MemoryType::Device: CUDA_CHECK(cudaFree(ptr)); break; case MemoryType::Host: std::free(ptr); break; } return; } void free_async(const MemoryType type, void* ptr, cudaStream_t stream) override { if (ptr == nullptr) { return; } if (type == MemoryType::Device) { CUDA_CHECK(cudaFreeAsync(ptr, stream)); } else { MERLIN_CHECK(false, "[DefaultAllocator] free_async is only support for " 
"MemoryType::Device!"); } } }; template struct ThrustAllocator : thrust::device_malloc_allocator { public: typedef thrust::device_malloc_allocator super_t; typedef typename super_t::pointer pointer; typedef typename super_t::size_type size_type; public: pointer allocate(size_type n) { void* ptr = nullptr; MERLIN_CHECK( allocator_ != nullptr, "[ThrustAllocator] set_allocator should be called in advance!"); allocator_->alloc(MemoryType::Device, &ptr, sizeof(T) * n); return pointer(reinterpret_cast(ptr)); } void deallocate(pointer p, size_type n) { MERLIN_CHECK( allocator_ != nullptr, "[ThrustAllocator] set_allocator should be called in advance!"); allocator_->free(MemoryType::Device, reinterpret_cast(p.get())); } void set_allocator(BaseAllocator* allocator) { allocator_ = allocator; } public: BaseAllocator* allocator_ = nullptr; }; } // namespace merlin } // namespace nv ================================================ FILE: include/merlin/array_kernels.cuh ================================================ /* * Copyright (c) 2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http:///www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ #pragma once #include #include "cuda_runtime.h" #include "thrust/device_vector.h" #include "thrust/execution_policy.h" #include "thrust/scan.h" #include "types.cuh" #include "utils.cuh" namespace nv { namespace merlin { template __global__ void keys_not_empty(const K* keys, bool* masks, size_t n) { int tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid < n) { masks[tid] = keys[tid] != EMPTY_KEY; } } template __global__ void gpu_cell_count(const bool* masks, Tidx* offsets, size_t n, size_t* n_existed) { int tid = blockIdx.x * blockDim.x + threadIdx.x; auto g = cg::tiled_partition(cg::this_thread_block()); int rank = g.thread_rank(); bool is_existed = false; if (tid < n) { if (masks[tid]) { is_existed = true; } } unsigned int vote = g.ballot(is_existed); int g_ones = __popc((int)vote); if (rank == 0 && tid < n) { offsets[tid / TILE_SIZE] = static_cast(g_ones); atomicAdd(static_cast(n_existed), static_cast(g_ones)); } } template __global__ void gpu_select_kvm_kernel(const bool* masks, size_t n, const Tidx* offsets, K* __restrict keys, V* __restrict values, S* __restrict scores, const size_t dim) { int tid = blockIdx.x * blockDim.x + threadIdx.x; auto g = cg::tiled_partition(cg::this_thread_block()); int rank = g.thread_rank(); bool is_existed = false; if (tid < n) { if (masks[tid]) { is_existed = true; } } unsigned int vote = g.ballot(is_existed); unsigned int r_vote = __brev(vote) >> (32 - TILE_SIZE); K empty_key = (K)EMPTY_KEY; if (tid < n) { r_vote = r_vote >> (TILE_SIZE - rank - 1); if (masks[tid]) { int prefix_n = __popc(r_vote) - 1; Tidx bias = offsets[tid / TILE_SIZE] + static_cast(prefix_n); if (bias == tid) return; K target_key = 0; AtomicKey* atomic_key = reinterpret_cast*>(keys) + bias; while (target_key != empty_key) { target_key = empty_key; atomic_key->compare_exchange_weak(target_key, keys[tid], cuda::std::memory_order_relaxed, cuda::std::memory_order_relaxed); } if (scores) scores[bias] = scores[tid]; for (size_t j = 0; j < dim; j++) { values[dim 
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
/* Write the values of delta_or_val into the table.
   A vector is written only when its destination pointer in @dst is not null
   and @existed and @status agree for its key. In that case, if the key is in
   the table (the flag is true) @delta_or_val is added to the existing value;
   if the key does not exist, the value @delta_or_val is assigned to the
   address @dst[i].

   `delta_or_val`: The source vectors. Each one is treated as a delta and
                   accumulated when the key exists, or treated as a value and
                   assigned when it does not.
   `dst`: A pointer of pointer to V which should be on HBM,
          but each value (a pointer of V) could point to a
          memory on HBM or HMEM.
   `existed`: If the keys existed before this kernel is executed.
   `status`: The existence status for each key when the kernel is being
             executed.
   `src_offset`: For each destination vector, the index of its row in
                 `delta_or_val`, `existed` and `status`.
   `N`: The number of elements to be written. The kernel walks a flat index
        t in [0, N) and splits it into the vector index t / dim and the
        element index t % dim, so N is the number of vectors times `dim`.
*/
*/ template __global__ void accum_or_assign_kernel_with_io( const Table* __restrict table, const size_t bucket_max_size, const size_t buckets_num, const size_t dim, const K* __restrict keys, const V* __restrict value_or_deltas, const S* __restrict scores, const bool* __restrict accum_or_assigns, const S global_epoch, const size_t N) { auto g = cg::tiled_partition(cg::this_thread_block()); int* buckets_size = table->buckets_size; using ScoreFunctor = ScoreFunctor; for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; t += blockDim.x * gridDim.x) { int key_pos = -1; size_t key_idx = t / TILE_SIZE; const K insert_key = keys[key_idx]; if (IS_RESERVED_KEY(insert_key)) continue; const S insert_score = ScoreFunctor::desired_when_missed(scores, key_idx, global_epoch); const V* insert_value = value_or_deltas + key_idx * dim; const bool is_accum = accum_or_assigns[key_idx]; size_t bkt_idx = 0; size_t start_idx = 0; int src_lane = -1; K evicted_key; Bucket* bucket = get_key_position(table->buckets, insert_key, bkt_idx, start_idx, buckets_num, bucket_max_size); OccupyResult occupy_result{OccupyResult::INITIAL}; const int bucket_size = buckets_size[bkt_idx]; do { if (bucket_size < bucket_max_size) { occupy_result = find_and_lock_when_vacant( g, bucket, insert_key, insert_score, evicted_key, start_idx, key_pos, src_lane, bucket_max_size); } else { start_idx = (start_idx / TILE_SIZE) * TILE_SIZE; occupy_result = find_and_lock_when_full( g, bucket, insert_key, insert_score, evicted_key, start_idx, key_pos, src_lane, bucket_max_size); } occupy_result = g.shfl(occupy_result, src_lane); } while (occupy_result == OccupyResult::CONTINUE); if (occupy_result == OccupyResult::REFUSED) continue; if ((is_accum && occupy_result != OccupyResult::DUPLICATE) || (!is_accum && occupy_result == OccupyResult::DUPLICATE)) { if (g.thread_rank() == src_lane) { if (occupy_result == OccupyResult::OCCUPIED_EMPTY) { evicted_key = static_cast(EMPTY_KEY); } if (occupy_result == 
OccupyResult::OCCUPIED_RECLAIMED) {
          evicted_key = static_cast(RECLAIM_KEY);
        }
        if (occupy_result == OccupyResult::DUPLICATE) {
          evicted_key = insert_key;
        }
        (bucket->keys(key_pos))
            ->store(evicted_key, ScoreFunctor::UNLOCK_MEM_ORDER);
      }
      g.sync();
      continue;
    }
    if ((occupy_result == OccupyResult::OCCUPIED_EMPTY ||
         occupy_result == OccupyResult::OCCUPIED_RECLAIMED) &&
        g.thread_rank() == src_lane) {
      atomicAdd(&(buckets_size[bkt_idx]), 1);
    }

    accum_or_assign_vector(
        g, insert_value, bucket->vectors + key_pos * dim, is_accum, dim);

    if (g.thread_rank() == src_lane) {
      ScoreFunctor::update(bucket, key_pos, scores, key_idx, insert_score,
                           (occupy_result != OccupyResult::DUPLICATE));
      bucket->digests(key_pos)[0] = get_digest(insert_key);
      (bucket->keys(key_pos))
          ->store(insert_key, ScoreFunctor::UNLOCK_MEM_ORDER);
    }
  }
}

/**
 * Host-side dispatcher for accum_or_assign_kernel_with_io.
 *
 * Picks the tile size from the load factor (4 when load_factor <= 0.75,
 * otherwise 32), scales the number of work items to N = n * tile_size,
 * derives the grid size with SAFE_GET_GRID_SIZE and launches the kernel with
 * the remaining arguments forwarded unchanged.
 *
 * NOTE(review): in this copy of the file the template parameter lists and the
 * contents of the kernel launch configuration appear to have been stripped
 * (`template struct`, `<<>>`). `grid_size`, `block_size` and `stream` are
 * computed/passed in but never referenced in the visible text, which is
 * consistent with that; restore them from the upstream header before
 * compiling.
 */
template struct SelectAccumOrAssignKernelWithIO {
  static void execute_kernel(
      const float& load_factor, const int& block_size,
      const size_t bucket_max_size, const size_t buckets_num, const size_t dim,
      cudaStream_t& stream, const size_t& n, const Table* __restrict table,
      const K* __restrict keys, const V* __restrict value_or_deltas,
      const S* __restrict scores, const bool* __restrict accum_or_assigns,
      const S global_epoch) {
    if (load_factor <= 0.75) {
      // Lightly loaded table: small tile per key.
      const unsigned int tile_size = 4;
      const size_t N = n * tile_size;
      const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);
      accum_or_assign_kernel_with_io
          <<>>(
              table, bucket_max_size, buckets_num, dim, keys, value_or_deltas,
              scores, accum_or_assigns, global_epoch, N);
    } else {
      // Heavily loaded table: a 32-thread tile per key.
      const unsigned int tile_size = 32;
      const size_t N = n * tile_size;
      const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);
      accum_or_assign_kernel_with_io
          <<>>(
              table, bucket_max_size, buckets_num, dim, keys, value_or_deltas,
              scores, accum_or_assigns, global_epoch, N);
    }
    return;
  }
};

/**
 * Pointer-returning variant of accum-or-assign: instead of writing values it
 * publishes, per key, the address of the value row inside the bucket.
 *
 * Work item `t` maps to key `t / TILE_SIZE`, so the threads of one cooperative
 * group tile share a key. For every non-reserved key the tile locates (and
 * locks) a slot via find_and_lock_when_vacant / find_and_lock_when_full and
 * then either rolls the slot back or commits it:
 *
 *  - Roll back when `is_accum` is set but the key was not already present, or
 *    when `is_accum` is clear but the key was already present. The slot's
 *    previous key (EMPTY_KEY, RECLAIM_KEY, the evicted key, or the key itself)
 *    is stored back and nothing is written for this key.
 *  - Commit otherwise: the value-row pointer is written to
 *    value_or_deltas[key_idx], founds[key_idx] is set to `is_accum`, the
 *    digest and score are updated and the key is stored last with
 *    ScoreFunctor::UNLOCK_MEM_ORDER.
 *
 * @param table             Table descriptor; supplies `buckets` and
 *                          `buckets_size`.
 * @param bucket_max_size   Slot capacity of one bucket.
 * @param buckets_num       Number of buckets.
 * @param dim               Elements per value row (used as row stride).
 * @param keys              Input keys, indexed by key_idx.
 * @param value_or_deltas   Output: per-key pointer to the value row.
 * @param scores            Per-key scores forwarded to ScoreFunctor.
 * @param accum_or_assigns  Per-key flag: true = accumulate, false = assign.
 * @param src_offset        Output: src_offset[key_idx] = key_idx.
 * @param founds            Output: set to `is_accum` for committed keys only.
 * @param global_epoch      Forwarded to ScoreFunctor::desired_when_missed.
 * @param N                 Total work items (keys * TILE_SIZE).
 *
 * NOTE(review): template parameter lists were stripped from this copy
 * (`template __global__`, `cg::tiled_partition(...)`,
 * `using ScoreFunctor = ScoreFunctor;`); TILE_SIZE is presumably one of the
 * missing template parameters.
 */
template __global__ void accum_or_assign_kernel(
    const Table* __restrict table, const size_t bucket_max_size,
    const size_t buckets_num, const size_t dim, const K* __restrict keys,
    V** __restrict value_or_deltas, const S* __restrict scores,
    const bool* __restrict accum_or_assigns, int* __restrict src_offset,
    bool* __restrict founds, const S global_epoch, size_t N) {
  auto g = cg::tiled_partition(cg::this_thread_block());
  int* buckets_size = table->buckets_size;

  using ScoreFunctor = ScoreFunctor;

  for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N;
       t += blockDim.x * gridDim.x) {
    int key_pos = -1;
    size_t key_idx = t / TILE_SIZE;

    const K insert_key = keys[key_idx];

    // Reserved keys are never inserted; nothing is written for them.
    if (IS_RESERVED_KEY(insert_key)) continue;

    const S insert_score =
        ScoreFunctor::desired_when_missed(scores, key_idx, global_epoch);
    const bool is_accum = accum_or_assigns[key_idx];

    size_t bkt_idx = 0;
    size_t start_idx = 0;
    int src_lane = -1;
    K evicted_key;

    Bucket* bucket =
        get_key_position(table->buckets, insert_key, bkt_idx, start_idx,
                         buckets_num, bucket_max_size);

    // One lane per tile records the identity offset for this key.
    if (g.thread_rank() == 0) {
      *(src_offset + key_idx) = key_idx;
    }

    OccupyResult occupy_result{OccupyResult::INITIAL};
    const int bucket_size = buckets_size[bkt_idx];
    // Retry until the helper reports something other than CONTINUE. The
    // bucket size is sampled once, before the loop.
    do {
      if (bucket_size < bucket_max_size) {
        occupy_result = find_and_lock_when_vacant(
            g, bucket, insert_key, insert_score, evicted_key, start_idx,
            key_pos, src_lane, bucket_max_size);
      } else {
        // Full bucket: align the probe start to a tile boundary.
        start_idx = (start_idx / TILE_SIZE) * TILE_SIZE;
        occupy_result = find_and_lock_when_full(
            g, bucket, insert_key, insert_score, evicted_key, start_idx,
            key_pos, src_lane, bucket_max_size);
      }

      // Broadcast the owning lane's result to the whole tile.
      occupy_result = g.shfl(occupy_result, src_lane);
    } while (occupy_result == OccupyResult::CONTINUE);

    if (occupy_result == OccupyResult::REFUSED) continue;

    // Requested operation does not match key presence: undo the lock by
    // storing the slot's previous key back, then skip this key.
    if ((is_accum && occupy_result != OccupyResult::DUPLICATE) ||
        (!is_accum && occupy_result == OccupyResult::DUPLICATE)) {
      if (g.thread_rank() == src_lane) {
        if (occupy_result == OccupyResult::OCCUPIED_EMPTY) {
          evicted_key = static_cast(EMPTY_KEY);
        }
        if (occupy_result == OccupyResult::OCCUPIED_RECLAIMED) {
          evicted_key = static_cast(RECLAIM_KEY);
        }
        if (occupy_result == OccupyResult::DUPLICATE) {
          evicted_key = insert_key;
        }
        (bucket->keys(key_pos))
            ->store(evicted_key, ScoreFunctor::UNLOCK_MEM_ORDER);
      }
      g.sync();
      continue;
    }

    // A previously empty or reclaimed slot is now in use: grow the bucket.
    if ((occupy_result == OccupyResult::OCCUPIED_EMPTY ||
         occupy_result == OccupyResult::OCCUPIED_RECLAIMED) &&
        g.thread_rank() == src_lane) {
      atomicAdd(&(buckets_size[bkt_idx]), 1);
    }

    if (g.thread_rank() == src_lane) {
      *(value_or_deltas + key_idx) = (bucket->vectors + key_pos * dim);
      *(founds + key_idx) = is_accum;
      bucket->digests(key_pos)[0] = get_digest(insert_key);
      ScoreFunctor::update(bucket, key_pos, scores, key_idx, insert_score,
                           (occupy_result != OccupyResult::DUPLICATE));
      // Storing the key last releases the slot.
      (bucket->keys(key_pos))
          ->store(insert_key, ScoreFunctor::UNLOCK_MEM_ORDER);
    }
  }
}

}  // namespace merlin
}  // namespace nv

================================================
FILE: include/merlin/core_kernels/contains.cuh
================================================
/*
 * Copyright (c) 2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/ #pragma once #include "kernel_utils.cuh" namespace nv { namespace merlin { template struct ContainsKernelParams { ContainsKernelParams(Bucket* __restrict buckets_, size_t buckets_num_, uint32_t dim_, const K* __restrict keys_, bool* __restrict founds_, size_t n_) : buckets(buckets_), buckets_num(buckets_num_), dim(dim_), keys(keys_), founds(founds_), n(n_) {} Bucket* __restrict buckets; size_t buckets_num; uint32_t dim; const K* __restrict keys; bool* __restrict founds; size_t n; }; // Using 32 threads to deal with one key template __global__ void contains_kernel_pipeline(Bucket* buckets, const size_t buckets_num, const int dim, const K* __restrict keys, bool* __restrict founds, size_t n) { constexpr int GROUP_SIZE = 32; constexpr int RESERVE = 16; constexpr int BLOCK_SIZE = 128; constexpr int BUCKET_SIZE = 128; constexpr int GROUP_NUM = BLOCK_SIZE / GROUP_SIZE; constexpr int DIGEST_SPAN = BUCKET_SIZE / 4; __shared__ int sm_target_digests[BLOCK_SIZE]; __shared__ K sm_target_keys[BLOCK_SIZE]; __shared__ K* sm_keys_ptr[BLOCK_SIZE]; // Reuse int* sm_counts = sm_target_digests; // Double buffer __shared__ uint32_t sm_probing_digests[2][GROUP_NUM * DIGEST_SPAN]; __shared__ K sm_possible_keys[2][GROUP_NUM * RESERVE]; // Initialization auto g = cg::tiled_partition(cg::this_thread_block()); int groupID = threadIdx.x / GROUP_SIZE; int rank = g.thread_rank(); int key_idx_base = (blockIdx.x * blockDim.x) + groupID * GROUP_SIZE; if (key_idx_base >= n) return; int loop_num = (n - key_idx_base) < GROUP_SIZE ? 
(n - key_idx_base) : GROUP_SIZE; if (rank < loop_num) { int idx_block = groupID * GROUP_SIZE + rank; K target_key = keys[key_idx_base + rank]; sm_target_keys[idx_block] = target_key; const K hashed_key = Murmur3HashDevice(target_key); const uint8_t target_digest = static_cast(hashed_key >> 32); sm_target_digests[idx_block] = static_cast(target_digest); int global_idx = hashed_key % (buckets_num * BUCKET_SIZE); int bkt_idx = global_idx / BUCKET_SIZE; Bucket* bucket = buckets + bkt_idx; __pipeline_memcpy_async(sm_keys_ptr + idx_block, bucket->keys_addr(), sizeof(K*)); __pipeline_commit(); } __pipeline_wait_prior(0); // Pipeline loading uint8_t* digests_ptr = reinterpret_cast(sm_keys_ptr[groupID * GROUP_SIZE]) - BUCKET_SIZE; __pipeline_memcpy_async(sm_probing_digests[0] + groupID * DIGEST_SPAN + rank, digests_ptr + rank * 4, sizeof(uint32_t)); __pipeline_commit(); __pipeline_commit(); for (int i = 0; i < loop_num; i++) { int key_idx_block = groupID * GROUP_SIZE + i; /* Step1: prefetch all digests in one bucket */ if ((i + 1) < loop_num) { uint8_t* digests_ptr = reinterpret_cast(sm_keys_ptr[key_idx_block + 1]) - BUCKET_SIZE; __pipeline_memcpy_async( sm_probing_digests[diff_buf(i)] + groupID * DIGEST_SPAN + rank, digests_ptr + rank * 4, sizeof(uint32_t)); } __pipeline_commit(); /* Step2: check digests and load possible keys */ uint32_t target_digest = sm_target_digests[key_idx_block]; uint32_t target_digests = __byte_perm(target_digest, target_digest, 0x0000); sm_counts[key_idx_block] = 0; __pipeline_wait_prior(2); uint32_t probing_digests = sm_probing_digests[same_buf(i)][groupID * DIGEST_SPAN + rank]; uint32_t find_result_ = __vcmpeq4(probing_digests, target_digests); uint32_t find_result = 0; if ((find_result_ & 0x01) != 0) find_result |= 0x01; if ((find_result_ & 0x0100) != 0) find_result |= 0x02; if ((find_result_ & 0x010000) != 0) find_result |= 0x04; if ((find_result_ & 0x01000000) != 0) find_result |= 0x08; int find_number = __popc(find_result); int group_base = 
0; if (find_number > 0) { group_base = atomicAdd(sm_counts + key_idx_block, find_number); } bool gt_reserve = (group_base + find_number) > RESERVE; int gt_vote = g.ballot(gt_reserve); K* key_ptr = sm_keys_ptr[key_idx_block]; if (gt_vote == 0) { do { int digest_idx = __ffs(find_result) - 1; if (digest_idx >= 0) { find_result &= (find_result - 1); int key_pos = rank * 4 + digest_idx; __pipeline_memcpy_async( sm_possible_keys[same_buf(i)] + (groupID * RESERVE + group_base), key_ptr + key_pos, sizeof(K)); group_base += 1; } else { break; } } while (true); } else { K target_key = sm_target_keys[key_idx_block]; sm_counts[key_idx_block] = 0; int found_vote = 0; bool found = false; do { int digest_idx = __ffs(find_result) - 1; if (digest_idx >= 0) { find_result &= (find_result - 1); int key_pos = rank * 4 + digest_idx; K possible_key = key_ptr[key_pos]; if (possible_key == target_key) { found = true; sm_counts[key_idx_block] = 1; sm_possible_keys[same_buf(i)][groupID * RESERVE] = possible_key; } } found_vote = g.ballot(found); if (found_vote) { break; } found_vote = digest_idx >= 0; } while (g.any(found_vote)); } __pipeline_commit(); /* Step3: check possible keys, and prefecth the value and score */ if (i > 0) { key_idx_block -= 1; int key_idx_grid = blockIdx.x * blockDim.x + key_idx_block; K target_key = sm_target_keys[key_idx_block]; int possible_num = sm_counts[key_idx_block]; __pipeline_wait_prior(2); bool found_flag = false; if (rank < possible_num) { K possible_key = sm_possible_keys[diff_buf(i)][groupID * RESERVE + rank]; if (possible_key == target_key) { found_flag = true; } } int found_vote = g.ballot(found_flag); founds[key_idx_grid] = (found_vote > 0); } } // End loop /* Pipeline emptying: step3, i = loop_num */ { int key_idx_block = groupID * GROUP_SIZE + (loop_num - 1); int key_idx_grid = blockIdx.x * blockDim.x + key_idx_block; K target_key = sm_target_keys[key_idx_block]; int possible_num = sm_counts[key_idx_block]; __pipeline_wait_prior(0); bool found_flag 
= false; if (rank < possible_num) { K possible_key = sm_possible_keys[diff_buf(loop_num)][groupID * RESERVE + rank]; if (target_key == possible_key) { found_flag = true; } } int found_vote = g.ballot(found_flag); founds[key_idx_grid] = (found_vote > 0); } } // End function template struct LaunchPipelineContains { static void launch_kernel(ContainsKernelParams& params, cudaStream_t& stream) { constexpr int BLOCK_SIZE = 128; // Using 32 threads to deal with one key contains_kernel_pipeline <<<(params.n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( params.buckets, params.buckets_num, params.dim, params.keys, params.founds, params.n); } }; template struct SelectPipelineContainsKernel { static void select_kernel(ContainsKernelParams& params, cudaStream_t& stream) { LaunchPipelineContains::launch_kernel(params, stream); } }; template __global__ void contains_kernel(const Table* __restrict table, Bucket* buckets, const size_t bucket_max_size, const size_t buckets_num, const size_t dim, const K* __restrict keys, bool* __restrict found, size_t N) { int* buckets_size = table->buckets_size; auto g = cg::tiled_partition(cg::this_thread_block()); int rank = g.thread_rank(); for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; t += blockDim.x * gridDim.x) { int key_idx = t / TILE_SIZE; const K find_key = keys[key_idx]; if (IS_RESERVED_KEY(find_key)) continue; int key_pos = -1; int src_lane = -1; size_t bkt_idx = 0; size_t start_idx = 0; Bucket* bucket = get_key_position( buckets, find_key, bkt_idx, start_idx, buckets_num, bucket_max_size); const int bucket_size = buckets_size[bkt_idx]; if (bucket_size >= bucket_max_size) { start_idx = (start_idx / TILE_SIZE) * TILE_SIZE; } OccupyResult occupy_result{OccupyResult::INITIAL}; occupy_result = find_without_lock( g, bucket, find_key, start_idx, key_pos, src_lane, bucket_max_size); if (rank == src_lane) { *(found + key_idx) = (occupy_result == OccupyResult::DUPLICATE); } } } template struct SelectContainsKernel { 
static void execute_kernel(const float& load_factor, const int& block_size, const size_t bucket_max_size, const size_t buckets_num, const size_t dim, cudaStream_t& stream, const size_t& n, const Table* __restrict table, Bucket* buckets, const K* __restrict keys, bool* __restrict found) { if (load_factor <= 0.75) { const unsigned int tile_size = 4; const size_t N = n * tile_size; const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); contains_kernel<<>>( table, buckets, bucket_max_size, buckets_num, dim, keys, found, N); } else { const unsigned int tile_size = 16; const size_t N = n * tile_size; const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); contains_kernel<<>>( table, buckets, bucket_max_size, buckets_num, dim, keys, found, N); } return; } }; } // namespace merlin } // namespace nv ================================================ FILE: include/merlin/core_kernels/dual_bucket_lookup.cuh ================================================ /* * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #pragma once #include "dual_bucket_utils.cuh" #include "kernel_utils.cuh" namespace nv { namespace merlin { /** * Dual-bucket pipeline lookup kernel (sequential two-bucket search). * * For each key, computes (b1, b2) via high/low 32-bit split of Murmur3 hash. * First probes b1; if not found, probes b2. * Uses dual_bucket_digest (bit[56:63]) to avoid digest collision with b2 * addressing. 
*
 * Architecture: Based on lookup_kernel_with_io_pipeline_v1 with 32 threads
 * per key, 128-thread blocks, 128-slot buckets. 4-stage IO pipeline
 * (prefetch digests -> digest match + key load -> key verify + value prefetch
 * -> value writeback).
 *
 * Each 32-thread group owns up to GROUP_SIZE consecutive keys and runs the
 * pipeline over them twice: Pass 1 against every key's first bucket, Pass 2
 * against the second bucket for the keys Pass 1 did not find. A group returns
 * right after Pass 1 when all of its keys were found.
 *
 * @param buckets        Bucket array, indexed by the two bucket indices
 *                       returned by get_dual_bucket_indices.
 * @param buckets_size   Not referenced in the kernel body.
 * @param buckets_num    Number of buckets.
 * @param dim            Row stride of `values` and of the bucket's value
 *                       storage, in VecV elements.
 * @param keys           Input keys.
 * @param values         Output rows; row i is written only when key i is
 *                       found.
 * @param scores         Output scores, written through CopyScore on a hit.
 * @param found_functor  Called as found_functor(key index, key, found): with
 *                       `true` for Pass 1 hits, and with the Pass 2 outcome
 *                       for every key Pass 1 missed.
 * @param n              Number of keys.
 *
 * NOTE(review): the template parameter list and all cast target types were
 * stripped from this copy of the file (`template , typename CopyValue...`,
 * `static_cast(...)`, `reinterpret_cast(...)`, `using BUCKET = Bucket;`);
 * they are kept as found.
 * NOTE(review): `key_idx_grid` and `dim` are both `int`, so
 * `values + key_idx_grid * dim` is computed in 32-bit arithmetic and looks
 * like it could overflow for very large batches - confirm the supported
 * batch size or widen the index.
 */
template , typename CopyValue = CopyValueTwoGroup, typename FoundFunctor = FoundFunctorV1, int VALUE_BUF = 56>
__global__ void dual_bucket_pipeline_lookup_kernel_with_io(
    Bucket* buckets, const int32_t* __restrict__ buckets_size,
    const size_t buckets_num, const int dim, const K* __restrict keys,
    VecV* __restrict values, S* __restrict scores, FoundFunctor found_functor,
    size_t n) {
  constexpr int GROUP_SIZE = 32;
  constexpr int RESERVE = 16;
  constexpr int BLOCK_SIZE = 128;
  constexpr int BUCKET_SIZE = 128;
  constexpr int GROUP_NUM = BLOCK_SIZE / GROUP_SIZE;
  constexpr int DIGEST_SPAN = BUCKET_SIZE / 4;

  using BUCKET = Bucket;

  // Shared memory declarations.
  __shared__ int sm_target_digests[BLOCK_SIZE];
  __shared__ K sm_target_keys[BLOCK_SIZE];
  __shared__ K* sm_keys_ptr1[BLOCK_SIZE];       // b1 bucket keys ptr
  __shared__ K* sm_keys_ptr2[BLOCK_SIZE];       // b2 bucket keys ptr
  __shared__ VecV* sm_values_ptr1[BLOCK_SIZE];  // b1 values ptr
  __shared__ VecV* sm_values_ptr2[BLOCK_SIZE];  // b2 values ptr
  __shared__ S sm_target_scores[BLOCK_SIZE];
  // Reuse sm_target_digests
  // (digest, candidate count and found flag of a key share one int slot; the
  // slot changes meaning as the key advances through the pipeline stages).
  int* sm_counts = sm_target_digests;
  int* sm_founds = sm_counts;
  // Double buffer
  __shared__ uint32_t sm_probing_digests[2][GROUP_NUM * DIGEST_SPAN];
  __shared__ K sm_possible_keys[2][GROUP_NUM * RESERVE];
  __shared__ int sm_possible_pos[2][GROUP_NUM * RESERVE];
  __shared__ VecV sm_vector[2][GROUP_NUM][VALUE_BUF];

  // Initialization.
  auto g = cg::tiled_partition(cg::this_thread_block());
  int groupID = threadIdx.x / GROUP_SIZE;
  int rank = g.thread_rank();
  int key_idx_base = (blockIdx.x * blockDim.x) + groupID * GROUP_SIZE;
  if (key_idx_base >= n) return;
  // Number of keys this group actually owns (the last group may be partial).
  int loop_num =
      (n - key_idx_base) < GROUP_SIZE ? (n - key_idx_base) : GROUP_SIZE;

  // Phase 1: Initialize per-key data (hash, digest, bucket pointers).
  // Save digest in register to avoid recomputing Murmur3 hash in Pass 2
  // (sm_target_digests is aliased with sm_counts/sm_founds and gets
  // corrupted during Pass 1).
  uint32_t reg_target_digest = 0;
  if (rank < loop_num) {
    int idx_block = groupID * GROUP_SIZE + rank;
    K target_key = keys[key_idx_base + rank];
    sm_target_keys[idx_block] = target_key;
    const K hashed_key = Murmur3HashDevice(target_key);
    // Dual-bucket digest: bit[56:63]
    const uint8_t target_digest =
        static_cast(static_cast(hashed_key) >> 56);
    reg_target_digest = static_cast(target_digest);
    sm_target_digests[idx_block] = reg_target_digest;

    // Dual-bucket positions (centralized in dual_bucket_utils.cuh).
    size_t bkt_idx1, bkt_idx2;
    get_dual_bucket_indices(hashed_key, buckets_num, bkt_idx1, bkt_idx2);

    BUCKET* bucket1 = buckets + bkt_idx1;
    BUCKET* bucket2 = buckets + bkt_idx2;
    sm_keys_ptr1[idx_block] = reinterpret_cast(bucket1->keys(0));
    sm_keys_ptr2[idx_block] = reinterpret_cast(bucket2->keys(0));
    __pipeline_memcpy_async(sm_values_ptr1 + idx_block, &(bucket1->vectors),
                            sizeof(VecV*));
    __pipeline_commit();
    // NOTE(review): this second copy is not committed here; it becomes part
    // of the next batch committed below. sm_values_ptr2 is first read in
    // Pass 2, after later waits - confirm that ordering is intended.
    __pipeline_memcpy_async(sm_values_ptr2 + idx_block, &(bucket2->vectors),
                            sizeof(VecV*));
  }
  __pipeline_wait_prior(0);

  // The same pipeline is run once per bucket (the code is written out twice
  // rather than shared through a helper):
  // We process keys sequentially through the pipeline for one bucket,
  // then process missed keys through the second bucket.

  // --- PASS 1: Search bucket b1 ---

  // Pipeline loading for b1.
  {
    // The digest bytes sit BUCKET_SIZE bytes in front of the key array.
    uint8_t* digests_ptr =
        reinterpret_cast(sm_keys_ptr1[groupID * GROUP_SIZE]) - BUCKET_SIZE;
    __pipeline_memcpy_async(
        sm_probing_digests[0] + groupID * DIGEST_SPAN + rank,
        digests_ptr + rank * 4, sizeof(uint32_t));
  }
  __pipeline_commit();
  // Two extra (empty) commits so that the wait depth of 3 used inside the
  // loop is valid from the first iteration on.
  __pipeline_commit();
  __pipeline_commit();

  for (int i = 0; i < loop_num; i++) {
    int key_idx_block = groupID * GROUP_SIZE + i;

    // Step1: prefetch digests for next key's b1 bucket.
    if ((i + 1) < loop_num) {
      uint8_t* digests_ptr =
          reinterpret_cast(sm_keys_ptr1[key_idx_block + 1]) - BUCKET_SIZE;
      __pipeline_memcpy_async(
          sm_probing_digests[diff_buf(i)] + groupID * DIGEST_SPAN + rank,
          digests_ptr + rank * 4, sizeof(uint32_t));
    }
    __pipeline_commit();

    // Step2: check digests and load possible keys.
    uint32_t target_digest = sm_target_digests[key_idx_block];
    // Replicate the digest byte into all four bytes of the word.
    uint32_t target_digests_vec =
        __byte_perm(target_digest, target_digest, 0x0000);
    sm_counts[key_idx_block] = 0;
    __pipeline_wait_prior(3);
    uint32_t probing_digests =
        sm_probing_digests[same_buf(i)][groupID * DIGEST_SPAN + rank];
    uint32_t find_result_ = __vcmpeq4(probing_digests, target_digests_vec);
    // Compress the per-byte match mask into one bit per digest.
    uint32_t find_result = 0;
    if ((find_result_ & 0x01) != 0) find_result |= 0x01;
    if ((find_result_ & 0x0100) != 0) find_result |= 0x02;
    if ((find_result_ & 0x010000) != 0) find_result |= 0x04;
    if ((find_result_ & 0x01000000) != 0) find_result |= 0x08;
    int find_number = __popc(find_result);
    int group_base = 0;
    if (find_number > 0) {
      group_base = atomicAdd(sm_counts + key_idx_block, find_number);
    }
    bool gt_reserve = (group_base + find_number) > RESERVE;
    int gt_vote = g.ballot(gt_reserve);
    K* key_ptr = sm_keys_ptr1[key_idx_block];
    if (gt_vote == 0) {
      // Candidates fit into the reserved slots: remember each position and
      // fetch the key asynchronously.
      do {
        int digest_idx = __ffs(find_result) - 1;
        if (digest_idx >= 0) {
          find_result &= (find_result - 1);
          int key_pos = rank * 4 + digest_idx;
          sm_possible_pos[same_buf(i)][groupID * RESERVE + group_base] =
              key_pos;
          __pipeline_memcpy_async(
              sm_possible_keys[same_buf(i)] + (groupID * RESERVE + group_base),
              key_ptr + key_pos, sizeof(K));
          group_base += 1;
        } else {
          break;
        }
      } while (true);
    } else {
      // Too many digest matches: compare keys directly and keep at most one
      // confirmed match in the first reserved slot.
      K target_key = sm_target_keys[key_idx_block];
      sm_counts[key_idx_block] = 0;
      int found_vote = 0;
      bool found = false;
      do {
        int digest_idx = __ffs(find_result) - 1;
        if (digest_idx >= 0) {
          find_result &= (find_result - 1);
          int key_pos = rank * 4 + digest_idx;
          K possible_key = key_ptr[key_pos];
          if (possible_key == target_key) {
            found = true;
            sm_counts[key_idx_block] = 1;
            sm_possible_pos[same_buf(i)][groupID * RESERVE] = key_pos;
            sm_possible_keys[same_buf(i)][groupID * RESERVE] = possible_key;
          }
        }
        found_vote = g.ballot(found);
        if (found_vote) break;
        found_vote = digest_idx >= 0;
      } while (g.any(found_vote));
    }
    __pipeline_commit();

    // Step3: verify keys, prefetch values.
    if (i > 0) {
      int prev_block = groupID * GROUP_SIZE + i - 1;
      K target_key = sm_target_keys[prev_block];
      // The count is read before the aliased found flag is cleared.
      int possible_num = sm_counts[prev_block];
      sm_founds[prev_block] = 0;
      S* score_ptr = CopyScore::get_base_ptr(sm_keys_ptr1, prev_block);
      VecV* value_ptr = sm_values_ptr1[prev_block];
      __pipeline_wait_prior(3);
      int key_pos;
      bool found_flag = false;
      if (rank < possible_num) {
        K possible_key =
            sm_possible_keys[diff_buf(i)][groupID * RESERVE + rank];
        key_pos = sm_possible_pos[diff_buf(i)][groupID * RESERVE + rank];
        if (possible_key == target_key) {
          found_flag = true;
          CopyScore::ldg_sts(sm_target_scores + prev_block,
                             score_ptr + key_pos);
        }
      }
      int found_vote = g.ballot(found_flag);
      if (found_vote) {
        VecV* v_dst = sm_vector[diff_buf(i)][groupID];
        sm_founds[prev_block] = 1;
        // Broadcast the slot position from the lowest matching lane.
        int src_lane = __ffs(found_vote) - 1;
        int target_pos = g.shfl(key_pos, src_lane);
        VecV* v_src = value_ptr + target_pos * dim;
        CopyValue::ldg_sts(rank, v_dst, v_src, dim);
      }
    }
    __pipeline_commit();

    // Step4: write back value and score.
    if (i > 1) {
      int wb_block = groupID * GROUP_SIZE + i - 2;
      int key_idx_grid = blockIdx.x * blockDim.x + wb_block;
      VecV* v_src = sm_vector[same_buf(i)][groupID];
      VecV* v_dst = values + key_idx_grid * dim;
      int found_flag = sm_founds[wb_block];
      __pipeline_wait_prior(3);
      if (found_flag > 0) {
        S score_ = CopyScore::lgs(sm_target_scores + wb_block);
        CopyValue::lds_stg(rank, v_dst, v_src, dim);
        CopyScore::stg(scores + key_idx_grid, score_);
      }
    }
  }

  // Pipeline emptying for b1: step3 for last key.
  {
    int key_idx_block = groupID * GROUP_SIZE + (loop_num - 1);
    K target_key = sm_target_keys[key_idx_block];
    int possible_num = sm_counts[key_idx_block];
    sm_founds[key_idx_block] = 0;
    S* score_ptr = CopyScore::get_base_ptr(sm_keys_ptr1, key_idx_block);
    VecV* value_ptr = sm_values_ptr1[key_idx_block];
    __pipeline_wait_prior(1);
    int key_pos;
    bool found_flag = false;
    if (rank < possible_num) {
      key_pos = sm_possible_pos[diff_buf(loop_num)][groupID * RESERVE + rank];
      K possible_key =
          sm_possible_keys[diff_buf(loop_num)][groupID * RESERVE + rank];
      if (target_key == possible_key) {
        found_flag = true;
        CopyScore::ldg_sts(sm_target_scores + key_idx_block,
                           score_ptr + key_pos);
      }
    }
    int found_vote = g.ballot(found_flag);
    if (found_vote) {
      sm_founds[key_idx_block] = 1;
      int src_lane = __ffs(found_vote) - 1;
      int target_pos = g.shfl(key_pos, src_lane);
      VecV* v_src = value_ptr + target_pos * dim;
      VecV* v_dst = sm_vector[diff_buf(loop_num)][groupID];
      CopyValue::ldg_sts(rank, v_dst, v_src, dim);
    }
  }
  __pipeline_commit();

  // Pipeline emptying: step4 for second-to-last key.
  if (loop_num > 1) {
    int key_idx_block = groupID * GROUP_SIZE + loop_num - 2;
    int key_idx_grid = blockIdx.x * blockDim.x + key_idx_block;
    VecV* v_src = sm_vector[same_buf(loop_num)][groupID];
    VecV* v_dst = values + key_idx_grid * dim;
    int found_flag = sm_founds[key_idx_block];
    __pipeline_wait_prior(1);
    if (found_flag > 0) {
      S score_ = CopyScore::lgs(sm_target_scores + key_idx_block);
      CopyValue::lds_stg(rank, v_dst, v_src, dim);
      CopyScore::stg(scores + key_idx_grid, score_);
    }
  }

  // Pipeline emptying: step4 for last key.
  {
    int key_idx_block = groupID * GROUP_SIZE + loop_num - 1;
    int key_idx_grid = blockIdx.x * blockDim.x + key_idx_block;
    VecV* v_src = sm_vector[same_buf(loop_num + 1)][groupID];
    VecV* v_dst = values + key_idx_grid * dim;
    int found_flag = sm_founds[key_idx_block];
    __pipeline_wait_prior(0);
    if (found_flag > 0) {
      S score_ = CopyScore::lgs(sm_target_scores + key_idx_block);
      CopyValue::lds_stg(rank, v_dst, v_src, dim);
      CopyScore::stg(scores + key_idx_grid, score_);
    }
  }

  // Finalize b1 pass and record found status.
  // Keys found in b1 are marked. Unfound keys need b2 search.
  if (rank < loop_num) {
    int key_idx_block = groupID * GROUP_SIZE + rank;
    int key_idx_grid = blockIdx.x * blockDim.x + key_idx_block;
    // Only write found for b1 hits; b2 pass will handle misses.
    if (sm_founds[key_idx_block] > 0) {
      found_functor(key_idx_grid, sm_target_keys[key_idx_block], true);
    }
  }

  // --- PASS 2: Search bucket b2 for keys not found in b1 ---

  // Count unfound keys. If all found in b1, skip b2 entirely.
  int any_unfound = 0;
  if (rank < loop_num) {
    int key_idx_block = groupID * GROUP_SIZE + rank;
    if (sm_founds[key_idx_block] == 0) {
      any_unfound = 1;
    }
  }
  any_unfound = g.any(any_unfound);
  if (!any_unfound) return;

  // Save b1 found flags (sm_founds will be reused).
  // We use a simple approach: store per-thread found flag in register.
  // Lane r holds the flag of the group's r-th key; it is broadcast with
  // g.shfl(b1_found, r) wherever that key is processed below.
  int b1_found = 0;
  if (rank < loop_num) {
    b1_found = sm_founds[groupID * GROUP_SIZE + rank];
  }

  // Restore digests from registers saved during Phase 1 init.
  // sm_target_digests was aliased with sm_counts/sm_founds and corrupted
  // during Pass 1. Using the register avoids recomputing Murmur3 hash.
  if (rank < loop_num) {
    int idx_block = groupID * GROUP_SIZE + rank;
    sm_target_digests[idx_block] = reg_target_digest;
  }
  __syncwarp();

  // Pipeline loading for b2.
  {
    uint8_t* digests_ptr =
        reinterpret_cast(sm_keys_ptr2[groupID * GROUP_SIZE]) - BUCKET_SIZE;
    __pipeline_memcpy_async(
        sm_probing_digests[0] + groupID * DIGEST_SPAN + rank,
        digests_ptr + rank * 4, sizeof(uint32_t));
  }
  __pipeline_commit();
  __pipeline_commit();
  __pipeline_commit();

  for (int i = 0; i < loop_num; i++) {
    int key_idx_block = groupID * GROUP_SIZE + i;

    // Check if this key was already found in b1.
    int skip = g.shfl(b1_found, i);

    // Step1: prefetch digests for next key's b2 bucket.
    // (Issued for every key, including keys that will be skipped.)
    if ((i + 1) < loop_num) {
      uint8_t* digests_ptr =
          reinterpret_cast(sm_keys_ptr2[key_idx_block + 1]) - BUCKET_SIZE;
      __pipeline_memcpy_async(
          sm_probing_digests[diff_buf(i)] + groupID * DIGEST_SPAN + rank,
          digests_ptr + rank * 4, sizeof(uint32_t));
    }
    __pipeline_commit();

    // Step2: check digests and load possible keys (skip if found in b1).
    // Read digest BEFORE zeroing sm_counts (they alias sm_target_digests).
    uint32_t target_digest = sm_target_digests[key_idx_block];
    sm_counts[key_idx_block] = 0;
    if (!skip) {
      uint32_t target_digests_vec =
          __byte_perm(target_digest, target_digest, 0x0000);
      __pipeline_wait_prior(3);
      uint32_t probing_digests =
          sm_probing_digests[same_buf(i)][groupID * DIGEST_SPAN + rank];
      uint32_t find_result_ = __vcmpeq4(probing_digests, target_digests_vec);
      uint32_t find_result = 0;
      if ((find_result_ & 0x01) != 0) find_result |= 0x01;
      if ((find_result_ & 0x0100) != 0) find_result |= 0x02;
      if ((find_result_ & 0x010000) != 0) find_result |= 0x04;
      if ((find_result_ & 0x01000000) != 0) find_result |= 0x08;
      int find_number = __popc(find_result);
      int group_base = 0;
      if (find_number > 0) {
        group_base = atomicAdd(sm_counts + key_idx_block, find_number);
      }
      bool gt_reserve = (group_base + find_number) > RESERVE;
      int gt_vote = g.ballot(gt_reserve);
      K* key_ptr = sm_keys_ptr2[key_idx_block];
      if (gt_vote == 0) {
        do {
          int digest_idx = __ffs(find_result) - 1;
          if (digest_idx >= 0) {
            find_result &= (find_result - 1);
            int key_pos = rank * 4 + digest_idx;
            sm_possible_pos[same_buf(i)][groupID * RESERVE + group_base] =
                key_pos;
            __pipeline_memcpy_async(sm_possible_keys[same_buf(i)] +
                                        (groupID * RESERVE + group_base),
                                    key_ptr + key_pos, sizeof(K));
            group_base += 1;
          } else {
            break;
          }
        } while (true);
      } else {
        K target_key = sm_target_keys[key_idx_block];
        sm_counts[key_idx_block] = 0;
        int found_vote = 0;
        bool found = false;
        do {
          int digest_idx = __ffs(find_result) - 1;
          if (digest_idx >= 0) {
            find_result &= (find_result - 1);
            int key_pos = rank * 4 + digest_idx;
            K possible_key = key_ptr[key_pos];
            if (possible_key == target_key) {
              found = true;
              sm_counts[key_idx_block] = 1;
              sm_possible_pos[same_buf(i)][groupID * RESERVE] = key_pos;
              sm_possible_keys[same_buf(i)][groupID * RESERVE] = possible_key;
            }
          }
          found_vote = g.ballot(found);
          if (found_vote) break;
          found_vote = digest_idx >= 0;
        } while (g.any(found_vote));
      }
    } else {
      // Skipped key: still perform the wait so the pipeline depth matches
      // the non-skipped path.
      __pipeline_wait_prior(3);
    }
    __pipeline_commit();

    // Step3: verify keys and prefetch values from b2.
    if (i > 0) {
      int prev_block = groupID * GROUP_SIZE + i - 1;
      int prev_skip = g.shfl(b1_found, i - 1);
      if (!prev_skip) {
        K target_key = sm_target_keys[prev_block];
        // Read count BEFORE zeroing (sm_counts aliases sm_founds).
        int possible_num = sm_counts[prev_block];
        sm_founds[prev_block] = 0;
        S* score_ptr = CopyScore::get_base_ptr(sm_keys_ptr2, prev_block);
        VecV* value_ptr = sm_values_ptr2[prev_block];
        __pipeline_wait_prior(3);
        int key_pos;
        bool found_flag = false;
        if (rank < possible_num) {
          K possible_key =
              sm_possible_keys[diff_buf(i)][groupID * RESERVE + rank];
          key_pos = sm_possible_pos[diff_buf(i)][groupID * RESERVE + rank];
          if (possible_key == target_key) {
            found_flag = true;
            CopyScore::ldg_sts(sm_target_scores + prev_block,
                               score_ptr + key_pos);
          }
        }
        int found_vote = g.ballot(found_flag);
        if (found_vote) {
          VecV* v_dst = sm_vector[diff_buf(i)][groupID];
          sm_founds[prev_block] = 1;
          int src_lane = __ffs(found_vote) - 1;
          int target_pos = g.shfl(key_pos, src_lane);
          VecV* v_src = value_ptr + target_pos * dim;
          CopyValue::ldg_sts(rank, v_dst, v_src, dim);
        }
      } else {
        __pipeline_wait_prior(3);
      }
    }
    __pipeline_commit();

    // Step4: write back values from b2.
    if (i > 1) {
      int wb_block = groupID * GROUP_SIZE + i - 2;
      int prev_skip = g.shfl(b1_found, i - 2);
      if (!prev_skip) {
        int key_idx_grid = blockIdx.x * blockDim.x + wb_block;
        VecV* v_src = sm_vector[same_buf(i)][groupID];
        VecV* v_dst = values + key_idx_grid * dim;
        int found_flag = sm_founds[wb_block];
        __pipeline_wait_prior(3);
        if (found_flag > 0) {
          S score_ = CopyScore::lgs(sm_target_scores + wb_block);
          CopyValue::lds_stg(rank, v_dst, v_src, dim);
          CopyScore::stg(scores + key_idx_grid, score_);
        }
      } else {
        __pipeline_wait_prior(3);
      }
    }
  }

  // Pipeline emptying for b2: step3 for last key.
  {
    int key_idx_block = groupID * GROUP_SIZE + (loop_num - 1);
    int last_skip = g.shfl(b1_found, loop_num - 1);
    if (!last_skip) {
      K target_key = sm_target_keys[key_idx_block];
      // Read count BEFORE zeroing (sm_counts aliases sm_founds).
      int possible_num = sm_counts[key_idx_block];
      sm_founds[key_idx_block] = 0;
      S* score_ptr = CopyScore::get_base_ptr(sm_keys_ptr2, key_idx_block);
      VecV* value_ptr = sm_values_ptr2[key_idx_block];
      __pipeline_wait_prior(1);
      int key_pos;
      bool found_flag = false;
      if (rank < possible_num) {
        key_pos =
            sm_possible_pos[diff_buf(loop_num)][groupID * RESERVE + rank];
        K possible_key =
            sm_possible_keys[diff_buf(loop_num)][groupID * RESERVE + rank];
        if (target_key == possible_key) {
          found_flag = true;
          CopyScore::ldg_sts(sm_target_scores + key_idx_block,
                             score_ptr + key_pos);
        }
      }
      int found_vote = g.ballot(found_flag);
      if (found_vote) {
        sm_founds[key_idx_block] = 1;
        int src_lane = __ffs(found_vote) - 1;
        int target_pos = g.shfl(key_pos, src_lane);
        VecV* v_src = value_ptr + target_pos * dim;
        VecV* v_dst = sm_vector[diff_buf(loop_num)][groupID];
        CopyValue::ldg_sts(rank, v_dst, v_src, dim);
      }
    } else {
      __pipeline_wait_prior(1);
    }
  }
  __pipeline_commit();

  // Pipeline emptying: step4 for second-to-last key.
  if (loop_num > 1) {
    int key_idx_block = groupID * GROUP_SIZE + loop_num - 2;
    int prev_skip = g.shfl(b1_found, loop_num - 2);
    if (!prev_skip) {
      int key_idx_grid = blockIdx.x * blockDim.x + key_idx_block;
      VecV* v_src = sm_vector[same_buf(loop_num)][groupID];
      VecV* v_dst = values + key_idx_grid * dim;
      int found_flag = sm_founds[key_idx_block];
      __pipeline_wait_prior(1);
      if (found_flag > 0) {
        S score_ = CopyScore::lgs(sm_target_scores + key_idx_block);
        CopyValue::lds_stg(rank, v_dst, v_src, dim);
        CopyScore::stg(scores + key_idx_grid, score_);
      }
    } else {
      __pipeline_wait_prior(1);
    }
  }

  // Pipeline emptying: step4 for last key.
  {
    int key_idx_block = groupID * GROUP_SIZE + loop_num - 1;
    int last_skip = g.shfl(b1_found, loop_num - 1);
    if (!last_skip) {
      int key_idx_grid = blockIdx.x * blockDim.x + key_idx_block;
      VecV* v_src = sm_vector[same_buf(loop_num + 1)][groupID];
      VecV* v_dst = values + key_idx_grid * dim;
      int found_flag = sm_founds[key_idx_block];
      __pipeline_wait_prior(0);
      if (found_flag > 0) {
        S score_ = CopyScore::lgs(sm_target_scores + key_idx_block);
        CopyValue::lds_stg(rank, v_dst, v_src, dim);
        CopyScore::stg(scores + key_idx_grid, score_);
      }
    } else {
      __pipeline_wait_prior(0);
    }
  }

  // Finalize b2 pass: report found for keys found in b2.
  // (Keys missed in both buckets are reported here as not found.)
  if (rank < loop_num) {
    int key_idx_block = groupID * GROUP_SIZE + rank;
    int key_idx_grid = blockIdx.x * blockDim.x + key_idx_block;
    if (b1_found == 0) {
      // Key was not found in b1; report b2 result.
      found_functor(key_idx_grid, sm_target_keys[key_idx_block],
                    sm_founds[key_idx_block] > 0);
    }
  }
}

// --- Kernel Launchers ---

template struct LaunchDualBucketLookupV1 {
  template