[
  {
    "path": ".bazeliskrc",
    "content": "USE_BAZEL_VERSION=5.0.0\n"
  },
  {
    "path": ".bazelrc",
    "content": "build -c opt\nbuild --copt -O3\nbuild --copt -pthread\nbuild --linkopt -pthread\nbuild --linkopt -ldl\nbuild --incompatible_linkopts_to_linklibs\nbuild --copt -g --strip=never\nbuild --experimental_repo_remote_exec\n\n# By default, build HKV in C++ 17 mode.\nbuild --cxxopt=-std=c++17\nbuild --host_cxxopt=-std=c++17\n\n# This config refers to building CUDA kernels with nvcc.\nbuild:cuda --crosstool_top=@local_config_cuda//crosstool:toolchain\n\n# CUDA options\nbuild:cuda --action_env GCC_HOST_COMPILER_PATH=\"/usr/bin/gcc\"\nbuild:cuda --action_env CUDA_TOOLKIT_PATH=\"/usr/local/cuda\"\nbuild:cuda --action_env CUDA_VERSION=\"11\"\nbuild:cuda --action_env CUDNN_VERSION=\"8\"\nbuild:cuda --action_env CUDNN_INSTALL_PATH=\"/usr/\"\nbuild:cuda --action_env CUDA_COMPUTE_CAPABILITIES=\"7.5\"\n"
  },
  {
    "path": ".clang-format",
    "content": "BasedOnStyle: Google\nDerivePointerAlignment: false\nIncludeBlocks: Merge\nSortIncludes: true\n"
  },
  {
    "path": ".github/workflows/blossom-ci.yml",
    "content": "# Copyright (c) 2020-2022, NVIDIA CORPORATION.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\n# A workflow to trigger ci on hybrid infra (github + self hosted runner)\nname: Blossom-CI\non:\n  issue_comment:\n    types: [created]\n  workflow_dispatch:\n      inputs:\n          platform:\n            description: 'runs-on argument'     \n            required: false\n          args:\n            description: 'argument'     \n            required: false\njobs:\n  Authorization:\n    name: Authorization\n    runs-on: blossom \n    outputs:\n      args: ${{ env.args }}\n      \n    # This job only runs for pull request comments\n    if: |\n         (github.actor == 'EmmaQiaoCh' || github.actor == 'rhdong' || github.actor == 'Ranjeet-Nvidia' ||  github.actor == 'jiashuy') &&\n         github.event.comment.body == '/blossom-ci'  \n    steps:\n      - name: Check if comment is issued by authorized person\n        run: blossom-ci\n        env:\n          OPERATION: 'AUTH'\n          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}\n          REPO_KEY_DATA: ${{ secrets.BLOSSOM_KEY }}\n        \n  Vulnerability-scan:\n    name: Vulnerability scan\n    needs: [Authorization]\n    runs-on: ubuntu-latest\n    steps:\n      - name: Checkout code\n        uses: actions/checkout@v2\n        with:\n          repository: ${{ fromJson(needs.Authorization.outputs.args).repo }}\n          ref: ${{ fromJson(needs.Authorization.outputs.args).ref }}\n          lfs: 'true'\n  
       \n      - name: Run blossom action\n        uses: NVIDIA/blossom-action@main\n        env:\n          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}\n          REPO_KEY_DATA: ${{ secrets.BLOSSOM_KEY }}\n        with:\n          args1: ${{ fromJson(needs.Authorization.outputs.args).args1 }}\n          args2: ${{ fromJson(needs.Authorization.outputs.args).args2 }}\n          args3: ${{ fromJson(needs.Authorization.outputs.args).args3 }}\n          \n  Job-trigger:\n    name: Start ci job\n    needs: [Vulnerability-scan]\n    runs-on: blossom\n    steps:\n      - name: Start ci job\n        run: blossom-ci\n        env:\n          OPERATION: 'START-CI-JOB'\n          CI_SERVER: ${{ secrets.CI_SERVER }}\n          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}\n              \n  Upload-Log:\n    name: Upload log\n    runs-on: blossom\n    if : github.event_name == 'workflow_dispatch'\n    steps:\n      - name: Jenkins log for pull request ${{ fromJson(github.event.inputs.args).pr }} (click here)\n        run: blossom-ci\n        env:\n          OPERATION: 'POST-PROCESSING'\n          CI_SERVER: ${{ secrets.CI_SERVER }}\n          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}\n"
  },
  {
    "path": ".github/workflows/docs-build.yaml",
    "content": "name: docs-build\n\non:\n  pull_request:\n    branches: [master]\n\njobs:\n  build:\n    runs-on: \"ubuntu-latest\"\n\n    steps:\n      - uses: actions/checkout@v3\n      - name: Set up Python 3.8\n        uses: actions/setup-python@v4\n        with:\n          python-version: '3.8'\n      - name: Install Ubuntu packages\n        run: |\n          sudo apt-get update -y\n          sudo apt-get install -y --no-install-recommends doxygen\n      - name: Install dependencies\n        run: |\n          python -m pip install -r docs/requirements-doc.txt\n      - name: Building docs\n        run: |\n          make -C docs html\n      - name: Upload HTML\n        uses: actions/upload-artifact@v4\n        with:\n          name: html-build-artifact\n          path: docs/build/html\n          if-no-files-found: error\n          retention-days: 1\n      - name: Store PR information\n        run: |\n          mkdir ./pr\n          echo ${{ github.event.number }}              > ./pr/pr.txt\n          echo ${{ github.event.pull_request.merged }} > ./pr/merged.txt\n          echo ${{ github.event.action }}              > ./pr/action.txt\n      - name: Upload PR information\n        uses: actions/upload-artifact@v4\n        with:\n          name: pr\n          path: pr/\n"
  },
  {
    "path": ".github/workflows/docs-preview-pr.yaml",
    "content": "name: docs-preview-pr\n\non:\n  workflow_run:\n    workflows: [docs-build]\n    types: [completed]\n\nenv:\n  WF_ID: ${{ github.event.workflow_run.id }}\n\njobs:\n  preview:\n    uses: nvidia-merlin/.github/.github/workflows/docs-preview-pr-common.yaml@main"
  },
  {
    "path": ".github/workflows/docs-remove-stale-reviews.yaml",
    "content": "name: docs-remove-stale-reviews\n\non:\n  schedule:\n    # 42 minutes after 0:00 UTC on Sundays\n    - cron: \"42 0 * * 0\"\n  workflow_dispatch:\n\njobs:\n  remove:\n    uses: nvidia-merlin/.github/.github/workflows/docs-remove-stale-reviews-common.yaml@main\n"
  },
  {
    "path": ".github/workflows/docs-sched-rebuild.yaml",
    "content": "name: docs-sched-rebuild\n\non:\n  push:\n    branches: [master]\n    tags:\n      - v*\n  workflow_dispatch:\n\njobs:\n  build:\n    runs-on: \"ubuntu-latest\"\n\n    steps:\n      - uses: actions/checkout@v3\n        with:\n          fetch-depth: 0\n      - name: Set up Python 3.8\n        uses: actions/setup-python@v4\n        with:\n          python-version: 3.8\n      - name: Install Ubuntu packages\n        run: |\n          sudo apt-get update -y\n          sudo apt-get install -y doxygen\n      - name: Install dependencies\n        run: |\n          python -m pip install --upgrade pip\n          python -m pip install -r docs/requirements-doc.txt\n      - name: Report the versions to build\n        run: |\n          sphinx-multiversion -D 'exhale_args.containmentFolder=${sourcedir}/api' --dump-metadata docs/source docs/build/html | jq \"keys\"\n      - name: Building docs (multiversion)\n        run: |\n          sphinx-multiversion -D 'exhale_args.containmentFolder=${sourcedir}/api' docs/source docs/build/html\n      - name: Delete unnecessary files\n        run: |\n          find docs/build -name .doctrees -prune -exec rm -rf {} \\;\n          find docs/build -name .buildinfo -exec rm {} \\;\n      - name: Upload HTML\n        uses: actions/upload-artifact@v4\n        with:\n          name: html-build-artifact\n          path: docs/build/html\n          if-no-files-found: error\n          retention-days: 1\n\n  # Identify the dir for the HTML.\n  store-html:\n    needs: [build]\n    runs-on: ubuntu-latest\n    steps:\n      - uses: actions/checkout@v3\n        with:\n          ref: \"gh-pages\"\n      - name: Initialize Git configuration\n        run: |\n          git config user.name docs-sched-rebuild\n          git config user.email do-not-send-@github.com\n      - name: Download artifacts\n        uses: actions/download-artifact@v4\n        with:\n          name: html-build-artifact\n      - name: Copy HTML directories\n        run: |\n 
         ls -asl\n          for i in `ls -d *`\n          do\n            echo \"Git adding ${i}\"\n            git add \"${i}\"\n          done\n      - name: Check or create dot-no-jekyll file\n        run: |\n          if [ -f \".nojekyll\" ]; then\n            echo \"The dot-no-jekyll file already exists.\"\n            exit 0\n          fi\n          touch .nojekyll\n          git add .nojekyll\n      - name: Check or create redirect page\n        env:\n          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}\n        run: |\n          resp=$(grep 'http-equiv=\"refresh\"' index.html 2>/dev/null) || true\n          if [ -n \"${resp}\" ]; then\n            echo \"The redirect file already exists.\"\n            exit 0\n          fi\n          # If any of these commands fail, fail the build.\n          def_branch=$(gh api \"repos/${GITHUB_REPOSITORY}\" --jq \".default_branch\")\n          html_url=$(gh api \"repos/${GITHUB_REPOSITORY}/pages\" --jq \".html_url\")\n          # Beware ugly quotation mark avoidance in the foll lines.\n          echo '<!DOCTYPE html>'                                                                         > index.html\n          echo '<html>'                                                                                 >> index.html\n          echo '  <head>'                                                                               >> index.html\n          echo '    <title>Redirect to documentation</title>'                                           >> index.html\n          echo '    <meta charset=\"utf-8\">'                                                             >> index.html\n          echo '    <meta http-equiv=\"refresh\" content=\"3; URL='${html_url}${def_branch}'/index.html\">' >> index.html\n          echo '    <link rel=\"canonical\" href=\"'${html_url}${def_branch}'/index.html\">'                >> index.html\n          echo '    <script language=\"javascript\">'                                                     >> 
index.html\n          echo '      function redirect() {'                                                            >> index.html\n          echo '        window.location.assign(\"'${html_url}${def_branch}'/index.html\")'                >> index.html\n          echo '      }'                                                                                >> index.html\n          echo '    </script>'                                                                          >> index.html\n          echo '  </head>'                                                                              >> index.html\n          echo '  <body onload=\"redirect()\">'                                                           >> index.html\n          echo '    <p>Please follow the link to the <a href=\"'${html_url}${def_branch}'/index.html\">'  >> index.html\n          echo      ${def_branch}'</a> branch documentation.</p>'                                       >> index.html\n          echo '  </body>'                                                                              >> index.html\n          echo '</html>'                                                                                >> index.html\n          git add index.html\n      - name: Commit changes to the GitHub Pages branch\n        run: |\n          git status\n          if git commit -m 'Pushing changes to GitHub Pages.'; then\n            git push -f\n          else\n           echo \"Nothing changed.\"\n          fi\n"
  },
  {
    "path": ".gitignore",
    "content": ".DS_Store\n.idea\n.vscode\nbuild\n.clwb\ncmake-build-debug/\ndocs/build\ndocs/source/README.md\ndocs/source/CONTRIBUTING.md\ndocs/source/api"
  },
  {
    "path": ".gitmodules",
    "content": "[submodule \"tests/googletest\"]\n\tpath = tests/googletest\n\turl = https://github.com/google/googletest.git\n\tignore = dirty\n"
  },
  {
    "path": "CMakeLists.txt",
    "content": "# Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n# Apache-2.0\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\ncmake_minimum_required(VERSION 3.10)\nproject(merlin-hkvs LANGUAGES CXX CUDA)\nfind_package(CUDAToolkit)\n\n# TODO(Q3): target_compile_features below still declare cxx_std_14, which is\n# inconsistent with the project-level C++17.  Update them to cxx_std_17 (or\n# remove the per-target lines entirely) once downstream compatibility is\n# confirmed.\nset(CMAKE_CXX_STANDARD 17)\nset(CMAKE_CUDA_STANDARD 17)\nlist(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/modules)\n\noption(CLANGFORMAT \"Clangformat code files before compiling\" OFF)\nif(CLANGFORMAT)\n  include(ClangFormat)\n  file(GLOB_RECURSE clangformat_includes\n    ${PROJECT_SOURCE_DIR}/include/*.h\n    ${PROJECT_SOURCE_DIR}/include/*.hpp\n    ${PROJECT_SOURCE_DIR}/include/*.cuh\n  )\n  file(GLOB clangformat_tests\n    ${PROJECT_SOURCE_DIR}/tests/*.c\n    ${PROJECT_SOURCE_DIR}/tests/*.h\n    ${PROJECT_SOURCE_DIR}/tests/*.cpp\n    ${PROJECT_SOURCE_DIR}/tests/*.hpp\n    ${PROJECT_SOURCE_DIR}/tests/*.cu\n    ${PROJECT_SOURCE_DIR}/tests/*.cuh\n  )\n  set(clangformat_files ${clangformat_includes} ${clangformat_tests})\n  clangformat_setup(\"${clangformat_files}\")\nendif()\n\n# Default to release build.\nif (NOT CMAKE_BUILD_TYPE)\n    set(CMAKE_BUILD_TYPE \"Release\")\n    message(STATUS \"Setting default 
CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}\")\nendif()\n\n# Some neat defaults.\nset(CUDA_SEPARABLE_COMPILATION ON)\n\n# Select target CUDA binary architecture.\nforeach(cuda_arch ${sm})\n  list(APPEND cuda_arch_list ${cuda_arch})\n  message(STATUS \"Assign GPU architecture (sm=${cuda_arch})\")\nendforeach()\n\nlist(LENGTH cuda_arch_list cuda_arch_list_length)\nif(cuda_arch_list_length EQUAL 0)\n  list(APPEND cuda_arch_list \"80\")\n  message(STATUS \"Assign default GPU architecture sm=80\")\nendif()\n\nif (CMAKE_BUILD_TYPE STREQUAL \"Debug\")\n  add_compile_definitions(CUDA_ERROR_CHECK)\n  set(CMAKE_CUDA_FLAGS \"${CMAKE_CUDA_FLAGS} -lineinfo\")\nendif()\n\nforeach(cuda_arch ${cuda_arch_list})\n  set(CMAKE_CUDA_FLAGS \"${CMAKE_CUDA_FLAGS} -gencode arch=compute_${cuda_arch},code=sm_${cuda_arch}\")\nendforeach()\n\nmessage(CMAKE_CUDA_FLAGS=\"${CMAKE_CUDA_FLAGS}\")\n\ninclude_directories(\n  ${PROJECT_SOURCE_DIR}/include\n  ${PROJECT_SOURCE_DIR}/tests/googletest/googletest/include\n)\n\nADD_SUBDIRECTORY(tests/googletest)\n\nlink_directories(\n)\n\nfile(GLOB_RECURSE merlin_hkvs_src RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.cpp *.cu)\n\n# TODO:\n# add_library(hierarchical_kv STATIC ${hierarchical_kv_src})\n# target_compile_features(hierarchical_kv PUBLIC cxx_std_14)\n# target_link_libraries(hierarchical_kv PUBLIC ...)\n\n\nadd_executable(merlin_hashtable_benchmark benchmark/merlin_hashtable_benchmark.cc.cu)\ntarget_compile_features(merlin_hashtable_benchmark PUBLIC cxx_std_14)\nset_target_properties(merlin_hashtable_benchmark PROPERTIES  CUDA_ARCHITECTURES OFF)\n\nadd_executable(find_with_missed_keys_benchmark benchmark/find_with_missed_keys_benchmark.cc.cu)\ntarget_compile_features(find_with_missed_keys_benchmark PUBLIC cxx_std_14)\nset_target_properties(find_with_missed_keys_benchmark PROPERTIES  CUDA_ARCHITECTURES OFF)\n\nadd_executable(merlin_hashtable_test tests/merlin_hashtable_test.cc.cu)\ntarget_compile_features(merlin_hashtable_test PUBLIC 
cxx_std_14)\nset_target_properties(merlin_hashtable_test PROPERTIES  CUDA_ARCHITECTURES OFF)\nTARGET_LINK_LIBRARIES(merlin_hashtable_test gtest_main)\n\nadd_executable(find_or_insert_test tests/find_or_insert_test.cc.cu)\ntarget_compile_features(find_or_insert_test PUBLIC cxx_std_14)\nset_target_properties(find_or_insert_test PROPERTIES  CUDA_ARCHITECTURES OFF)\nTARGET_LINK_LIBRARIES(find_or_insert_test gtest_main)\n\nadd_executable(merlin_memory_pool_test tests/memory_pool_test.cc.cu)\ntarget_compile_features(merlin_memory_pool_test PUBLIC cxx_std_14)\nset_target_properties(merlin_memory_pool_test PROPERTIES  CUDA_ARCHITECTURES OFF)\nTARGET_LINK_LIBRARIES(merlin_memory_pool_test gtest_main)\n\nset(CMAKE_BUILD_TYPE \"Debug\")\nadd_executable(save_and_load_test tests/save_and_load_test.cc.cu)\ntarget_compile_features(save_and_load_test PUBLIC cxx_std_14)\nset_target_properties(save_and_load_test PROPERTIES  CUDA_ARCHITECTURES OFF)\nTARGET_LINK_LIBRARIES(save_and_load_test gtest_main)\n\nadd_executable(insert_and_evict_test tests/insert_and_evict_test.cc.cu)\ntarget_compile_features(insert_and_evict_test PUBLIC cxx_std_14)\nset_target_properties(insert_and_evict_test PROPERTIES  CUDA_ARCHITECTURES OFF)\nTARGET_LINK_LIBRARIES(insert_and_evict_test gtest_main)\n\nadd_executable(dynamic_max_capacity_test tests/dynamic_max_capacity_test.cc.cu)\ntarget_compile_features(dynamic_max_capacity_test PUBLIC cxx_std_14)\nset_target_properties(dynamic_max_capacity_test PROPERTIES  CUDA_ARCHITECTURES OFF)\nTARGET_LINK_LIBRARIES(dynamic_max_capacity_test gtest_main)\n\nadd_executable(group_lock_test tests/group_lock_test.cc.cu)\ntarget_compile_features(group_lock_test PUBLIC cxx_std_14)\nset_target_properties(group_lock_test PROPERTIES  CUDA_ARCHITECTURES OFF)\nTARGET_LINK_LIBRARIES(group_lock_test gtest_main)\n\nadd_executable(find_or_insert_ptr_test tests/find_or_insert_ptr_test.cc.cu)\ntarget_compile_features(find_or_insert_ptr_test PUBLIC 
cxx_std_14)\nset_target_properties(find_or_insert_ptr_test PROPERTIES  CUDA_ARCHITECTURES OFF)\nTARGET_LINK_LIBRARIES(find_or_insert_ptr_test gtest_main)\n\nadd_executable(assign_score_test tests/assign_score_test.cc.cu)\ntarget_compile_features(assign_score_test PUBLIC cxx_std_14)\nset_target_properties(assign_score_test PROPERTIES  CUDA_ARCHITECTURES OFF)\nTARGET_LINK_LIBRARIES(assign_score_test gtest_main)\n\nadd_executable(uint32_score_test tests/uint32_score_test.cc.cu)\ntarget_compile_features(uint32_score_test PUBLIC cxx_std_14)\nset_target_properties(uint32_score_test PROPERTIES  CUDA_ARCHITECTURES OFF)\nTARGET_LINK_LIBRARIES(uint32_score_test gtest_main)\n\nadd_executable(accum_or_assign_test tests/accum_or_assign_test.cc)\ntarget_compile_features(accum_or_assign_test PUBLIC cxx_std_14)\nset_target_properties(accum_or_assign_test PROPERTIES  CUDA_ARCHITECTURES OFF)\nTARGET_LINK_LIBRARIES(accum_or_assign_test gtest_main)\n\nadd_executable(assign_values_test tests/assign_values_test.cc.cu)\ntarget_compile_features(assign_values_test PUBLIC cxx_std_14)\nset_target_properties(assign_values_test PROPERTIES  CUDA_ARCHITECTURES OFF)\nTARGET_LINK_LIBRARIES(assign_values_test gtest_main)\n\nadd_executable(find_with_missed_keys_test tests/find_with_missed_keys_test.cc.cu)\ntarget_compile_features(find_with_missed_keys_test PUBLIC cxx_std_14)\nset_target_properties(find_with_missed_keys_test PROPERTIES  CUDA_ARCHITECTURES OFF)\nTARGET_LINK_LIBRARIES(find_with_missed_keys_test gtest_main)\n\nadd_executable(reserved_keys_test tests/reserved_keys_test.cc.cu)\ntarget_compile_features(reserved_keys_test PUBLIC cxx_std_14)\nset_target_properties(reserved_keys_test PROPERTIES  CUDA_ARCHITECTURES OFF)\nTARGET_LINK_LIBRARIES(reserved_keys_test gtest_main)\n\nadd_executable(export_batch_if_test tests/export_batch_if_test.cc.cu)\ntarget_compile_features(export_batch_if_test PUBLIC cxx_std_14)\nset_target_properties(export_batch_if_test PROPERTIES  CUDA_ARCHITECTURES 
OFF)\n\nadd_executable(find_or_insert_ptr_lock_test tests/find_or_insert_ptr_lock_test.cc.cu)\ntarget_compile_features(find_or_insert_ptr_lock_test PUBLIC cxx_std_14)\nset_target_properties(find_or_insert_ptr_lock_test PROPERTIES  CUDA_ARCHITECTURES OFF)\nTARGET_LINK_LIBRARIES(find_or_insert_ptr_lock_test gtest_main)\n\nadd_executable(lock_unlock_test tests/lock_unlock_test.cc.cu)\ntarget_compile_features(lock_unlock_test PUBLIC cxx_std_14)\nset_target_properties(lock_unlock_test PROPERTIES  CUDA_ARCHITECTURES OFF)\nTARGET_LINK_LIBRARIES(lock_unlock_test gtest_main)\n\nadd_executable(dual_bucket_test tests/dual_bucket_test.cc.cu)\ntarget_compile_features(dual_bucket_test PUBLIC cxx_std_14)\nset_target_properties(dual_bucket_test PROPERTIES  CUDA_ARCHITECTURES OFF)\nTARGET_LINK_LIBRARIES(dual_bucket_test gtest_main)\n\nadd_executable(dual_bucket_benchmark benchmark/dual_bucket_benchmark.cc.cu)\ntarget_compile_features(dual_bucket_benchmark PUBLIC cxx_std_14)\nset_target_properties(dual_bucket_benchmark PROPERTIES  CUDA_ARCHITECTURES OFF)\n"
  },
  {
    "path": "CONTRIBUTING.md",
    "content": "# Contributing\n\n## About HierarchicalKV\n\nHierarchicalKV is a part of NVIDIA Merlin and provides hierarchical key-value storage to meet RecSys requirements.\n\nThe key capability of HierarchicalKV is to store key-value (feature-embedding) on high-bandwidth memory (HBM) of GPUs and in host memory.\n\nYou can also use the library for generic key-value storage.\n\n## Maintainership\n\nHierarchicalKV is co-maintained by [NVIDIA Merlin Team](https://github.com/NVIDIA-Merlin) and NVIDIA product end-users,\nand is also open to public contributions, bug fixes, and documentation. This project adheres to NVIDIA's Code of Conduct.\n\n## Contributing\n\nWe’re grateful for your interest in HierarchicalKV and value your contributions. \nWe welcome contributions via pull requests (PRs). \n\nBefore sending out a pull request for a significant change on the end-user API, we recommend you open an issue and\ndiscuss your proposed change. Some changes may require a design review.\nAll submissions require review by project reviewers.\n\n### Coding Style\n\nRefer to the [Style Guide](http://github.com/NVIDIA-Merlin/HierarchicalKV/STYLE_GUIDE.md)\n\n### Additional Requirements\n\nIn addition to the above requirements, contributions also need to meet the following criteria:\n* The change needs to include unit tests and integration tests if any.\n* Each PR needs to provide necessary documentation for when and how to use it.\n\n## Community\n\n* HierarchicalKV code (https://github.com/NVIDIA-Merlin/HierarchicalKV)\n\n## License\nApache License 2.0\n\n"
  },
  {
    "path": "LICENSE",
    "content": "                                 Apache License\n                           Version 2.0, January 2004\n                        http://www.apache.org/licenses/\n\n   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n   1. Definitions.\n\n      \"License\" shall mean the terms and conditions for use, reproduction,\n      and distribution as defined by Sections 1 through 9 of this document.\n\n      \"Licensor\" shall mean the copyright owner or entity authorized by\n      the copyright owner that is granting the License.\n\n      \"Legal Entity\" shall mean the union of the acting entity and all\n      other entities that control, are controlled by, or are under common\n      control with that entity. For the purposes of this definition,\n      \"control\" means (i) the power, direct or indirect, to cause the\n      direction or management of such entity, whether by contract or\n      otherwise, or (ii) ownership of fifty percent (50%) or more of the\n      outstanding shares, or (iii) beneficial ownership of such entity.\n\n      \"You\" (or \"Your\") shall mean an individual or Legal Entity\n      exercising permissions granted by this License.\n\n      \"Source\" form shall mean the preferred form for making modifications,\n      including but not limited to software source code, documentation\n      source, and configuration files.\n\n      \"Object\" form shall mean any form resulting from mechanical\n      transformation or translation of a Source form, including but\n      not limited to compiled object code, generated documentation,\n      and conversions to other media types.\n\n      \"Work\" shall mean the work of authorship, whether in Source or\n      Object form, made available under the License, as indicated by a\n      copyright notice that is included in or attached to the work\n      (an example is provided in the Appendix below).\n\n      \"Derivative Works\" shall mean any work, whether in Source or Object\n      
form, that is based on (or derived from) the Work and for which the\n      editorial revisions, annotations, elaborations, or other modifications\n      represent, as a whole, an original work of authorship. For the purposes\n      of this License, Derivative Works shall not include works that remain\n      separable from, or merely link (or bind by name) to the interfaces of,\n      the Work and Derivative Works thereof.\n\n      \"Contribution\" shall mean any work of authorship, including\n      the original version of the Work and any modifications or additions\n      to that Work or Derivative Works thereof, that is intentionally\n      submitted to Licensor for inclusion in the Work by the copyright owner\n      or by an individual or Legal Entity authorized to submit on behalf of\n      the copyright owner. For the purposes of this definition, \"submitted\"\n      means any form of electronic, verbal, or written communication sent\n      to the Licensor or its representatives, including but not limited to\n      communication on electronic mailing lists, source code control systems,\n      and issue tracking systems that are managed by, or on behalf of, the\n      Licensor for the purpose of discussing and improving the Work, but\n      excluding communication that is conspicuously marked or otherwise\n      designated in writing by the copyright owner as \"Not a Contribution.\"\n\n      \"Contributor\" shall mean Licensor and any individual or Legal Entity\n      on behalf of whom a Contribution has been received by Licensor and\n      subsequently incorporated within the Work.\n\n   2. Grant of Copyright License. 
Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      copyright license to reproduce, prepare Derivative Works of,\n      publicly display, publicly perform, sublicense, and distribute the\n      Work and such Derivative Works in Source or Object form.\n\n   3. Grant of Patent License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      (except as stated in this section) patent license to make, have made,\n      use, offer to sell, sell, import, and otherwise transfer the Work,\n      where such license applies only to those patent claims licensable\n      by such Contributor that are necessarily infringed by their\n      Contribution(s) alone or by combination of their Contribution(s)\n      with the Work to which such Contribution(s) was submitted. If You\n      institute patent litigation against any entity (including a\n      cross-claim or counterclaim in a lawsuit) alleging that the Work\n      or a Contribution incorporated within the Work constitutes direct\n      or contributory patent infringement, then any patent licenses\n      granted to You under this License for that Work shall terminate\n      as of the date such litigation is filed.\n\n   4. Redistribution. 
You may reproduce and distribute copies of the\n      Work or Derivative Works thereof in any medium, with or without\n      modifications, and in Source or Object form, provided that You\n      meet the following conditions:\n\n      (a) You must give any other recipients of the Work or\n          Derivative Works a copy of this License; and\n\n      (b) You must cause any modified files to carry prominent notices\n          stating that You changed the files; and\n\n      (c) You must retain, in the Source form of any Derivative Works\n          that You distribute, all copyright, patent, trademark, and\n          attribution notices from the Source form of the Work,\n          excluding those notices that do not pertain to any part of\n          the Derivative Works; and\n\n      (d) If the Work includes a \"NOTICE\" text file as part of its\n          distribution, then any Derivative Works that You distribute must\n          include a readable copy of the attribution notices contained\n          within such NOTICE file, excluding those notices that do not\n          pertain to any part of the Derivative Works, in at least one\n          of the following places: within a NOTICE text file distributed\n          as part of the Derivative Works; within the Source form or\n          documentation, if provided along with the Derivative Works; or,\n          within a display generated by the Derivative Works, if and\n          wherever such third-party notices normally appear. The contents\n          of the NOTICE file are for informational purposes only and\n          do not modify the License. 
You may add Your own attribution\n          notices within Derivative Works that You distribute, alongside\n          or as an addendum to the NOTICE text from the Work, provided\n          that such additional attribution notices cannot be construed\n          as modifying the License.\n\n      You may add Your own copyright statement to Your modifications and\n      may provide additional or different license terms and conditions\n      for use, reproduction, or distribution of Your modifications, or\n      for any such Derivative Works as a whole, provided Your use,\n      reproduction, and distribution of the Work otherwise complies with\n      the conditions stated in this License.\n\n   5. Submission of Contributions. Unless You explicitly state otherwise,\n      any Contribution intentionally submitted for inclusion in the Work\n      by You to the Licensor shall be under the terms and conditions of\n      this License, without any additional terms or conditions.\n      Notwithstanding the above, nothing herein shall supersede or modify\n      the terms of any separate license agreement you may have executed\n      with Licensor regarding such Contributions.\n\n   6. Trademarks. This License does not grant permission to use the trade\n      names, trademarks, service marks, or product names of the Licensor,\n      except as required for reasonable and customary use in describing the\n      origin of the Work and reproducing the content of the NOTICE file.\n\n   7. Disclaimer of Warranty. Unless required by applicable law or\n      agreed to in writing, Licensor provides the Work (and each\n      Contributor provides its Contributions) on an \"AS IS\" BASIS,\n      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n      implied, including, without limitation, any warranties or conditions\n      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n      PARTICULAR PURPOSE. 
You are solely responsible for determining the\n      appropriateness of using or redistributing the Work and assume any\n      risks associated with Your exercise of permissions under this License.\n\n   8. Limitation of Liability. In no event and under no legal theory,\n      whether in tort (including negligence), contract, or otherwise,\n      unless required by applicable law (such as deliberate and grossly\n      negligent acts) or agreed to in writing, shall any Contributor be\n      liable to You for damages, including any direct, indirect, special,\n      incidental, or consequential damages of any character arising as a\n      result of this License or out of the use or inability to use the\n      Work (including but not limited to damages for loss of goodwill,\n      work stoppage, computer failure or malfunction, or any and all\n      other commercial damages or losses), even if such Contributor\n      has been advised of the possibility of such damages.\n\n   9. Accepting Warranty or Additional Liability. While redistributing\n      the Work or Derivative Works thereof, You may choose to offer,\n      and charge a fee for, acceptance of support, warranty, indemnity,\n      or other liability obligations and/or rights consistent with this\n      License. However, in accepting such obligations, You may act only\n      on Your own behalf and on Your sole responsibility, not on behalf\n      of any other Contributor, and only if You agree to indemnify,\n      defend, and hold each Contributor harmless for any liability\n      incurred by, or claims asserted against, such Contributor by reason\n      of your accepting any such warranty or additional liability.\n\n   END OF TERMS AND CONDITIONS\n\n   APPENDIX: How to apply the Apache License to your work.\n\n      To apply the Apache License to your work, attach the following\n      boilerplate notice, with the fields enclosed by brackets \"{}\"\n      replaced with your own identifying information. 
(Don't include\n      the brackets!)  The text should be enclosed in the appropriate\n      comment syntax for the file format. We also recommend that a\n      file or class name and description of purpose be included on the\n      same \"printed page\" as the copyright notice for easier\n      identification within third-party archives.\n\n   Copyright 2022 NVIDIA Corporation\n\n   Licensed under the Apache License, Version 2.0 (the \"License\");\n   you may not use this file except in compliance with the License.\n   You may obtain a copy of the License at\n\n       http://www.apache.org/licenses/LICENSE-2.0\n\n   Unless required by applicable law or agreed to in writing, software\n   distributed under the License is distributed on an \"AS IS\" BASIS,\n   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n   See the License for the specific language governing permissions and\n   limitations under the License."
  },
  {
    "path": "README.md",
    "content": "# [NVIDIA HierarchicalKV(Beta)](https://github.com/NVIDIA-Merlin/HierarchicalKV)\n\n[![Version](https://img.shields.io/github/v/release/NVIDIA-Merlin/HierarchicalKV?color=orange&include_prereleases)](https://github.com/NVIDIA-Merlin/HierarchicalKV/releases)\n[![GitHub License](https://img.shields.io/github/license/NVIDIA-Merlin/HierarchicalKV)](https://github.com/NVIDIA-Merlin/HierarchicalKV/blob/master/LICENSE)\n[![Documentation](https://img.shields.io/badge/documentation-blue.svg)](https://nvidia-merlin.github.io/HierarchicalKV/master/README.html)\n\n## About HierarchicalKV\n\nHierarchicalKV is a part of NVIDIA Merlin and provides hierarchical key-value storage to meet RecSys requirements.\n\nThe key capability of HierarchicalKV is to store key-value (feature-embedding) on high-bandwidth memory (HBM) of GPUs and in host memory.\n\nYou can also use the library for generic key-value storage.\n\n## Benefits\n\nWhen building large recommender systems, machine learning (ML) engineers face the following challenges:\n\n- GPUs are needed, but HBM on a single GPU is too small for the large DLRMs that scale to several terabytes.\n- Improving communication performance is getting more difficult in larger and larger CPU clusters.\n- It is difficult to efficiently control consumption growth of limited HBM with customized strategies.\n- Most generic key-value libraries provide low HBM and host memory utilization.\n\nHierarchicalKV alleviates these challenges and helps the machine learning engineers in RecSys with the following benefits:\n\n- Supports training large RecSys models on **HBM and host memory** at the same time.\n- Provides better performance by **full bypassing CPUs** and reducing the communication workload.\n- Implements table-size restraint strategies that are based on **LRU or customized strategies**.\n  The strategies are implemented by CUDA kernels.\n- Operates at a high working-status load factor that is close to 1.0.\n\n\n## Key ideas\n\n- 
Buckets are locally ordered\n- Store keys and values separately\n- Store all the keys in HBM\n- Build-in and customizable eviction strategy\n\nHierarchicalKV makes NVIDIA GPUs more suitable for training large and super-large models of ***search, recommendations, and advertising***.\nThe library simplifies the common challenges to building, evaluating, and serving sophisticated recommenders models.\n\n## API Documentation\n\nThe main classes and structs are below, but reading the comments in the source code is recommended:\n\n- [`class HashTable`](https://github.com/NVIDIA-Merlin/HierarchicalKV/blob/master/include/merlin_hashtable.cuh#L151)\n- [`class EvictStrategy`](https://github.com/NVIDIA-Merlin/HierarchicalKV/blob/master/include/merlin_hashtable.cuh#L52)\n- [`struct HashTableOptions`](https://github.com/NVIDIA-Merlin/HierarchicalKV/blob/master/include/merlin_hashtable.cuh#L60)\n\nFor regular API doc, please refer to [API Docs](https://nvidia-merlin.github.io/HierarchicalKV/master/api/index.html)\n\n### API Maturity Matrix\n\n`industry-validated` means the API has been well-tested and verified in at least one real-world scenario.\n\n| Name                 | Description                                                                                                              | Function           |\n|:---------------------|:-------------------------------------------------------------------------------------------------------------------------|:-------------------|\n| __insert_or_assign__ | Insert or assign for the specified keys. <br>Overwrite one key with minimum score when bucket is full.                   | industry-validated |\n| __insert_and_evict__ | Insert new keys, and evict keys with minimum score when bucket is full.                                                  | industry-validated |\n| __find_or_insert__   | Search for the specified keys, and insert them when missed.                                                              
| well-tested        |\n| __assign__           | Update for each key and bypass when missed.                                                                              | well-tested        |\n| __accum_or_assign__  | Search and update for each key. If found, add value as a delta to the original value. <br>If missed, update it directly. | well-tested        |\n| __find_or_insert\\*__ | Search for the specified keys and return the pointers of values. Insert them firstly when missing.                       | well-tested        |\n| __find__             | Search for the specified keys.                                                                                           | industry-validated |\n| __find\\*__           | Search and return the pointers of values, thread-unsafe but with high performance.                                       | well-tested        |\n| __export_batch__     | Exports a certain number of the key-value-score tuples.                                                                  | industry-validated |\n| __export_batch_if__  | Exports a certain number of the key-value-score tuples which match specific conditions.                                  | industry-validated |\n| __warmup__           | Move the hot key-values from HMEM to HBM                                                                                 | June 15, 2023      |\n\n\n### Evict Strategy\n\nThe `score` is introduced to define the importance of each key, the larger, the more important, the less likely they will be evicted. Eviction only happens when a bucket is full.\nThe `score_type` must be `uint64_t`. 
For more detail, please refer to [`class EvictStrategy`](https://github.com/NVIDIA-Merlin/HierarchicalKV/blob/master/include/merlin_hashtable.cuh#L52).\n\n| Name           | Definition of `Score`                                                                                                                                                                                           |\n|:---------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|\n| __Lru__        | Device clock in a nanosecond, which could differ slightly from host clock.                                                                                                                                      |\n| __Lfu__        | Frequency increment provided by caller via the input parameter of `scores` of `insert-like` APIs as the increment of frequency.                                                                                 |\n| __EpochLru__   | The high 32bits is the global epoch provided via the input parameter of `global_epoch`, <br>the low 32bits is equal to `(device_clock >> 20) & 0xffffffff` with granularity close to 1 ms.                      |\n| __EpochLfu__   | The high 32bits is the global epoch provided via the input parameter of `global_epoch`, <br>the low 32bits is the frequency, <br>the frequency will keep constant after reaching the max value of `0xffffffff`. |\n| __Customized__ | Fully provided by the caller via the input parameter of `scores` of `insert-like` APIs.                                                                                                                         |\n\n\n* __Note__:\n  - The `insert-like` APIs mean the APIs of `insert_or_assign`, `insert_and_evict`, `find_or_insert`, `accum_or_assign`, and `find_or_insert`. 
\n  - The `global_epoch` should be maintained by the caller and input as the input parameter of `insert-like` APIs.\n\n### Configuration Options\n\nIt's recommended to keep the default configuration for the options ending with `*`.\n\n| Name                       | Type   | Default | Description                                           |\n|:---------------------------|:-------|:--------|:------------------------------------------------------|\n| __init_capacity__          | size_t | 0       | The initial capacity of the hash table.               |\n| __max_capacity__           | size_t | 0       | The maximum capacity of the hash table.               |\n| __max_hbm_for_vectors__    | size_t | 0       | The maximum HBM for vectors, in bytes.                |\n| __dim__                    | size_t | 64      | The dimension of the value vectors.                   |\n| __max_bucket_size*__       | size_t | 128     | The length of each bucket.                            |\n| __max_load_factor*__       | float  | 0.5f    | The max load factor before rehashing.                 |\n| __block_size*__            | int    | 128     | The default block size for CUDA kernels.              |\n| __io_block_size*__         | int    | 1024    | The block size for IO CUDA kernels.                   |\n| __device_id*__             | int    | -1      | The ID of device. Managed internally when set to `-1` |\n| __io_by_cpu*__             | bool   | false   | The flag indicating if the CPU handles IO.            
|\n| __reserved_key_start_bit__ | int    | 0       | The start bit offset of reserved key in the 64 bit    |\n\n- For more details, refer to [`struct HashTableOptions`](https://github.com/NVIDIA-Merlin/HierarchicalKV/blob/master/include/merlin_hashtable.cuh#L60).\n\n#### Reserved Keys\n- By default, the keys of `0xFFFFFFFFFFFFFFFD`, `0xFFFFFFFFFFFFFFFE`, and `0xFFFFFFFFFFFFFFFF` are reserved for internal use.\n  Change `options.reserved_key_start_bit` if you want to use the above keys.\n  `reserved_key_start_bit` has a valid range from 0 to 62. The default value is 0, which yields the default reserved keys listed above. When `reserved_key_start_bit` is set to any value other than 0, the least significant bit (bit 0) is always `0` for any reserved key.\n\n- Setting `reserved_key_start_bit = 1`:\n  - This setting reserves the two least significant bits 1 and 2 for the reserved keys.\n  - In binary, the last four bits range from `1000` to `1110`. Here, the least significant bit (bit 0) is always `0`, and bits from 3 to 63 are set to `1`.\n  - The new reserved keys in hexadecimal representation are as follows:\n    - `0xFFFFFFFFFFFFFFFE`\n    - `0xFFFFFFFFFFFFFFFC`\n    - `0xFFFFFFFFFFFFFFF8`\n    - `0xFFFFFFFFFFFFFFFA`\n\n- Setting `reserved_key_start_bit = 2`:\n  - This configuration reserves bits 2 and 3 as reserved keys.\n  - The binary representation for the last five bits ranges from `10010` to `11110`, with the least significant bit (bit 0) always set to `0`, and bits from 4 to 63 are set to `1`.\n\n- If you change `reserved_key_start_bit`, you should use the same value for save/load.\n  For more detail, please refer to [`init_reserved_keys`](https://github.com/search?q=repo%3ANVIDIA-Merlin%2FHierarchicalKV%20init_reserved_keys&type=code)\n\n### How to use:\n```cpp\n#include \"merlin_hashtable.cuh\"\n\n\nusing TableOptions = nv::merlin::HashTableOptions;\nusing EvictStrategy = nv::merlin::EvictStrategy;\n\nint main(int argc, char *argv[])\n{\n  using K = uint64_t;\n  using 
V = float;\n  using S = uint64_t;\n  \n  // 1. Define the table and use LRU eviction strategy.\n  using HKVTable = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;\n  std::unique_ptr<HKVTable> table = std::make_unique<HKVTable>();\n  \n  // 2. Define the configuration options.\n  TableOptions options;\n  options.init_capacity = 16 * 1024 * 1024;\n  options.max_capacity = options.init_capacity;\n  options.dim = 16;\n  options.max_hbm_for_vectors = nv::merlin::GB(16);\n  \n  \n  // 3. Initialize the table memory resource.\n  table->init(options);\n  \n  // 4. Use table to do something.\n  \n  return 0;\n}\n\n```\n\n### Usage restrictions\n\n- The `key_type` must be `int64_t` or `uint64_t`.\n- The `score_type` must be `uint64_t`.\n\n## Contributors\n\nHierarchicalKV is co-maintained by the [NVIDIA Merlin Team](https://github.com/NVIDIA-Merlin) and NVIDIA product end-users,\nand is also open to public contributions, bug fixes, and documentation. [[Contribute](CONTRIBUTING.md)]\n\n## How to build\n\nHierarchicalKV is a header-only library; the commands below only create binaries for benchmarking and unit testing.\n\nYour environment must meet the following requirements:\n\n- CUDA version >= 11.2\n- NVIDIA GPU with compute capability 8.0, 8.6, 8.7 or 9.0\n- GCC that supports the `C++17` standard or later.\n- Bazel version >= 3.7.2 (Bazel compile only)\n\n### with cmake\n```shell\ngit clone --recursive https://github.com/NVIDIA-Merlin/HierarchicalKV.git\ncd HierarchicalKV && mkdir -p build && cd build\ncmake -DCMAKE_BUILD_TYPE=Release -Dsm=80 .. && make -j\n```\n\nFor Debug:\n```shell\ncmake -DCMAKE_BUILD_TYPE=Debug -Dsm=80 .. 
&& make -j\n```\n\nFor Benchmark:\n```shell\n./merlin_hashtable_benchmark\n```\n\nFor Unit Test:\n```shell\n./merlin_hashtable_test\n```\n\n### with bazel\n\n- DON'T use the option of `--recursive` for `git clone`.\n- Please modify the environment variables in the `.bazelrc` file in advance if using the customized docker images.\n- The docker images maintained on `nvcr.io/nvidia/tensorflow` are highly recommended.\n\nPull the docker image:\n```shell\ndocker pull nvcr.io/nvidia/tensorflow:22.09-tf2-py3\ndocker run --gpus all -it --rm nvcr.io/nvidia/tensorflow:22.09-tf2-py3\n```\n\nCompile in docker container:\n```shell\ngit clone https://github.com/NVIDIA-Merlin/HierarchicalKV.git\ncd HierarchicalKV && bash bazel_build.sh\n```\n\nFor Benchmark:\n```shell\n./benchmark_util\n```\n\n\n## Benchmark & Performance(W.I.P)\n\n* GPU: 1 x NVIDIA A100 80GB PCIe: 8.0\n* Key Type = uint64_t\n* Value Type = float32 * {dim}\n* Key-Values per OP = 1048576\n* Evict strategy: LRU\n* `λ`: load factor\n* `find*` means the `find` API that directly returns the addresses of values.\n* `find_or_insert*` means the `find_or_insert` API that directly returns the addresses of values.\n* ***Throughput Unit: Billion-KV/second***\n\n### On pure HBM mode: \n\n* dim = 8, capacity = 128 Million-KV, HBM = 4 GB, HMEM = 0 GB\n\n|    λ | insert_or_assign |   find | find_or_insert | assign |  find* | find_or_insert* | insert_and_evict |\n|-----:|-----------------:|-------:|---------------:|-------:|-------:|----------------:|-----------------:|\n| 0.50 |            1.093 |  2.470 |          1.478 |  1.770 |  3.726 |           1.447 |            1.075 |\n| 0.75 |            1.045 |  2.452 |          1.335 |  1.807 |  3.374 |           1.309 |            1.013 |\n| 1.00 |            0.655 |  2.481 |          0.612 |  1.815 |  1.865 |           0.619 |            0.511 |\n\n|    λ | export_batch | export_batch_if | contains |\n|-----:|-------------:|----------------:|---------:|\n| 0.50 |        2.087 |     
     12.258 |    3.121 |\n| 0.75 |        2.045 |          12.447 |    3.094 |\n| 1.00 |        1.950 |           2.657 |    3.096 |\n\n* dim = 32, capacity = 128 Million-KV, HBM = 16 GB, HMEM = 0 GB\n\n|    λ | insert_or_assign |   find | find_or_insert | assign |  find* | find_or_insert* | insert_and_evict |\n|-----:|-----------------:|-------:|---------------:|-------:|-------:|----------------:|-----------------:|\n| 0.50 |            0.961 |  2.272 |          1.278 |  1.706 |  3.718 |           1.435 |            0.931 |\n| 0.75 |            0.930 |  2.238 |          1.177 |  1.693 |  3.369 |           1.316 |            0.866 |\n| 1.00 |            0.646 |  2.321 |          0.572 |  1.783 |  1.873 |           0.618 |            0.469 |\n\n|    λ | export_batch | export_batch_if | contains |\n|-----:|-------------:|----------------:|---------:|\n| 0.50 |        0.692 |          10.784 |    3.100 |\n| 0.75 |        0.569 |          10.240 |    3.075 |\n| 1.00 |        0.551 |           0.765 |    3.096 |\n\n* dim = 64, capacity = 64 Million-KV, HBM = 16 GB, HMEM = 0 GB\n\n|    λ | insert_or_assign |   find | find_or_insert | assign |  find* | find_or_insert* | insert_and_evict |\n|-----:|-----------------:|-------:|---------------:|-------:|-------:|----------------:|-----------------:|\n| 0.50 |            0.834 |  1.982 |          1.113 |  1.499 |  3.950 |           1.502 |            0.805 |\n| 0.75 |            0.801 |  1.951 |          1.033 |  1.493 |  3.545 |           1.359 |            0.773 |\n| 1.00 |            0.621 |  2.021 |          0.608 |  1.541 |  1.965 |           0.613 |            0.481 |\n\n|    λ | export_batch | export_batch_if | contains |\n|-----:|-------------:|----------------:|---------:|\n| 0.50 |        0.316 |           8.199 |    3.239 |\n| 0.75 |        0.296 |           8.549 |    3.198 |\n| 1.00 |        0.288 |           0.395 |    3.225 |\n\n### On HBM+HMEM hybrid mode: \n\n* dim = 64, capacity = 128 Million-KV, HBM = 16 
GB, HMEM = 16 GB\n\n|    λ | insert_or_assign |   find | find_or_insert | assign |  find* | find_or_insert* |\n|-----:|-----------------:|-------:|---------------:|-------:|-------:|----------------:|\n| 0.50 |            0.083 |  0.124 |          0.109 |  0.131 |  3.705 |           1.435 |\n| 0.75 |            0.083 |  0.122 |          0.111 |  0.129 |  3.221 |           1.274 |\n| 1.00 |            0.073 |  0.123 |          0.095 |  0.126 |  1.854 |           0.617 |\n\n|    λ | export_batch | export_batch_if | contains |\n|-----:|-------------:|----------------:|---------:|\n| 0.50 |        0.318 |           8.086 |    3.122 |\n| 0.75 |        0.294 |           5.549 |    3.111 |\n| 1.00 |        0.287 |           0.393 |    3.075 |\n\n* dim = 64, capacity = 512 Million-KV, HBM = 32 GB, HMEM = 96 GB\n\n|    λ | insert_or_assign |   find | find_or_insert | assign |  find* | find_or_insert* |\n|-----:|-----------------:|-------:|---------------:|-------:|-------:|----------------:|\n| 0.50 |            0.049 |  0.069 |          0.049 |  0.069 |  3.484 |           1.370 |\n| 0.75 |            0.049 |  0.069 |          0.049 |  0.069 |  3.116 |           1.242 |\n| 1.00 |            0.047 |  0.072 |          0.047 |  0.070 |  1.771 |           0.607 |\n\n|    λ | export_batch | export_batch_if | contains |\n|-----:|-------------:|----------------:|---------:|\n| 0.50 |        0.316 |           8.181 |    3.073 |\n| 0.75 |        0.293 |           8.950 |    3.052 |\n| 1.00 |        0.292 |           0.394 |    3.026 |\n\n### Support and Feedback:\n\nIf you encounter any issues or have questions, go to [https://github.com/NVIDIA-Merlin/HierarchicalKV/issues](https://github.com/NVIDIA-Merlin/HierarchicalKV/issues) and submit an issue so that we can provide you with the necessary resolutions and answers.\n\n### Acknowledgment\nWe are very grateful to external initial contributors [@Zhangyafei](https://github.com/zhangyafeikimi) and [@Lifan](https://github.com/Lifann) 
for their design, coding, and review work.\n\n### License\nApache License 2.0\n"
  },
  {
    "path": "STYLE_GUIDE.md",
    "content": "#### C++\nC++ code should conform to [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html).\n\nHierarchicalKV uses [clang-format](https://clang.llvm.org/docs/ClangFormat.html)\nto check your C/C++ changes. Sometimes you have some manually formatted\ncode that you don’t want clang-format to touch.\nYou can disable formatting like this:\n\n```cpp\nint formatted_code;\n// clang-format off\n    void    unformatted_code  ;\n// clang-format on\nvoid formatted_code_again;\n```\n\nInstall Clang-format (the version 18.1.3 is required) for Ubuntu:\n\n```bash\nsudo apt install clang-format-18\n```\n\nformat all with:\n```bash\nfind ./ \\( -path ./tests/googletest -prune \\) -o \\( -iname *.h -o -iname *.cpp -o -iname *.cc -o -iname *.cu -o -iname *.cuh -o -iname *.hpp \\) -print | xargs clang-format-18 -i --style=file\n\n```\n"
  },
  {
    "path": "WORKSPACE",
    "content": "workspace(name = \"HierarchicalKV\")\n\nload(\"@bazel_tools//tools/build_defs/repo:http.bzl\", \"http_archive\")\nload(\"//build_deps/gpus:configure.bzl\", \"cuda_configure\")\n\nhttp_archive(\n    name = \"bazel_skylib\",\n    sha256 = \"1dde365491125a3db70731e25658dfdd3bc5dbdfd11b840b3e987ecf043c7ca0\",\n    urls = [\n        \"https://github.com/bazelbuild/bazel-skylib/releases/download/0.9.0/bazel_skylib-0.9.0.tar.gz\",\n    ],\n)\n\ncuda_configure(name = \"local_config_cuda\")\n"
  },
  {
    "path": "bazel_build.sh",
    "content": "#!/bin/bash\n\n# Usage : `./bazel_build.sh` or `bash bazel_build.sh`\nset -e\nexport $(cat .bazeliskrc | xargs)\n\nbazel build --config=cuda //...\n"
  },
  {
    "path": "benchmark/BUILD",
    "content": "load(\"@local_config_cuda//cuda:build_defs.bzl\", \"cuda_cc_library\")\n\ncc_binary(\n    name = \"benchmark_util\",\n    deps = [\n        \":benchmark_lib\",\n    ],\n)\n\ncuda_cc_library(\n    name = \"benchmark_lib\",\n    srcs = [\n        \"merlin_hashtable_benchmark.cc.cu\",\n    ],\n    hdrs = [\n        \"benchmark_util.cuh\",\n    ],\n    copts = [\"-Iinclude/\"],\n    linkopts = [\"-pthread\"],\n    deps = [\n        \"//include:merlin_hashtable\",\n        \"@local_config_cuda//cuda\",\n    ],\n)\n"
  },
  {
    "path": "benchmark/benchmark_util.cuh",
    "content": "/*\n * Copyright (c) 2023, NVIDIA CORPORATION.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n#pragma once\n\n#include <chrono>\n#include <cmath>\n#include <cstdint>\n#include \"merlin/utils.cuh\"\n\nnamespace benchmark {\n\nenum class TimeUnit {\n  Second = 0,\n  MilliSecond = 3,\n  MicroSecond = 6,\n  NanoSecond = 9,\n};\n\nenum class API_Select {\n  find = 0,\n  insert_or_assign = 1,\n  find_or_insert = 2,\n  assign = 3,\n  insert_and_evict = 4,\n  find_ptr = 5,\n  find_or_insert_ptr = 6,\n  export_batch = 7,\n  export_batch_if = 8,\n  contains = 9,\n};\n\nenum class Hit_Mode {\n  random = 0,\n  last_insert = 1,\n};\n\ntemplate <typename Rep>\nstruct Timer {\n  explicit Timer(TimeUnit tu = TimeUnit::Second) : tu_(tu) {}\n  void start() { startRecord = std::chrono::steady_clock::now(); }\n  void end() { endRecord = std::chrono::steady_clock::now(); }\n  Rep getResult() {\n    auto duration_ = std::chrono::duration_cast<std::chrono::nanoseconds>(\n        endRecord - startRecord);\n    auto pow_ =\n        static_cast<int32_t>(tu_) - static_cast<int32_t>(TimeUnit::NanoSecond);\n    auto factor = static_cast<Rep>(std::pow(10, pow_));\n    return static_cast<Rep>(duration_.count()) * factor;\n  }\n\n private:\n  TimeUnit tu_;\n  std::chrono::time_point<std::chrono::steady_clock> startRecord{};\n  std::chrono::time_point<std::chrono::steady_clock> endRecord{};\n};\n\n// RAII Timer using CUDA Event\ntemplate <typename 
Rep>\nstruct KernelTimer {\n  explicit KernelTimer(TimeUnit tu = TimeUnit::Second) : tu_(tu) {\n    CUDA_CHECK(cudaEventCreate(&start_));\n    CUDA_CHECK(cudaEventCreate(&end_));\n  }\n  ~KernelTimer() {\n    CUDA_CHECK(cudaEventDestroy(start_));\n    CUDA_CHECK(cudaEventDestroy(end_));\n  }\n  void start() { CUDA_CHECK(cudaEventRecord(start_)); }\n  void end() {\n    CUDA_CHECK(cudaEventRecord(end_));\n    CUDA_CHECK(cudaEventSynchronize(end_));\n    CUDA_CHECK(cudaEventElapsedTime(&time, start_, end_));\n  }\n  Rep getResult() {\n    auto pow_ =\n        static_cast<int32_t>(tu_) - static_cast<int32_t>(TimeUnit::MilliSecond);\n    auto factor = static_cast<Rep>(std::pow(10, pow_));\n    return static_cast<Rep>(time * factor);\n  }\n\n private:\n  TimeUnit tu_;\n  float time{-1.0f};\n  cudaEvent_t start_;\n  cudaEvent_t end_;\n};\n\ninline uint64_t getTimestamp() {\n  return std::chrono::duration_cast<std::chrono::milliseconds>(\n             std::chrono::system_clock::now().time_since_epoch())\n      .count();\n}\n\ntemplate <class K, class S>\nvoid create_continuous_keys(K* h_keys, S* h_scores, const int key_num_per_op,\n                            const K start = 0, int freq_range = 1000) {\n  for (K i = 0; i < key_num_per_op; i++) {\n    h_keys[i] = start + static_cast<K>(i);\n    if (h_scores != nullptr) h_scores[i] = h_keys[i] % freq_range;\n  }\n}\n\ntemplate <class K, class S>\nvoid create_random_keys(K* h_keys, S* h_scores, const int key_num_per_op) {\n  std::unordered_set<K> numbers;\n  std::random_device rd;\n  std::mt19937_64 eng(rd());\n  std::uniform_int_distribution<K> distr;\n  int i = 0;\n\n  while (numbers.size() < key_num_per_op) {\n    numbers.insert(distr(eng));\n  }\n  for (const K num : numbers) {\n    h_keys[i] = num;\n    if (h_scores != nullptr) h_scores[i] = getTimestamp();\n    i++;\n  }\n}\n\ntemplate <typename K, typename S>\nvoid create_keys_for_hitrate(K* h_keys, S* h_scores, const int key_num_per_op,\n                             
const float hitrate = 0.6f,\n                             const Hit_Mode hit_mode = Hit_Mode::last_insert,\n                             const K end = 0, const bool reset = false,\n                             int freq_range = 1000) {\n  int divide = static_cast<int>(key_num_per_op * hitrate);\n  if (Hit_Mode::random == hit_mode) {\n    std::random_device rd;\n    std::mt19937_64 eng(rd());\n    K existed_max = end == 0 ? 1 : (end - 1);\n    std::uniform_int_distribution<K> distr(0, existed_max);\n\n    if (existed_max < divide) {\n      std::cout << \"# Can not generate enough keys for hit!\";\n      exit(-1);\n    }\n    std::unordered_set<K> numbers;\n    while (numbers.size() < divide) {\n      numbers.insert(distr(eng));\n    }\n    int i = 0;\n    for (auto existed_value : numbers) {\n      h_keys[i] = existed_value;\n      if (h_scores != nullptr) h_scores[i] = h_keys[i] % freq_range;\n      i++;\n    }\n  } else {\n    // else keep its original value, but update scores\n    for (int i = 0; i < divide; i++) {\n      if (h_scores != nullptr) h_scores[i] = getTimestamp() % freq_range;\n    }\n  }\n\n  static K new_value = std::numeric_limits<K>::max();\n  if (reset) {\n    new_value = std::numeric_limits<K>::max();\n  }\n  for (int i = divide; i < key_num_per_op; i++) {\n    h_keys[i] = new_value--;\n    if (h_scores != nullptr) h_scores[i] = getTimestamp() % freq_range;\n  }\n}\n\ntemplate <typename S>\nvoid refresh_scores(S* h_scores, const int key_num_per_op) {\n  for (int i = 0; i < key_num_per_op; i++) {\n    h_scores[i] = getTimestamp();\n  }\n}\n\ntemplate <class K, class V>\nvoid init_value_using_key(K* h_keys, V* h_vectors, const int key_num_per_op,\n                          int dim) {\n  for (size_t i = 0; i < key_num_per_op; i++) {\n    for (size_t j = 0; j < dim; j++) {\n      h_vectors[i * dim + j] = static_cast<V>(h_keys[i] * 0.00001);\n    }\n  }\n}\n\ntemplate <class V>\n__global__ void read_from_ptr_kernel(const V* const* __restrict src,\n    
                                 V* __restrict dst, const size_t dim,\n                                     size_t N) {\n  size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x;\n\n  for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) {\n    int vec_index = int(t / dim);\n    int dim_index = t % dim;\n    if (src[vec_index]) {\n      dst[vec_index * dim + dim_index] = src[vec_index][dim_index];\n    }\n  }\n}\n\ntemplate <class V>\nvoid read_from_ptr(const V* const* __restrict src, V* __restrict dst,\n                   const size_t dim, size_t n, cudaStream_t stream) {\n  const size_t block_size = 1024;\n  const size_t N = n * dim;\n  const size_t grid_size = nv::merlin::SAFE_GET_GRID_SIZE(N, block_size);\n\n  read_from_ptr_kernel<V>\n      <<<grid_size, block_size, 0, stream>>>(src, dst, dim, N);\n}\n\ntemplate <class V>\n__global__ void array2ptr_kernel(V** ptr, V* __restrict array, const size_t dim,\n                                 size_t N) {\n  size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x;\n\n  for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) {\n    int vec_index = int(t);\n    ptr[vec_index] = array + vec_index * dim;\n  }\n}\n\ntemplate <class V>\nvoid array2ptr(V** ptr, V* __restrict array, const size_t dim, size_t n,\n               cudaStream_t stream) {\n  const size_t block_size = 1024;\n  const size_t N = n;\n  const size_t grid_size = nv::merlin::SAFE_GET_GRID_SIZE(N, block_size);\n\n  array2ptr_kernel<V><<<grid_size, block_size, 0, stream>>>(ptr, array, dim, N);\n}\n\ntemplate <class S>\n__global__ void host_nano_kernel(S* d_clk) {\n  S mclk;\n  asm volatile(\"mov.u64 %0,%%globaltimer;\" : \"=l\"(mclk));\n  *d_clk = mclk;\n}\n\ntemplate <class S>\nS host_nano(cudaStream_t stream = 0) {\n  S h_clk = 0;\n  S* d_clk;\n\n  CUDA_CHECK(cudaMalloc((void**)&(d_clk), sizeof(S)));\n  host_nano_kernel<S><<<1, 1, 0, stream>>>(d_clk);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n\n  CUDA_CHECK(cudaMemcpy(&h_clk, d_clk, sizeof(S), 
cudaMemcpyDeviceToHost));\n  CUDA_CHECK(cudaFree(d_clk));\n  return h_clk;\n}\n\ntemplate <class K, class S>\nstruct ExportIfPredFunctor {\n  __forceinline__ __device__ bool operator()(const K& key, S& score,\n                                             const K& pattern,\n                                             const S& threshold) {\n    return score > threshold;\n  }\n};\n\n}  // namespace benchmark\n"
  },
  {
    "path": "benchmark/dual_bucket_benchmark.cc.cu",
    "content": "/*\n * Copyright (c) 2024, NVIDIA CORPORATION.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n#include <chrono>\n#include <cstdio>\n#include <iostream>\n#include <numeric>\n#include <random>\n#include <vector>\n#include \"merlin_hashtable.cuh\"\n\nusing K = uint64_t;\nusing V = float;\nusing S = uint64_t;\nusing TableOptions = nv::merlin::HashTableOptions;\nusing TableMode = nv::merlin::TableMode;\nusing EvictStrategy = nv::merlin::EvictStrategy;\n\ntemplate <typename Table>\ndouble benchmark_insert(Table& table, size_t n, K* d_keys, V* d_values,\n                        S* d_scores, cudaStream_t stream) {\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  auto start = std::chrono::high_resolution_clock::now();\n  table.insert_or_assign(n, d_keys, d_values, d_scores, stream, true);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  auto end = std::chrono::high_resolution_clock::now();\n  double ms = std::chrono::duration_cast<std::chrono::microseconds>(end - start)\n                  .count() /\n              1000.0;\n  return static_cast<double>(n) / ms / 1000.0;  // Mops/s\n}\n\ntemplate <typename Table>\ndouble benchmark_find(Table& table, size_t n, K* d_keys, V* d_values,\n                      bool* d_founds, cudaStream_t stream) {\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  auto start = std::chrono::high_resolution_clock::now();\n  table.find(n, d_keys, d_values, d_founds, nullptr, stream);\n  
CUDA_CHECK(cudaStreamSynchronize(stream));\n  auto end = std::chrono::high_resolution_clock::now();\n  double ms = std::chrono::duration_cast<std::chrono::microseconds>(end - start)\n                  .count() /\n              1000.0;\n  return static_cast<double>(n) / ms / 1000.0;  // Mops/s\n}\n\nvoid run_benchmark(size_t capacity, size_t dim, TableMode mode,\n                   const char* mode_name) {\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  Table table;\n  TableOptions options;\n  options.init_capacity = capacity;\n  options.max_capacity = capacity;\n  options.max_hbm_for_vectors = 0;\n  options.dim = dim;\n  options.max_bucket_size = 128;\n  options.table_mode = mode;\n  table.init(options);\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  // Generate keys.\n  size_t max_n = capacity;\n  std::vector<K> h_keys(max_n);\n  std::vector<V> h_values(max_n * dim, 1.0f);\n  std::vector<S> h_scores(max_n);\n  std::iota(h_keys.begin(), h_keys.end(), 1);\n  for (size_t i = 0; i < max_n; i++) h_scores[i] = i + 1;\n\n  K* d_keys;\n  V* d_values;\n  S* d_scores;\n  bool* d_founds;\n  V* d_found_values;\n  CUDA_CHECK(cudaMalloc(&d_keys, max_n * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_values, max_n * dim * sizeof(V)));\n  CUDA_CHECK(cudaMalloc(&d_scores, max_n * sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_founds, max_n * sizeof(bool)));\n  CUDA_CHECK(cudaMalloc(&d_found_values, max_n * dim * sizeof(V)));\n\n  CUDA_CHECK(cudaMemcpy(d_keys, h_keys.data(), max_n * sizeof(K),\n                        cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_values, h_values.data(), max_n * dim * sizeof(V),\n                        cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_scores, h_scores.data(), max_n * sizeof(S),\n                        cudaMemcpyHostToDevice));\n\n  printf(\"--- %s (capacity=%zuK, dim=%zu) ---\\n\", mode_name, capacity / 1024,\n         dim);\n  printf(\"  %-12s  %-18s  %-18s\\n\", \"Load 
Factor\", \"Insert (Mops/s)\",\n         \"Find (Mops/s)\");\n\n  float load_factors[] = {0.25f, 0.50f, 0.75f, 0.90f, 0.95f, 1.00f};\n  size_t prev_n = 0;\n\n  for (float lf : load_factors) {\n    size_t target_n = static_cast<size_t>(capacity * lf);\n    if (target_n > max_n) break;\n    size_t batch_n = target_n - prev_n;\n    if (batch_n == 0) continue;\n\n    // Insert to reach target load factor.\n    double insert_mops =\n        benchmark_insert(table, batch_n, d_keys + prev_n,\n                         d_values + prev_n * dim, d_scores + prev_n, stream);\n\n    // Find all inserted keys.\n    double find_mops = benchmark_find(table, target_n, d_keys, d_found_values,\n                                      d_founds, stream);\n\n    printf(\"  %-12.2f  %-18.1f  %-18.1f\\n\", lf, insert_mops, find_mops);\n    prev_n = target_n;\n  }\n\n  // Memory efficiency: first eviction LF.\n  // (Already covered in test, report here too.)\n  size_t table_size = table.size(stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  printf(\"  Final size: %zu / %zu (LF=%.4f)\\n\", table_size, capacity,\n         static_cast<float>(table_size) / capacity);\n\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_values));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_founds));\n  CUDA_CHECK(cudaFree(d_found_values));\n  CUDA_CHECK(cudaStreamDestroy(stream));\n}\n\nint main(int argc, char** argv) {\n  printf(\"=== Dual-Bucket Benchmark Results ===\\n\\n\");\n\n  // Default: 1M capacity, dim=64.\n  size_t capacity = 128 * 1024 * 8;  // ~1M\n  size_t dim = 64;\n\n  if (argc > 1) capacity = static_cast<size_t>(atol(argv[1]));\n  if (argc > 2) dim = static_cast<size_t>(atol(argv[2]));\n\n  run_benchmark(capacity, dim, TableMode::kThroughput, \"THROUGHPUT_MODE\");\n  printf(\"\\n\");\n  run_benchmark(capacity, dim, TableMode::kMemory, \"MEMORY_MODE\");\n  printf(\"\\n\");\n\n  printf(\"=== Benchmark Complete ===\\n\");\n  return 0;\n}\n"
  },
  {
    "path": "benchmark/find_with_missed_keys_benchmark.cc.cu",
    "content": "/*\n * Copyright (c) 2024, NVIDIA CORPORATION.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n#include <assert.h>\n#include <algorithm>\n#include <chrono>\n#include <cmath>\n#include <cstdio>\n#include <cstdlib>\n#include <iomanip>\n#include <iostream>\n#include <limits>\n#include <random>\n#include <string>\n#include <thread>\n#include <unordered_map>\n#include <unordered_set>\n#include \"benchmark_util.cuh\"\n#include \"merlin_hashtable.cuh\"\n\nusing K = uint64_t;\nusing V = float;\nusing S = uint64_t;\nusing EvictStrategy = nv::merlin::EvictStrategy;\nusing TableOptions = nv::merlin::HashTableOptions;\nusing Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\nvoid print_tile() {\n  std::cout << std::endl\n            << \"|    \\u03BB \" << \"| capacity \" << \"| max_hbm_for_vectors \"\n            << \"| max_bucket_size \" << \"| dim \" << \"| missed_ratio \"\n            << \"| througput(BillionKV/secs) \";\n  std::cout << \"|\\n\";\n\n  //<< \"| load_factor \"\n  std::cout << \"|------\"\n            //<< \"| capacity \"\n            << \"|----------\"\n            //<< \"| max_hbm_for_vectors \"\n            << \"|---------------------\"\n            //<< \"| max_bucket_size \"\n            << \"|-----------------\"\n            //<< \"| dim \"\n            << \"|-----\"\n            //<< \"| missed_ratio \"\n            << \"|--------------\"\n            //<< \"| througput(BillionKV/secs) \"\n    
        << \"|---------------------------\";\n  std::cout << \"|\\n\";\n}\n\ntemplate <typename T>\nvoid print_w(const T& t, size_t width) {\n  std::cout << \"|\" << std::setw(width) << t;\n}\n\nvoid print_result(double load_factor, size_t capacity,\n                  size_t max_hbm_for_vectors, size_t max_bucket_size,\n                  size_t dim, double missed_ratio, float througput) {\n  print_w(load_factor, 6);\n  print_w(capacity, 10);\n  print_w(max_hbm_for_vectors, 21);\n  print_w(max_bucket_size, 17);\n  print_w(dim, 5);\n  print_w(missed_ratio, 14);\n  print_w(througput, 27);\n  std::cout << \"|\\n\";\n}\n\nvoid test_find(size_t capacity, size_t dim, size_t max_hbm_for_vectors,\n               double load_factor, size_t max_bucket_size,\n               double missed_ratio) {\n  MERLIN_CHECK(load_factor >= 0.0 && load_factor <= 1.0,\n               \"Invalid `load_factor`\");\n  K* h_keys;\n  S* h_scores;\n  V* h_vectors;\n\n  TableOptions options;\n  options.init_capacity = capacity;\n  options.max_capacity = capacity;\n  options.dim = dim;\n\n  options.max_hbm_for_vectors = nv::merlin::MB(max_hbm_for_vectors);\n  options.max_bucket_size = max_bucket_size;\n\n  size_t key_num = capacity;\n  CUDA_CHECK(cudaMallocHost(&h_keys, key_num * sizeof(K)));\n  CUDA_CHECK(cudaMallocHost(&h_scores, key_num * sizeof(S)));\n  CUDA_CHECK(cudaMallocHost(&h_vectors, key_num * options.dim * sizeof(V)));\n\n  K* d_keys;\n  S* d_scores;\n  V* d_vectors;\n  K* d_missed_keys;\n  int* d_missed_indices;\n  int* d_missed_size;\n\n  CUDA_CHECK(cudaMalloc(&d_keys, key_num * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores, key_num * sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_vectors, key_num * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_missed_keys, key_num * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_missed_indices, key_num * sizeof(int)));\n  CUDA_CHECK(cudaMalloc(&d_missed_size, sizeof(int)));\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n  // insert 
key-value\n  size_t insert_num = (double)key_num * load_factor;\n  benchmark::create_continuous_keys<K, S>(h_keys, h_scores, insert_num,\n                                          0 /*start*/);\n  benchmark::init_value_using_key<K, V>(h_keys, h_vectors, insert_num,\n                                        options.dim);\n  CUDA_CHECK(cudaMemcpy(d_keys, h_keys, insert_num * sizeof(K),\n                        cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_scores, h_scores, insert_num * sizeof(S),\n                        cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,\n                        insert_num * sizeof(V) * options.dim,\n                        cudaMemcpyHostToDevice));\n  Table table;\n  table.init(options);\n  table.insert_or_assign(insert_num, d_keys, d_vectors, d_scores, stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n\n  // find key-value\n  size_t find_num = (double)insert_num * (1.0 - missed_ratio);\n  benchmark::create_continuous_keys<K, S>(h_keys, nullptr, find_num,\n                                          0 /*start*/);\n  benchmark::create_continuous_keys<K, S>(\n      h_keys + find_num, nullptr, insert_num - find_num, insert_num /*start*/);\n  CUDA_CHECK(cudaMemcpy(d_keys, h_keys, insert_num * sizeof(K),\n                        cudaMemcpyHostToDevice));\n\n  auto timer = benchmark::Timer<double>();\n  timer.start();\n  table.find(insert_num, d_keys, d_vectors, d_missed_keys, d_missed_indices,\n             d_missed_size, d_scores, stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  timer.end();\n\n  CUDA_CHECK(cudaFreeHost(h_keys));\n  CUDA_CHECK(cudaFreeHost(h_scores));\n  CUDA_CHECK(cudaFreeHost(h_vectors));\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_vectors));\n  CUDA_CHECK(cudaFree(d_missed_keys));\n  CUDA_CHECK(cudaFree(d_missed_indices));\n  CUDA_CHECK(cudaFree(d_missed_size));\n\n  CudaCheckError();\n  float througput = insert_num / 
timer.getResult() / (1024 * 1024 * 1024.0f);\n  print_result(load_factor, capacity, max_hbm_for_vectors, max_bucket_size, dim,\n               missed_ratio, througput);\n}\n\nvoid test_main(double load_factor, double missed_ratio) {\n  constexpr size_t CAPACITY = 100000000UL;\n  print_tile();\n  // pure HBM\n  test_find(CAPACITY, 8, 8 * 1024UL, load_factor, 256, missed_ratio);\n  test_find(CAPACITY, 8, 8 * 1024UL, load_factor, 128, missed_ratio);\n  // hybrid\n  test_find(CAPACITY, 8, 1 * 1024UL, load_factor, 256, missed_ratio);\n  test_find(CAPACITY, 8, 1 * 1024UL, load_factor, 128, missed_ratio);\n  // pure HMEM\n  test_find(CAPACITY, 8, 0, load_factor, 256, missed_ratio);\n  test_find(CAPACITY, 8, 0, load_factor, 128, missed_ratio);\n}\n\nint main() {\n  test_main(0.2, 0);\n  test_main(0.2, 0.5);\n  test_main(0.2, 1.0);\n  test_main(0.5, 0);\n  test_main(0.5, 0.5);\n  test_main(0.5, 1.0);\n  test_main(1.0, 0);\n  test_main(1.0, 0.5);\n  test_main(1.0, 1.0);\n  return 0;\n}\n"
  },
  {
    "path": "benchmark/merlin_hashtable_benchmark.cc.cu",
    "content": "/*\n * Copyright (c) 2022, NVIDIA CORPORATION.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n#include <assert.h>\n#include <algorithm>\n#include <chrono>\n#include <cmath>\n#include <cstdio>\n#include <cstdlib>\n#include <iomanip>\n#include <iostream>\n#include <limits>\n#include <random>\n#include <thread>\n#include <unordered_map>\n#include <unordered_set>\n#include \"benchmark_util.cuh\"\n#include \"merlin_hashtable.cuh\"\n\nusing std::cerr;\nusing std::cout;\nusing std::endl;\nusing std::fixed;\nusing std::setfill;\nusing std::setprecision;\nusing std::setw;\n\nusing namespace nv::merlin;\nusing namespace benchmark;\n\nenum class Test_Mode {\n  pure_hbm = 0,\n  hybrid = 1,\n};\n\nconst float EPSILON = 0.001f;\n\nstd::string rep(int n) { return std::string(n, ' '); }\n\nusing K = uint64_t;\nusing S = uint64_t;\nusing V = float;\nusing EvictStrategy = nv::merlin::EvictStrategy;\nusing TableOptions = nv::merlin::HashTableOptions;\n\ntemplate <class Table>\nfloat test_one_api(std::shared_ptr<Table>& table, const API_Select api,\n                   const size_t dim, const size_t init_capacity,\n                   const size_t key_num_per_op, const float load_factor,\n                   const float hitrate = 0.6f) {\n  K* h_keys;\n  S* h_scores;\n  V* h_vectors;\n  bool* h_found;\n\n  CUDA_CHECK(cudaMallocHost(&h_keys, key_num_per_op * sizeof(K)));\n  CUDA_CHECK(cudaMallocHost(&h_scores, key_num_per_op * sizeof(S)));\n  
CUDA_CHECK(cudaMallocHost(&h_vectors, key_num_per_op * sizeof(V) * dim));\n  CUDA_CHECK(cudaMallocHost(&h_found, key_num_per_op * sizeof(bool)));\n\n  CUDA_CHECK(cudaMemset(h_vectors, 0, key_num_per_op * sizeof(V) * dim));\n\n  bool need_scores = (Table::evict_strategy == EvictStrategy::kLfu ||\n                      Table::evict_strategy == EvictStrategy::kEpochLfu ||\n                      Table::evict_strategy == EvictStrategy::kCustomized);\n\n  K* d_keys;\n  S* d_scores_real;\n  S* d_scores;\n  V* d_vectors;\n  V* d_def_val;\n  V** d_vectors_ptr;\n  bool* d_found;\n  K* d_keys_out;\n\n  K* d_evict_keys;\n  S* d_evict_scores;\n\n  CUDA_CHECK(cudaMalloc(&d_keys, key_num_per_op * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores_real, key_num_per_op * sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_vectors, key_num_per_op * sizeof(V) * dim));\n  CUDA_CHECK(cudaMalloc(&d_def_val, key_num_per_op * sizeof(V) * dim));\n  CUDA_CHECK(cudaMalloc(&d_vectors_ptr, key_num_per_op * sizeof(V*)));\n  CUDA_CHECK(cudaMalloc(&d_found, key_num_per_op * sizeof(bool)));\n  CUDA_CHECK(cudaMalloc(&d_keys_out, key_num_per_op * sizeof(K)));\n\n  CUDA_CHECK(cudaMalloc(&d_evict_keys, key_num_per_op * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_evict_scores, key_num_per_op * sizeof(S)));\n\n  CUDA_CHECK(cudaMemset(d_vectors, 1, key_num_per_op * sizeof(V) * dim));\n  CUDA_CHECK(cudaMemset(d_def_val, 2, key_num_per_op * sizeof(V) * dim));\n  CUDA_CHECK(cudaMemset(d_vectors_ptr, 0, key_num_per_op * sizeof(V*)));\n  CUDA_CHECK(cudaMemset(d_found, 0, key_num_per_op * sizeof(bool)));\n\n  d_scores = need_scores ? 
d_scores_real : nullptr;\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  // initialize insert\n  // step 1, no need to load load_factor\n  uint64_t key_num_init = static_cast<uint64_t>(init_capacity * load_factor);\n  const float target_load_factor = key_num_init * 1.0f / init_capacity;\n  uint64_t key_num_remain = key_num_init % key_num_per_op == 0\n                                ? key_num_per_op\n                                : key_num_init % key_num_per_op;\n  int32_t loop_num_init = (key_num_init + key_num_per_op - 1) / key_num_per_op;\n\n  K start = 0UL;\n\n  S threshold = benchmark::host_nano<S>();\n  int global_epoch = 0;\n  for (; global_epoch < loop_num_init; global_epoch++) {\n    table->set_global_epoch(global_epoch);\n    uint64_t key_num_cur_insert =\n        global_epoch == loop_num_init - 1 ? key_num_remain : key_num_per_op;\n    create_continuous_keys<K, S>(h_keys, h_scores, key_num_cur_insert, start);\n    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, key_num_cur_insert * sizeof(K),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_scores_real, h_scores,\n                          key_num_cur_insert * sizeof(S),\n                          cudaMemcpyHostToDevice));\n    table->find_or_insert(key_num_cur_insert, d_keys, d_vectors_ptr, d_found,\n                          d_scores, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    start += key_num_cur_insert;\n  }\n\n  // step 2\n  float real_load_factor = table->load_factor(stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  while (target_load_factor - real_load_factor > EPSILON) {\n    auto key_num_append = static_cast<int64_t>(\n        (target_load_factor - real_load_factor) * init_capacity);\n    if (key_num_append <= 0) break;\n    key_num_append =\n        std::min(static_cast<int64_t>(key_num_per_op), key_num_append);\n    create_continuous_keys<K, S>(h_keys, h_scores, key_num_append, start);\n    
CUDA_CHECK(cudaMemcpy(d_keys, h_keys, key_num_append * sizeof(K),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_scores_real, h_scores, key_num_append * sizeof(S),\n                          cudaMemcpyHostToDevice));\n    table->insert_or_assign(key_num_append, d_keys, d_vectors, d_scores,\n                            stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    start += key_num_append;\n    real_load_factor = table->load_factor(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n  }\n\n  // For trigger the kernel selection in advance.\n  int key_num_per_op_warmup = 1;\n  for (int i = 0; i < 9; i++, global_epoch++) {\n    table->set_global_epoch(global_epoch);\n    switch (api) {\n      case API_Select::find: {\n        table->find(key_num_per_op_warmup, d_keys, d_vectors, d_found, d_scores,\n                    stream);\n        CUDA_CHECK(cudaStreamSynchronize(stream));\n        break;\n      }\n      case API_Select::insert_or_assign: {\n        table->insert_or_assign(key_num_per_op_warmup, d_keys, d_vectors,\n                                d_scores, stream);\n        CUDA_CHECK(cudaStreamSynchronize(stream));\n        break;\n      }\n      case API_Select::find_or_insert: {\n        table->find_or_insert(key_num_per_op_warmup, d_keys, d_vectors,\n                              d_scores, stream);\n        CUDA_CHECK(cudaStreamSynchronize(stream));\n        break;\n      }\n      case API_Select::assign: {\n        table->assign(key_num_per_op_warmup, d_keys, d_def_val, d_scores,\n                      stream);\n        CUDA_CHECK(cudaStreamSynchronize(stream));\n        break;\n      }\n      case API_Select::insert_and_evict: {\n        table->insert_and_evict(key_num_per_op_warmup, d_keys, d_vectors,\n                                d_scores, d_evict_keys, d_def_val,\n                                d_evict_scores, stream);\n        CUDA_CHECK(cudaStreamSynchronize(stream));\n        break;\n      }\n 
     case API_Select::find_ptr: {\n        V** d_vectors_ptr = nullptr;\n        CUDA_CHECK(\n            cudaMalloc(&d_vectors_ptr, key_num_per_op_warmup * sizeof(V*)));\n        benchmark::array2ptr(d_vectors_ptr, d_vectors, dim,\n                             key_num_per_op_warmup, stream);\n\n        CUDA_CHECK(cudaStreamSynchronize(stream));\n        table->find(1, d_keys, d_vectors_ptr, d_found, d_scores, stream);\n        CUDA_CHECK(cudaStreamSynchronize(stream));\n        benchmark::read_from_ptr(d_vectors_ptr, d_vectors, dim,\n                                 key_num_per_op_warmup, stream);\n        CUDA_CHECK(cudaStreamSynchronize(stream));\n        CUDA_CHECK(cudaFree(d_vectors_ptr));\n        break;\n      }\n      case API_Select::find_or_insert_ptr: {\n        V** d_vectors_ptr = nullptr;\n        bool* d_found;\n        CUDA_CHECK(cudaMalloc(&d_found, key_num_per_op_warmup * sizeof(bool)));\n        CUDA_CHECK(\n            cudaMalloc(&d_vectors_ptr, key_num_per_op_warmup * sizeof(V*)));\n        benchmark::array2ptr(d_vectors_ptr, d_vectors, dim,\n                             key_num_per_op_warmup, stream);\n        CUDA_CHECK(cudaStreamSynchronize(stream));\n        table->find_or_insert(key_num_per_op_warmup, d_keys, d_vectors_ptr,\n                              d_found, d_scores, stream);\n        CUDA_CHECK(cudaStreamSynchronize(stream));\n        CUDA_CHECK(cudaFree(d_vectors_ptr));\n        CUDA_CHECK(cudaFree(d_found));\n        break;\n      }\n      case API_Select::export_batch: {\n        size_t* d_dump_counter = nullptr;\n        CUDA_CHECK(cudaMalloc(&d_dump_counter, sizeof(size_t)));\n        CUDA_CHECK(cudaMemset(d_dump_counter, 0, sizeof(size_t)));\n\n        table->export_batch(key_num_per_op_warmup, 0, d_dump_counter, d_keys,\n                            d_vectors, d_scores, stream);\n        CUDA_CHECK(cudaStreamSynchronize(stream));\n        CUDA_CHECK(cudaFree(d_dump_counter));\n        break;\n      }\n      case 
API_Select::export_batch_if: {\n        size_t* d_dump_counter = nullptr;\n        CUDA_CHECK(cudaMalloc(&d_dump_counter, sizeof(size_t)));\n        CUDA_CHECK(cudaMemset(d_dump_counter, 0, sizeof(size_t)));\n        K pattern = 0;\n        table->template export_batch_if<ExportIfPredFunctor>(\n            pattern, threshold, key_num_per_op_warmup, 0, d_dump_counter,\n            d_keys, d_vectors, d_scores, stream);\n        CUDA_CHECK(cudaStreamSynchronize(stream));\n        CUDA_CHECK(cudaFree(d_dump_counter));\n        break;\n      }\n      case API_Select::contains: {\n        table->contains(1, d_keys, d_found, stream);\n        CUDA_CHECK(cudaStreamSynchronize(stream));\n        break;\n      }\n      default: {\n        std::cout << \"[Unsupport API]\\n\";\n      }\n    }\n  }\n  create_keys_for_hitrate<K, S>(h_keys, h_scores, key_num_per_op, hitrate,\n                                Hit_Mode::last_insert, start, true /*reset*/);\n  CUDA_CHECK(cudaMemcpy(d_keys, h_keys, key_num_per_op * sizeof(K),\n                        cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_scores_real, h_scores, key_num_per_op * sizeof(K),\n                        cudaMemcpyHostToDevice));\n  auto timer = benchmark::Timer<double>();\n  global_epoch++;\n  table->set_global_epoch(global_epoch);\n  switch (api) {\n    case API_Select::find: {\n      timer.start();\n      table->find(key_num_per_op, d_keys, d_vectors, d_found, d_scores, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      timer.end();\n      break;\n    }\n    case API_Select::insert_or_assign: {\n      timer.start();\n      table->insert_or_assign(key_num_per_op, d_keys, d_vectors, d_scores,\n                              stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      timer.end();\n      break;\n    }\n    case API_Select::find_or_insert: {\n      timer.start();\n      table->find_or_insert(key_num_per_op, d_keys, d_vectors, d_scores,\n                            stream);\n    
  CUDA_CHECK(cudaStreamSynchronize(stream));\n      timer.end();\n      break;\n    }\n    case API_Select::assign: {\n      timer.start();\n      table->assign(key_num_per_op, d_keys, d_def_val, d_scores, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      timer.end();\n      break;\n    }\n    case API_Select::insert_and_evict: {\n      timer.start();\n      table->insert_and_evict(key_num_per_op, d_keys, d_vectors, d_scores,\n                              d_evict_keys, d_def_val, d_evict_scores, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      timer.end();\n      break;\n    }\n    case API_Select::find_ptr: {\n      V** d_vectors_ptr = nullptr;\n      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, key_num_per_op * sizeof(V*)));\n      benchmark::array2ptr(d_vectors_ptr, d_vectors, dim, key_num_per_op,\n                           stream);\n\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      timer.start();\n      table->find(key_num_per_op, d_keys, d_vectors_ptr, d_found, d_scores,\n                  stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      timer.end();\n      benchmark::read_from_ptr(d_vectors_ptr, d_vectors, dim, key_num_per_op,\n                               stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      CUDA_CHECK(cudaFree(d_vectors_ptr));\n      break;\n    }\n    case API_Select::find_or_insert_ptr: {\n      V** d_vectors_ptr = nullptr;\n      bool* d_found;\n      CUDA_CHECK(cudaMalloc(&d_found, key_num_per_op * sizeof(bool)));\n      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, key_num_per_op * sizeof(V*)));\n      benchmark::array2ptr(d_vectors_ptr, d_vectors, dim, key_num_per_op,\n                           stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      timer.start();\n      table->find_or_insert(key_num_per_op, d_keys, d_vectors_ptr, d_found,\n                            d_scores, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      timer.end();\n      
CUDA_CHECK(cudaFree(d_vectors_ptr));\n      CUDA_CHECK(cudaFree(d_found));\n      break;\n    }\n    case API_Select::export_batch: {\n      size_t* d_dump_counter;\n\n      // Try to export close to but less than `key_num_per_op` data.\n      // It's normal to happen `illegal memory access` error occasionally.\n      float safe_ratio = 0.995;\n\n      CUDA_CHECK(cudaMalloc(&d_dump_counter, sizeof(size_t)));\n      CUDA_CHECK(cudaMemset(d_dump_counter, 0, sizeof(size_t)));\n      timer.start();\n      table->export_batch(key_num_per_op / target_load_factor * safe_ratio, 0,\n                          d_dump_counter, d_keys, d_vectors, d_scores, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      timer.end();\n      CUDA_CHECK(cudaFree(d_dump_counter));\n      break;\n    }\n    case API_Select::export_batch_if: {\n      size_t* d_dump_counter;\n\n      // Try to export close to but less than `key_num_per_op` data.\n      // It's normal to happen `illegal memory access` error occasionally.\n      float safe_ratio = 0.995;\n\n      CUDA_CHECK(cudaMalloc(&d_dump_counter, sizeof(size_t)));\n      CUDA_CHECK(cudaMemset(d_dump_counter, 0, sizeof(size_t)));\n      timer.start();\n      K pattern = 0;\n      table->template export_batch_if<ExportIfPredFunctor>(\n          pattern, threshold, key_num_per_op / target_load_factor * safe_ratio,\n          0, d_dump_counter, d_keys, d_vectors, d_scores, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      timer.end();\n      CUDA_CHECK(cudaFree(d_dump_counter));\n      break;\n    }\n    case API_Select::contains: {\n      timer.start();\n      table->contains(key_num_per_op, d_keys, d_found, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      timer.end();\n      break;\n    }\n    default: {\n      std::cout << \"[Unsupport API]\\n\";\n    }\n  }\n\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFreeHost(h_keys));\n  CUDA_CHECK(cudaFreeHost(h_scores));\n  
CUDA_CHECK(cudaFreeHost(h_found));\n\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_scores_real));\n  CUDA_CHECK(cudaFree(d_vectors));\n  CUDA_CHECK(cudaFree(d_def_val));\n  CUDA_CHECK(cudaFree(d_vectors_ptr));\n  CUDA_CHECK(cudaFree(d_found));\n  CUDA_CHECK(cudaFree(d_evict_keys));\n  CUDA_CHECK(cudaFree(d_evict_scores));\n\n  CUDA_CHECK(cudaDeviceSynchronize());\n  CudaCheckError();\n\n  float througput =\n      key_num_per_op / timer.getResult() / (1024 * 1024 * 1024.0f);\n  return througput;\n}\n\nstatic Test_Mode test_mode = Test_Mode::pure_hbm;\n\nvoid print_title_a() {\n  cout << endl\n       << \"|    \\u03BB \" << \"| insert_or_assign \" << \"|   find \"\n       << \"| find_or_insert \" << \"| assign \" << \"|  find* \"\n       << \"| find_or_insert* \";\n  if (Test_Mode::pure_hbm == test_mode) {\n    cout << \"| insert_and_evict \";\n  }\n  cout << \"|\\n\";\n\n  //<< \"| load_factor \"\n  cout << \"|-----:\"\n       //<< \"| insert_or_assign \"\n       << \"|-----------------:\"\n       //<< \"|   find \"\n       << \"|-------:\"\n       //<< \"| find_or_insert \"\n       << \"|---------------:\"\n       //<< \"| assign \"\n       << \"|-------:\"\n       //<< \"|   find* \"\n       << \"|-------:\"\n       //<< \"| find_or_insert* \"\n       << \"|----------------:\";\n  if (Test_Mode::pure_hbm == test_mode) {\n    //<< \"| insert_and_evict \"\n    cout << \"|-----------------:\";\n  }\n  cout << \"|\\n\";\n}\n\nvoid print_title_b() {\n  cout << endl\n       << \"|    \\u03BB \" << \"| export_batch \" << \"| export_batch_if \"\n       << \"|  contains \";\n  cout << \"|\\n\";\n\n  //<< \"| load_factor \"\n  cout << \"|-----:\"\n       //<< \"| export_batch \"\n       << \"|-------------:\"\n       //<< \"| export_batch_if \"\n       << \"|----------------:\"\n       //<< \"|  contains \"\n       << \"|----------:\";\n  cout << \"|\\n\";\n}\n\nvoid test_main(std::vector<API_Select>& apis, const size_t dim,\n               const size_t 
init_capacity = 64 * 1024 * 1024UL,\n               const size_t key_num_per_op = 1 * 1024 * 1024UL,\n               const size_t hbm4values = 16, const float load_factor = 1.0f,\n               const bool io_by_cpu = false,\n               const std::vector<float> load_factors = {0.50f, 0.75f, 1.00f}) {\n  size_t free, total;\n  CUDA_CHECK(cudaSetDevice(0));\n  CUDA_CHECK(cudaMemGetInfo(&free, &total));\n\n  if (free / (1 << 30) < hbm4values) {\n    std::cout << \"free HBM is not enough, ignore current benchmark!\"\n              << std::endl;\n    return;\n  }\n  TableOptions options;\n\n  options.init_capacity = init_capacity;\n  options.max_capacity = init_capacity;\n  options.dim = dim;\n  options.max_hbm_for_vectors = nv::merlin::GB(hbm4values);\n  options.io_by_cpu = io_by_cpu;\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru, Sm80>;\n\n  std::shared_ptr<Table> table = std::make_shared<Table>();\n  table->init(options);\n\n  for (float load_factor : load_factors) {\n    std::cout << \"|\" << rep(1) << fixed << setprecision(2) << load_factor\n              << \" \";\n\n    for (auto api : apis) {\n      table->clear();\n      CUDA_CHECK(cudaDeviceSynchronize());\n      // There is a sampling of load_factor after several times call to target\n      // API. 
Two consecutive calls can avoid the impact of sampling.\n      auto res1 = test_one_api<Table>(table, api, dim, init_capacity,\n                                      key_num_per_op, load_factor);\n      auto res2 = test_one_api<Table>(table, api, dim, init_capacity,\n                                      key_num_per_op, load_factor);\n      auto res = std::max(res1, res2);\n      std::cout << \"|\";\n      switch (api) {\n        case API_Select::find: {\n          std::cout << rep(1);\n          break;\n        }\n        case API_Select::insert_or_assign: {\n          std::cout << rep(11);\n          break;\n        }\n        case API_Select::find_or_insert: {\n          std::cout << rep(9);\n          break;\n        }\n        case API_Select::assign: {\n          std::cout << rep(1);\n          break;\n        }\n        case API_Select::insert_and_evict: {\n          std::cout << rep(11);\n          break;\n        }\n        case API_Select::find_ptr: {\n          std::cout << rep(1);\n          break;\n        }\n        case API_Select::find_or_insert_ptr: {\n          std::cout << rep(10);\n          break;\n        }\n        case API_Select::export_batch: {\n          std::cout << rep(7);\n          break;\n        }\n        case API_Select::export_batch_if: {\n          std::cout << rep(10);\n          break;\n        }\n        case API_Select::contains: {\n          std::cout << rep(4);\n          break;\n        }\n        default: {\n          std::cout << \"[Unsupport API]\";\n        }\n      }\n      std::cout << fixed << setprecision(3) << setw(6) << setfill(' ') << res\n                << \" \";\n    }\n    std::cout << \"|\\n\";\n  }\n}\n\nint main() {\n  size_t key_num_per_op = 1 * 1024 * 1024UL;\n  cudaDeviceProp props;\n  CUDA_CHECK(cudaGetDeviceProperties(&props, 0));\n  cout << endl\n       << \"## Benchmark\" << endl\n       << endl\n       << \"* GPU: 1 x \" << props.name << \": \" << props.major << \".\"\n       << props.minor << 
endl\n       << \"* Key Type = uint64_t\" << endl\n       << \"* Value Type = float32 * {dim}\" << endl\n       << \"* Key-Values per OP = \" << key_num_per_op << endl\n       << \"* Evict strategy: LRU\" << endl\n       << \"* `\\u03BB`\" << \": load factor\" << endl\n       << \"* `find*` means the `find` API that directly returns the addresses \"\n          \"of values.\"\n       << endl\n       << \"* `find_or_insert*` means the `find_or_insert` API that directly \"\n          \"returns the addresses of values.\"\n       << endl\n       << \"* ***Throughput Unit: Billion-KV/second***\" << endl\n       << endl;\n  auto print_configuration = [](const size_t dim, const size_t init_capacity,\n                                const size_t hbm4values) {\n    using V = float;\n    int32_t capacity = static_cast<int32_t>(init_capacity / (1024 * 1024));\n    size_t hmem4values = init_capacity * dim * sizeof(V) / (1024 * 1024 * 1024);\n    hmem4values = hmem4values < hbm4values ? 0 : (hmem4values - hbm4values);\n    cout << \"\\n* dim = \" << dim << \", \" << \"capacity = \" << capacity\n         << \" Million-KV, \" << \"HBM = \" << hbm4values << \" GB, \"\n         << \"HMEM = \" << hmem4values << \" GB\\n\";\n  };\n\n  try {\n    {\n      std::vector<API_Select> apis_a{\n          API_Select::insert_or_assign, API_Select::find,\n          API_Select::find_or_insert,   API_Select::assign,\n          API_Select::find_ptr,         API_Select::find_or_insert_ptr,\n          API_Select::insert_and_evict};\n\n      std::vector<API_Select> apis_b{API_Select::export_batch,\n                                     API_Select::export_batch_if,\n                                     API_Select::contains};\n      test_mode = Test_Mode::pure_hbm;\n\n      cout << \"### On pure HBM mode: \" << endl;\n      print_configuration(8, 128 * 1024 * 1024UL, 4);\n      print_title_a();\n      test_main(apis_a, 8, 128 * 1024 * 1024UL, key_num_per_op, 4);\n\n      print_title_b();\n      
test_main(apis_b, 8, 128 * 1024 * 1024UL, key_num_per_op, 4);\n\n      print_configuration(32, 128 * 1024 * 1024UL, 16);\n      print_title_a();\n      test_main(apis_a, 32, 128 * 1024 * 1024UL, key_num_per_op, 16);\n\n      print_title_b();\n      test_main(apis_b, 32, 128 * 1024 * 1024UL, key_num_per_op, 16);\n\n      print_configuration(64, 64 * 1024 * 1024UL, 16);\n      print_title_a();\n      test_main(apis_a, 64, 64 * 1024 * 1024UL, key_num_per_op, 16);\n\n      print_title_b();\n      test_main(apis_b, 64, 64 * 1024 * 1024UL, key_num_per_op, 16);\n\n      cout << endl;\n    }\n\n    {\n      std::vector<API_Select> apis_a{\n          API_Select::insert_or_assign, API_Select::find,\n          API_Select::find_or_insert,   API_Select::assign,\n          API_Select::find_ptr,         API_Select::find_or_insert_ptr};\n\n      std::vector<API_Select> apis_b{API_Select::export_batch,\n                                     API_Select::export_batch_if,\n                                     API_Select::contains};\n\n      cout << \"### On HBM+HMEM hybrid mode: \" << endl;\n      test_mode = Test_Mode::hybrid;\n      print_configuration(64, 128 * 1024 * 1024UL, 16);\n      print_title_a();\n      test_main(apis_a, 64, 128 * 1024 * 1024UL, key_num_per_op, 16);\n\n      print_title_b();\n      test_main(apis_b, 64, 128 * 1024 * 1024UL, key_num_per_op, 16);\n\n      print_configuration(64, 512 * 1024 * 1024UL, 32);\n      print_title_a();\n      test_main(apis_a, 64, 512 * 1024 * 1024UL, key_num_per_op, 32);\n\n      print_title_b();\n      test_main(apis_b, 64, 512 * 1024 * 1024UL, key_num_per_op, 32);\n      cout << endl;\n    }\n\n    CUDA_CHECK(cudaDeviceSynchronize());\n  } catch (const nv::merlin::CudaException& e) {\n    cerr << e.what() << endl;\n  }\n  CUDA_CHECK(cudaDeviceSynchronize());\n  return 0;\n}\n"
  },
  {
    "path": "build_deps/gpus/BUILD",
    "content": ""
  },
  {
    "path": "build_deps/gpus/check_cuda_libs.py",
    "content": "# Copyright (c) 2023, NVIDIA CORPORATION.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\n\"\"\"Verifies that a list of libraries is installed on the system.\n\nTakes a list of arguments with every two subsequent arguments being a logical\ntuple of (path, check_soname). The path to the library and either True or False\nto indicate whether to check the soname field on the shared library.\n\nExample Usage:\n./check_cuda_libs.py /path/to/lib1.so True /path/to/lib2.so False\n\"\"\"\nimport os\nimport os.path\nimport platform\nimport subprocess\nimport sys\n\n# pylint: disable=g-import-not-at-top,g-importing-member\ntry:\n    from shutil import which\nexcept ImportError:\n    from distutils.spawn import find_executable as which\n# pylint: enable=g-import-not-at-top,g-importing-member\n\n\nclass ConfigError(Exception):\n    pass\n\n\ndef check_cuda_lib(path, check_soname=True):\n    \"\"\"Tests if a library exists on disk and whether its soname matches the filename.\n\n  Args:\n    path: the path to the library.\n    check_soname: whether to check the soname as well.\n\n  Raises:\n    ConfigError: If the library does not exist or if its soname does not match\n    the filename.\n  \"\"\"\n    if not os.path.isfile(path):\n        raise ConfigError(\"No library found under: \" + path)\n    objdump = which(\"objdump\")\n    if check_soname and objdump is not None:\n        # Decode is necessary as in py3 the return type changed from str to bytes\n  
      output = subprocess.check_output([objdump, \"-p\", path]).decode(\"utf-8\")\n        output = [line for line in output.splitlines() if \"SONAME\" in line]\n        sonames = [line.strip().split(\" \")[-1] for line in output]\n        if not any(soname == os.path.basename(path) for soname in sonames):\n            raise ConfigError(\"None of the libraries match their SONAME: \" +\n                              path)\n\n\ndef main():\n    try:\n        args = [argv for argv in sys.argv[1:]]\n        if len(args) % 2 == 1:\n            raise ConfigError(\"Expected even number of arguments\")\n        checked_paths = []\n        for i in range(0, len(args), 2):\n            path = args[i]\n            check_cuda_lib(path, check_soname=args[i + 1] == \"True\")\n            checked_paths.append(path)\n        # pylint: disable=superfluous-parens\n        print(os.linesep.join(checked_paths))\n        # pylint: enable=superfluous-parens\n    except ConfigError as e:\n        sys.stderr.write(str(e))\n        sys.exit(1)\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "build_deps/gpus/configure.bzl",
    "content": "\"\"\"Repository rule for CUDA autoconfiguration.\n\n`cuda_configure` depends on the following environment variables:\n\n  * `NEED_CUDA`: Whether to enable building with CUDA.\n  * `GCC_HOST_COMPILER_PATH`: The GCC host compiler path\n  * `SYSROOT`: The sysroot to use when compiling.\n  * `CUDA_PATHS`: The base paths to look for CUDA and cuDNN. Default is\n    `/usr/local/cuda,usr/`.\n  * `CUDA_TOOLKIT_PATH` (deprecated): The path to the CUDA toolkit. Default is\n    `/usr/local/cuda`.\n  * `CUDA_VERSION`: The version of the CUDA toolkit. If this is blank, then\n    use the system default.\n  * `CUDNN_VERSION`: The version of the cuDNN library.\n  * `CUDNN_INSTALL_PATH` (deprecated): The path to the cuDNN library. Default is\n    `/usr/local/cuda`.\n  * `CUDA_COMPUTE_CAPABILITIES`: The CUDA compute capabilities. Default is\n    `3.5,5.2`.\n  * `PYTHON_BIN_PATH`: The python binary path\n\"\"\"\n\nload(\n    \"@bazel_tools//tools/cpp:lib_cc_configure.bzl\",\n    \"escape_string\",\n    \"get_env_var\",\n)\nload(\n    \"//build_deps/remote_config:common.bzl\",\n    \"config_repo_label\",\n    \"err_out\",\n    \"execute\",\n    \"get_bash_bin\",\n    \"get_cpu_value\",\n    \"get_host_environ\",\n    \"get_python_bin\",\n    \"raw_exec\",\n    \"read_dir\",\n    \"realpath\",\n    \"which\",\n)\n\n_GCC_HOST_COMPILER_PATH = \"GCC_HOST_COMPILER_PATH\"\n_GCC_HOST_COMPILER_PREFIX = \"GCC_HOST_COMPILER_PREFIX\"\n_SYSROOT = \"SYSROOT\"\n_CUDA_TOOLKIT_PATH = \"CUDA_TOOLKIT_PATH\"\n_CUDA_VERSION = \"CUDA_VERSION\"\n_CUDNN_VERSION = \"CUDNN_VERSION\"\n_CUDNN_INSTALL_PATH = \"CUDNN_INSTALL_PATH\"\n_CUDA_COMPUTE_CAPABILITIES = \"CUDA_COMPUTE_CAPABILITIES\"\n_CUDA_CONFIG_REPO = \"CUDA_CONFIG_REPO\"\n_PYTHON_BIN_PATH = \"PYTHON_BIN_PATH\"\n\n_TENSORRT_VERSION = \"TENSORRT_VERSION\"\n_TENSORRT_INSTALL_PATH = \"TENSORRT_INSTALL_PATH\"\n_TENSORRT_STATIC_PATH = \"TENSORRT_STATIC_PATH\"\n_TENSORRT_LIBS = [\n    \"nvinfer\",\n    \"nvinfer_plugin\",\n    
\"nvonnxparser\",\n    \"nvparsers\",\n]\n_TENSORRT_HEADERS = [\n    \"NvInfer.h\",\n    \"NvUtils.h\",\n    \"NvInferPlugin.h\",\n]\n_TENSORRT_HEADERS_V6 = [\n    \"NvInfer.h\",\n    \"NvUtils.h\",\n    \"NvInferPlugin.h\",\n    \"NvInferVersion.h\",\n    \"NvInferRuntime.h\",\n    \"NvInferRuntimeCommon.h\",\n    \"NvInferPluginUtils.h\",\n    \"NvOnnxParser.h\",\n    \"NvOnnxConfig.h\",\n]\n_TENSORRT_HEADERS_V8 = [\n    \"NvInfer.h\",\n    \"NvInferLegacyDims.h\",\n    \"NvInferImpl.h\",\n    \"NvUtils.h\",\n    \"NvInferPlugin.h\",\n    \"NvInferVersion.h\",\n    \"NvInferRuntime.h\",\n    \"NvInferRuntimeCommon.h\",\n    \"NvInferPluginUtils.h\",\n    \"NvOnnxParser.h\",\n    \"NvOnnxConfig.h\",\n]\n\ndef _at_least_version(actual_version, required_version):\n    actual = [int(v) for v in actual_version.split(\".\")]\n    required = [int(v) for v in required_version.split(\".\")]\n    return actual >= required\n\ndef _get_tensorrt_headers(tensorrt_version):\n    if _at_least_version(tensorrt_version, \"8\"):\n        return _TENSORRT_HEADERS_V8\n    if _at_least_version(tensorrt_version, \"6\"):\n        return _TENSORRT_HEADERS_V6\n    return _TENSORRT_HEADERS\n\ndef to_list_of_strings(elements):\n    \"\"\"Convert the list of [\"a\", \"b\", \"c\"] into '\"a\", \"b\", \"c\"'.\n\n    This is to be used to put a list of strings into the bzl file templates\n    so it gets interpreted as list of strings in Starlark.\n\n    Args:\n      elements: list of string elements\n\n    Returns:\n      single string of elements wrapped in quotes separated by a comma.\"\"\"\n    quoted_strings = [\"\\\"\" + element + \"\\\"\" for element in elements]\n    return \", \".join(quoted_strings)\n\ndef verify_build_defines(params):\n    \"\"\"Verify all variables that crosstool/BUILD.tpl expects are substituted.\n\n    Args:\n      params: dict of variables that will be passed to the BUILD.tpl template.\n    \"\"\"\n    missing = []\n    for param in [\n        
\"cxx_builtin_include_directories\",\n        \"extra_no_canonical_prefixes_flags\",\n        \"host_compiler_path\",\n        \"host_compiler_prefix\",\n        \"host_compiler_warnings\",\n        \"linker_bin_path\",\n        \"compiler_deps\",\n        \"unfiltered_compile_flags\",\n    ]:\n        if (\"%{\" + param + \"}\") not in params:\n            missing.append(param)\n\n    if missing:\n        auto_configure_fail(\n            \"BUILD.tpl template is missing these variables: \" + str(missing) +\n            \".\\nWe only got: \" + str(params) + \".\",\n        )\n\n# TODO(dzc): Once these functions have been factored out of Bazel's\n# cc_configure.bzl, load them from @bazel_tools instead.\n# BEGIN cc_configure common functions.\ndef find_cc(repository_ctx):\n    \"\"\"Find the C++ compiler.\"\"\"\n    target_cc_name = \"gcc\"\n    cc_path_envvar = _GCC_HOST_COMPILER_PATH\n    cc_name = target_cc_name\n\n    cc_name_from_env = get_host_environ(repository_ctx, cc_path_envvar)\n    if cc_name_from_env:\n        cc_name = cc_name_from_env\n    if cc_name.startswith(\"/\"):\n        # Absolute path, maybe we should make this supported by our which function.\n        return cc_name\n    cc = which(repository_ctx, cc_name)\n    if cc == None:\n        fail((\"Cannot find {}, either correct your path or set the {}\" +\n              \" environment variable\").format(target_cc_name, cc_path_envvar))\n    return cc\n\n_INC_DIR_MARKER_BEGIN = \"#include <...>\"\n\n# OSX add \" (framework directory)\" at the end of line, strip it.\n_OSX_FRAMEWORK_SUFFIX = \" (framework directory)\"\n_OSX_FRAMEWORK_SUFFIX_LEN = len(_OSX_FRAMEWORK_SUFFIX)\n\ndef _cxx_inc_convert(path):\n    \"\"\"Convert path returned by cc -E xc++ in a complete path.\"\"\"\n    path = path.strip()\n    if path.endswith(_OSX_FRAMEWORK_SUFFIX):\n        path = path[:-_OSX_FRAMEWORK_SUFFIX_LEN].strip()\n    return path\n\ndef _normalize_include_path(repository_ctx, path):\n    \"\"\"Normalizes include 
paths before writing them to the crosstool.\n\n      If path points inside the 'crosstool' folder of the repository, a relative\n      path is returned.\n      If path points outside the 'crosstool' folder, an absolute path is returned.\n      \"\"\"\n    path = str(repository_ctx.path(path))\n    crosstool_folder = str(repository_ctx.path(\".\").get_child(\"crosstool\"))\n\n    if path.startswith(crosstool_folder):\n        # We drop the path to \"$REPO/crosstool\" and a trailing path separator.\n        return path[len(crosstool_folder) + 1:]\n    return path\n\ndef _is_compiler_option_supported(repository_ctx, cc, option):\n    \"\"\"Checks that `option` is supported by the C compiler. Doesn't %-escape the option.\"\"\"\n    result = repository_ctx.execute([\n        cc,\n        option,\n        \"-o\",\n        \"/dev/null\",\n        \"-c\",\n        str(repository_ctx.path(\"tools/cpp/empty.cc\")),\n    ])\n    return result.stderr.find(option) == -1\n\ndef _get_cxx_inc_directories_impl(repository_ctx, cc, lang_is_cpp, tf_sysroot):\n    \"\"\"Compute the list of default C or C++ include directories.\"\"\"\n    if lang_is_cpp:\n        lang = \"c++\"\n    else:\n        lang = \"c\"\n    sysroot = []\n    if tf_sysroot:\n        sysroot += [\"--sysroot\", tf_sysroot]\n    result = raw_exec(\n        repository_ctx,\n        [cc, \"-E\", \"-x\" + lang, \"-\", \"-v\"] + sysroot,\n    )\n    stderr = err_out(result)\n    index1 = stderr.find(_INC_DIR_MARKER_BEGIN)\n    if index1 == -1:\n        return []\n    index1 = stderr.find(\"\\n\", index1)\n    if index1 == -1:\n        return []\n    index2 = stderr.rfind(\"\\n \")\n    if index2 == -1 or index2 < index1:\n        return []\n    index2 = stderr.find(\"\\n\", index2 + 1)\n    if index2 == -1:\n        inc_dirs = stderr[index1 + 1:]\n    else:\n        inc_dirs = stderr[index1 + 1:index2].strip()\n\n    print_resource_dir_supported = _is_compiler_option_supported(\n        repository_ctx,\n        cc,\n    
    \"-print-resource-dir\",\n    )\n\n    if print_resource_dir_supported:\n        resource_dir = repository_ctx.execute(\n            [cc, \"-print-resource-dir\"],\n        ).stdout.strip() + \"/share\"\n        inc_dirs += \"\\n\" + resource_dir\n\n    return [\n        _normalize_include_path(repository_ctx, _cxx_inc_convert(p))\n        for p in inc_dirs.split(\"\\n\")\n    ]\n\ndef get_cxx_inc_directories(repository_ctx, cc, tf_sysroot):\n    \"\"\"Compute the list of default C and C++ include directories.\"\"\"\n\n    includes_cpp = _get_cxx_inc_directories_impl(\n        repository_ctx,\n        cc,\n        True,\n        tf_sysroot,\n    )\n    includes_c = _get_cxx_inc_directories_impl(\n        repository_ctx,\n        cc,\n        False,\n        tf_sysroot,\n    )\n\n    return includes_cpp + [\n        inc\n        for inc in includes_c\n        if inc not in includes_cpp\n    ]\n\ndef auto_configure_fail(msg):\n    \"\"\"Output failure message when cuda configuration fails.\"\"\"\n    red = \"\\033[0;31m\"\n    no_color = \"\\033[0m\"\n    fail(\"\\n%sCuda Configuration Error:%s %s\\n\" % (red, no_color, msg))\n\n# END cc_configure common functions (see TODO above).\n\ndef _cuda_include_path(repository_ctx, cuda_config):\n    \"\"\"Generates the Starlark string with cuda include directories.\n\n      Args:\n        repository_ctx: The repository context.\n        cc: The path to the gcc host compiler.\n\n      Returns:\n        A list of the gcc host compiler include directories.\n      \"\"\"\n    nvcc_path = repository_ctx.path(\"%s/bin/nvcc%s\" % (\n        cuda_config.cuda_toolkit_path,\n        \".exe\" if cuda_config.cpu_value == \"Windows\" else \"\",\n    ))\n\n    # The expected exit code of this command is non-zero. Bazel remote execution\n    # only caches commands with zero exit code. So force a zero exit code.\n    cmd = \"%s -v /dev/null -o /dev/null ; [ $? 
-eq 1 ]\" % str(nvcc_path)\n    result = raw_exec(\n        repository_ctx,\n        [get_bash_bin(repository_ctx), \"-c\", cmd],\n    )\n    target_dir = \"\"\n    for one_line in err_out(result).splitlines():\n        if one_line.startswith(\"#$ _TARGET_DIR_=\"):\n            target_dir = (cuda_config.cuda_toolkit_path + \"/\" +\n                          one_line.replace(\n                              \"#$ _TARGET_DIR_=\",\n                              \"\",\n                          ) + \"/include\")\n    inc_entries = []\n    if target_dir != \"\":\n        inc_entries.append(realpath(repository_ctx, target_dir))\n    inc_entries.append(\n        realpath(repository_ctx, cuda_config.cuda_toolkit_path + \"/include\"),\n    )\n    return inc_entries\n\ndef matches_version(environ_version, detected_version):\n    \"\"\"Checks whether the user-specified version matches the detected version.\n\n      This function performs a weak matching so that if the user specifies only\n      the\n      major or major and minor versions, the versions are still considered\n      matching\n      if the version parts match. 
To illustrate:\n\n          environ_version  detected_version  result\n          -----------------------------------------\n          5.1.3            5.1.3             True\n          5.1              5.1.3             True\n          5                5.1               True\n          5.1.3            5.1               False\n          5.2.3            5.1.3             False\n\n      Args:\n        environ_version: The version specified by the user via environment\n          variables.\n        detected_version: The version autodetected from the CUDA installation on\n          the system.\n      Returns: True if user-specified version matches detected version and False\n        otherwise.\n    \"\"\"\n    environ_version_parts = environ_version.split(\".\")\n    detected_version_parts = detected_version.split(\".\")\n    if len(detected_version_parts) < len(environ_version_parts):\n        return False\n    for i, part in enumerate(detected_version_parts):\n        if i >= len(environ_version_parts):\n            break\n        if part != environ_version_parts[i]:\n            return False\n    return True\n\n_NVCC_VERSION_PREFIX = \"Cuda compilation tools, release \"\n\n_DEFINE_CUDNN_MAJOR = \"#define CUDNN_MAJOR\"\n\ndef compute_capabilities(repository_ctx):\n    \"\"\"Returns a list of strings representing cuda compute capabilities.\n\n    Args:\n      repository_ctx: the repo rule's context.\n    Returns: list of cuda architectures to compile for. 
'compute_xy' refers to\n      both PTX and SASS, 'sm_xy' refers to SASS only.\n    \"\"\"\n    capabilities = get_host_environ(\n        repository_ctx,\n        _CUDA_COMPUTE_CAPABILITIES,\n        \"compute_35,compute_52\",\n    ).split(\",\")\n\n    # Map old 'x.y' capabilities to 'compute_xy'.\n    if len(capabilities) > 0 and all(\n        [len(x.split(\".\")) == 2 for x in capabilities],\n    ):\n        # If all capabilities are in 'x.y' format, only include PTX for the\n        # highest capability.\n        cc_list = sorted([x.replace(\".\", \"\") for x in capabilities])\n        capabilities = [\n            \"sm_%s\" % x\n            for x in cc_list[:-1]\n        ] + [\"compute_%s\" % cc_list[-1]]\n    for i, capability in enumerate(capabilities):\n        parts = capability.split(\".\")\n        if len(parts) != 2:\n            continue\n        capabilities[i] = \"compute_%s%s\" % (parts[0], parts[1])\n\n    # Make list unique\n    capabilities = dict(zip(capabilities, capabilities)).keys()\n\n    # Validate capabilities.\n    for capability in capabilities:\n        if not capability.startswith((\"compute_\", \"sm_\")):\n            auto_configure_fail(\"Invalid compute capability: %s\" % capability)\n        for prefix in [\"compute_\", \"sm_\"]:\n            if not capability.startswith(prefix):\n                continue\n            if len(capability) == len(prefix) + 2 and capability[-2:].isdigit(\n            ):\n                continue\n            auto_configure_fail(\"Invalid compute capability: %s\" % capability)\n\n    return capabilities\n\ndef lib_name(base_name, cpu_value, version = None, static = False):\n    \"\"\"Constructs the platform-specific name of a library.\n\n      Args:\n        base_name: The name of the library, such as \"cudart\"\n        cpu_value: The name of the host operating system.\n        version: The version of the library.\n        static: True the library is static or False if it is a shared object.\n\n      
Returns:\n        The platform-specific name of the library.\n      \"\"\"\n    version = \"\" if not version else \".\" + version\n    if cpu_value in (\"Linux\", \"FreeBSD\"):\n        if static:\n            return \"lib%s.a\" % base_name\n        return \"lib%s.so%s\" % (base_name, version)\n    elif cpu_value == \"Windows\":\n        return \"%s.lib\" % base_name\n    elif cpu_value == \"Darwin\":\n        if static:\n            return \"lib%s.a\" % base_name\n        return \"lib%s%s.dylib\" % (base_name, version)\n    else:\n        auto_configure_fail(\"Invalid cpu_value: %s\" % cpu_value)\n\ndef _lib_path(lib, cpu_value, basedir, version, static):\n    file_name = lib_name(lib, cpu_value, version, static)\n    return \"%s/%s\" % (basedir, file_name)\n\ndef _should_check_soname(version, static):\n    return version and not static\n\ndef _check_cuda_lib_params(lib, cpu_value, basedir, version, static = False):\n    return (\n        _lib_path(lib, cpu_value, basedir, version, static),\n        _should_check_soname(version, static),\n    )\n\ndef _check_cuda_libs(repository_ctx, script_path, libs):\n    python_bin = get_python_bin(repository_ctx)\n    contents = repository_ctx.read(script_path).splitlines()\n\n    cmd = \"from os import linesep;\"\n    cmd += \"f = open('script.py', 'w');\"\n    for line in contents:\n        cmd += \"f.write('%s' + linesep);\" % line\n    cmd += \"f.close();\"\n    cmd += \"from os import system;\"\n    args = \" \".join([\"\\\"\" + path + \"\\\" \" + str(check) for path, check in libs])\n    cmd += \"system('%s script.py %s');\" % (python_bin, args)\n\n    all_paths = [path for path, _ in libs]\n    checked_paths = execute(\n        repository_ctx,\n        [python_bin, \"-c\", cmd],\n    ).stdout.splitlines()\n\n    # Filter out empty lines from splitting on '\\r\\n' on Windows\n    checked_paths = [path for path in checked_paths if len(path) > 0]\n    if all_paths != checked_paths:\n        auto_configure_fail(\n         
   \"Error with installed CUDA libs. Expected '%s'. Actual '%s'.\" %\n            (all_paths, checked_paths),\n        )\n\ndef _find_libs(repository_ctx, check_cuda_libs_script, cuda_config):\n    \"\"\"Returns the CUDA and cuDNN libraries on the system.\n\n      Also, verifies that the script actually exist.\n\n      Args:\n        repository_ctx: The repository context.\n        check_cuda_libs_script: The path to a script verifying that the cuda\n          libraries exist on the system.\n        cuda_config: The CUDA config as returned by _get_cuda_config\n\n      Returns:\n        Map of library names to structs of filename and path.\n      \"\"\"\n    cpu_value = cuda_config.cpu_value\n    stub_dir = \"/stubs\"\n\n    check_cuda_libs_params = {\n        \"cuda\": _check_cuda_lib_params(\n            \"cuda\",\n            cpu_value,\n            cuda_config.config[\"cuda_library_dir\"] + stub_dir,\n            version = None,\n            static = False,\n        ),\n        \"cudart\": _check_cuda_lib_params(\n            \"cudart\",\n            cpu_value,\n            cuda_config.config[\"cuda_library_dir\"],\n            cuda_config.cudart_version,\n            static = False,\n        ),\n        \"cudart_static\": _check_cuda_lib_params(\n            \"cudart_static\",\n            cpu_value,\n            cuda_config.config[\"cuda_library_dir\"],\n            cuda_config.cudart_version,\n            static = True,\n        ),\n        \"cublas\": _check_cuda_lib_params(\n            \"cublas\",\n            cpu_value,\n            cuda_config.config[\"cublas_library_dir\"],\n            cuda_config.cublas_version,\n            static = False,\n        ),\n        \"cublasLt\": _check_cuda_lib_params(\n            \"cublasLt\",\n            cpu_value,\n            cuda_config.config[\"cublas_library_dir\"],\n            cuda_config.cublas_version,\n            static = False,\n        ),\n        \"cusolver\": _check_cuda_lib_params(\n            
\"cusolver\",\n            cpu_value,\n            cuda_config.config[\"cusolver_library_dir\"],\n            cuda_config.cusolver_version,\n            static = False,\n        ),\n        \"curand\": _check_cuda_lib_params(\n            \"curand\",\n            cpu_value,\n            cuda_config.config[\"curand_library_dir\"],\n            cuda_config.curand_version,\n            static = False,\n        ),\n        \"cufft\": _check_cuda_lib_params(\n            \"cufft\",\n            cpu_value,\n            cuda_config.config[\"cufft_library_dir\"],\n            cuda_config.cufft_version,\n            static = False,\n        ),\n        \"cudnn\": _check_cuda_lib_params(\n            \"cudnn\",\n            cpu_value,\n            cuda_config.config[\"cudnn_library_dir\"],\n            cuda_config.cudnn_version,\n            static = False,\n        ),\n        \"cupti\": _check_cuda_lib_params(\n            \"cupti\",\n            cpu_value,\n            cuda_config.config[\"cupti_library_dir\"],\n            cuda_config.cupti_version,\n            static = False,\n        ),\n        \"cusparse\": _check_cuda_lib_params(\n            \"cusparse\",\n            cpu_value,\n            cuda_config.config[\"cusparse_library_dir\"],\n            cuda_config.cusparse_version,\n            static = False,\n        ),\n    }\n\n    # Verify that the libs actually exist at their locations.\n    _check_cuda_libs(\n        repository_ctx,\n        check_cuda_libs_script,\n        check_cuda_libs_params.values(),\n    )\n\n    paths = {\n        filename: v[0]\n        for (filename, v) in check_cuda_libs_params.items()\n    }\n    return paths\n\ndef _cudart_static_linkopt(cpu_value):\n    \"\"\"Returns additional platform-specific linkopts for cudart.\"\"\"\n    return \"\" if cpu_value == \"Darwin\" else \"\\\"-lrt\\\",\"\n\ndef _exec_find_cuda_config(repository_ctx, script_path, cuda_libraries):\n    python_bin = get_python_bin(repository_ctx)\n    cmd = \"from 
os import system;\" + \"system('\\\"%s\\\" %s %s');\" % (\n        python_bin,\n        script_path,\n        \" \".join(cuda_libraries),\n    )\n    return execute(repository_ctx, [python_bin, \"-c\", cmd])\n\n# TODO(csigg): Only call once instead of from here, tensorrt_configure.bzl,\n# and nccl_configure.bzl.\ndef find_cuda_config(repository_ctx, script_path, cuda_libraries):\n    \"\"\"Returns CUDA config dictionary from running find_cuda_config.py\"\"\"\n    exec_result = _exec_find_cuda_config(\n        repository_ctx,\n        script_path,\n        cuda_libraries,\n    )\n\n    if exec_result.return_code:\n        auto_configure_fail(\"Failed to run find_cuda_config.py: %s\" %\n                            err_out(exec_result))\n\n    # Parse the dict from stdout.\n    return dict(\n        [tuple(x.split(\": \")) for x in exec_result.stdout.splitlines()],\n    )\n\ndef _get_cuda_config(repository_ctx, find_cuda_config_script):\n    \"\"\"Detects and returns information about the CUDA installation on the system.\n\n      Args:\n        repository_ctx: The repository context.\n\n      Returns:\n        A struct containing the following fields:\n          cuda_toolkit_path: The CUDA toolkit installation directory.\n          cudnn_install_basedir: The cuDNN installation directory.\n          cuda_version: The version of CUDA on the system.\n          cudart_version: The CUDA runtime version on the system.\n          cudnn_version: The version of cuDNN on the system.\n          compute_capabilities: A list of the system's CUDA compute capabilities.\n          cpu_value: The name of the host operating system.\n      \"\"\"\n    config = find_cuda_config(\n        repository_ctx,\n        find_cuda_config_script,\n        [\"cuda\", \"cudnn\"],\n    )\n\n    cpu_value = get_cpu_value(repository_ctx)\n    toolkit_path = config[\"cuda_toolkit_path\"]\n\n    cuda_version = config[\"cuda_version\"].split(\".\")\n    cuda_major = cuda_version[0]\n    cuda_minor = 
cuda_version[1]\n\n    cuda_version = \"%s.%s\" % (cuda_major, cuda_minor)\n    cudnn_version = \"%s\" % config[\"cudnn_version\"]\n\n    if int(cuda_major) >= 11:\n        # The libcudart soname in CUDA 11.x is versioned as 11.0 for backward compatability.\n        if int(cuda_major) == 11:\n            cudart_version = \"11.0\"\n            cupti_version = cuda_version\n        else:\n            cudart_version = (\"%s\") % cuda_major\n            cupti_version = cudart_version\n        cublas_version = (\"%s\") % config[\"cublas_version\"].split(\".\")[0]\n        cusolver_version = (\"%s\") % config[\"cusolver_version\"].split(\".\")[0]\n        curand_version = (\"%s\") % config[\"curand_version\"].split(\".\")[0]\n        cufft_version = (\"%s\") % config[\"cufft_version\"].split(\".\")[0]\n        cusparse_version = (\"%s\") % config[\"cusparse_version\"].split(\".\")[0]\n    elif (int(cuda_major), int(cuda_minor)) >= (10, 1):\n        # cuda_lib_version is for libraries like cuBLAS, cuFFT, cuSOLVER, etc.\n        # It changed from 'x.y' to just 'x' in CUDA 10.1.\n        cuda_lib_version = (\"%s\") % cuda_major\n        cudart_version = cuda_version\n        cupti_version = cuda_version\n        cublas_version = cuda_lib_version\n        cusolver_version = cuda_lib_version\n        curand_version = cuda_lib_version\n        cufft_version = cuda_lib_version\n        cusparse_version = cuda_lib_version\n    else:\n        cudart_version = cuda_version\n        cupti_version = cuda_version\n        cublas_version = cuda_version\n        cusolver_version = cuda_version\n        curand_version = cuda_version\n        cufft_version = cuda_version\n        cusparse_version = cuda_version\n\n    return struct(\n        cuda_toolkit_path = toolkit_path,\n        cuda_version = cuda_version,\n        cupti_version = cupti_version,\n        cuda_version_major = cuda_major,\n        cudart_version = cudart_version,\n        cublas_version = cublas_version,\n        
cusolver_version = cusolver_version,\n        curand_version = curand_version,\n        cufft_version = cufft_version,\n        cusparse_version = cusparse_version,\n        cudnn_version = cudnn_version,\n        compute_capabilities = compute_capabilities(repository_ctx),\n        cpu_value = cpu_value,\n        config = config,\n    )\n\ndef _tpl(repository_ctx, tpl, substitutions = {}, out = None):\n    if not out:\n        out = tpl.replace(\":\", \"/\")\n    repository_ctx.template(\n        out,\n        Label(\"//build_deps/gpus/%s.tpl\" % tpl),\n        substitutions,\n    )\n\ndef _file(repository_ctx, label):\n    repository_ctx.template(\n        label.replace(\":\", \"/\"),\n        Label(\"//build_deps/gpus/%s.tpl\" % label),\n        {},\n    )\n\n_DUMMY_CROSSTOOL_BZL_FILE = \"\"\"\ndef error_gpu_disabled():\n  fail(\"ERROR: Building with --config=cuda but TensorFlow is not configured \" +\n       \"to build with GPU support. Please re-run ./configure and enter 'Y' \" +\n       \"at the prompt to build with GPU support.\")\n\n  native.genrule(\n      name = \"error_gen_crosstool\",\n      outs = [\"CROSSTOOL\"],\n      cmd = \"echo 'Should not be run.' 
&& exit 1\",\n  )\n\n  native.filegroup(\n      name = \"crosstool\",\n      srcs = [\":CROSSTOOL\"],\n      output_licenses = [\"unencumbered\"],\n  )\n\"\"\"\n\n_DUMMY_CROSSTOOL_BUILD_FILE = \"\"\"\nload(\"//crosstool:error_gpu_disabled.bzl\", \"error_gpu_disabled\")\n\nerror_gpu_disabled()\n\"\"\"\n\ndef _norm_path(path):\n    \"\"\"Returns a path with '/' and remove the trailing slash.\"\"\"\n    path = path.replace(\"\\\\\", \"/\")\n    if path[-1] == \"/\":\n        path = path[:-1]\n    return path\n\ndef make_copy_files_rule(repository_ctx, name, srcs, outs):\n    \"\"\"Returns a rule to copy a set of files.\"\"\"\n    cmds = []\n\n    # Copy files.\n    for src, out in zip(srcs, outs):\n        cmds.append('cp -f \"%s\" \"$(location %s)\"' % (src, out))\n    outs = [('        \"%s\",' % out) for out in outs]\n    return \"\"\"genrule(\n    name = \"%s\",\n    outs = [\n%s\n    ],\n    cmd = \\\"\"\"%s \\\"\"\",\n)\"\"\" % (name, \"\\n\".join(outs), \" && \\\\\\n\".join(cmds))\n\ndef make_copy_dir_rule(\n        repository_ctx,\n        name,\n        src_dir,\n        out_dir,\n        exceptions = None):\n    \"\"\"Returns a rule to recursively copy a directory.\n    If exceptions is not None, it must be a list of files or directories in\n    'src_dir'; these will be excluded from copying.\n    \"\"\"\n    src_dir = _norm_path(src_dir)\n    out_dir = _norm_path(out_dir)\n    outs = read_dir(repository_ctx, src_dir)\n    post_cmd = \"\"\n    if exceptions != None:\n        outs = [\n            x\n            for x in outs\n            if not any([x.startswith(src_dir + \"/\" + y) for y in exceptions])\n        ]\n    outs = [('        \"%s\",' % out.replace(src_dir, out_dir)) for out in outs]\n\n    # '@D' already contains the relative path for a single file, see\n    # http://docs.bazel.build/versions/master/be/make-variables.html#predefined_genrule_variables\n    out_dir = \"$(@D)/%s\" % out_dir if len(outs) > 1 else \"$(@D)\"\n    if exceptions != 
None:\n        for x in exceptions:\n            post_cmd += \" ; rm -fR \" + out_dir + \"/\" + x\n    return \"\"\"genrule(\n    name = \"%s\",\n    outs = [\n%s\n    ],\n    cmd = \\\"\"\"cp -rLf \"%s/.\" \"%s/\" %s\\\"\"\",\n)\"\"\" % (name, \"\\n\".join(outs), src_dir, out_dir, post_cmd)\n\ndef _flag_enabled(repository_ctx, flag_name):\n    return get_host_environ(repository_ctx, flag_name) == \"1\"\n\ndef _tf_sysroot(repository_ctx):\n    return get_host_environ(repository_ctx, _SYSROOT, \"\")\n\ndef _compute_cuda_extra_copts(repository_ctx, compute_capabilities):\n    copts = []\n    for capability in compute_capabilities:\n        if capability.startswith(\"compute_\"):\n            capability = capability.replace(\"compute_\", \"sm_\")\n            copts.append(\"--cuda-include-ptx=%s\" % capability)\n        copts.append(\"--cuda-gpu-arch=%s\" % capability)\n\n    return str(copts)\n\ndef _tpl_path(repository_ctx, filename):\n    return repository_ctx.path(Label(\"//build_deps/gpus/%s.tpl\" % filename))\n\ndef _basename(repository_ctx, path_str):\n    \"\"\"Returns the basename of a path of type string.\n    \"\"\"\n\n    num_chars = len(path_str)\n    for i in range(num_chars):\n        r_i = num_chars - 1 - i\n        if path_str[r_i] == \"/\":\n            return path_str[r_i + 1:]\n    return path_str\n\ndef _create_local_cuda_repository(repository_ctx):\n    \"\"\"Creates the repository containing files set up to build with CUDA.\"\"\"\n    tpl_paths = {\n        filename: _tpl_path(repository_ctx, filename)\n        for filename in [\n            \"cuda:build_defs.bzl\",\n            \"crosstool:crosstool_compiler_wrapper\",\n            \"crosstool:BUILD\",\n            \"crosstool:cc_toolchain_config.bzl\",\n            \"cuda:cuda_config.h\",\n            \"cuda:cuda_config.py\",\n        ]\n    }\n    tpl_paths[\"cuda:BUILD\"] = _tpl_path(repository_ctx, \"cuda:BUILD\")\n    find_cuda_config_script = repository_ctx.path(\n        
Label(\"//build_deps/gpus:find_cuda_config.py\"),\n    )\n\n    cuda_config = _get_cuda_config(repository_ctx, find_cuda_config_script)\n\n    cuda_include_path = cuda_config.config[\"cuda_include_dir\"]\n    cublas_include_path = cuda_config.config[\"cublas_include_dir\"]\n    cudnn_header_dir = cuda_config.config[\"cudnn_include_dir\"]\n    cupti_header_dir = cuda_config.config[\"cupti_include_dir\"]\n    nvvm_libdevice_dir = cuda_config.config[\"nvvm_library_dir\"]\n\n    # Create genrule to copy files from the installed CUDA toolkit into execroot.\n    copy_rules = [\n        make_copy_dir_rule(\n            repository_ctx,\n            name = \"cuda-include\",\n            src_dir = cuda_include_path,\n            out_dir = \"cuda/include\",\n        ),\n        make_copy_dir_rule(\n            repository_ctx,\n            name = \"cuda-nvvm\",\n            src_dir = nvvm_libdevice_dir,\n            out_dir = \"cuda/nvvm/libdevice\",\n        ),\n        make_copy_dir_rule(\n            repository_ctx,\n            name = \"cuda-extras\",\n            src_dir = cupti_header_dir,\n            out_dir = \"cuda/extras/CUPTI/include\",\n        ),\n    ]\n\n    copy_rules.append(\n        make_copy_files_rule(\n            repository_ctx,\n            name = \"cublas-include\",\n            srcs = [\n                cublas_include_path + \"/cublas.h\",\n                cublas_include_path + \"/cublas_v2.h\",\n                cublas_include_path + \"/cublas_api.h\",\n                cublas_include_path + \"/cublasLt.h\",\n            ],\n            outs = [\n                \"cublas/include/cublas.h\",\n                \"cublas/include/cublas_v2.h\",\n                \"cublas/include/cublas_api.h\",\n                \"cublas/include/cublasLt.h\",\n            ],\n        ),\n    )\n\n    cusolver_include_path = cuda_config.config[\"cusolver_include_dir\"]\n    copy_rules.append(\n        make_copy_files_rule(\n            repository_ctx,\n            name = 
\"cusolver-include\",\n            srcs = [\n                cusolver_include_path + \"/cusolver_common.h\",\n                cusolver_include_path + \"/cusolverDn.h\",\n            ],\n            outs = [\n                \"cusolver/include/cusolver_common.h\",\n                \"cusolver/include/cusolverDn.h\",\n            ],\n        ),\n    )\n\n    cufft_include_path = cuda_config.config[\"cufft_include_dir\"]\n    copy_rules.append(\n        make_copy_files_rule(\n            repository_ctx,\n            name = \"cufft-include\",\n            srcs = [\n                cufft_include_path + \"/cufft.h\",\n            ],\n            outs = [\n                \"cufft/include/cufft.h\",\n            ],\n        ),\n    )\n\n    cusparse_include_path = cuda_config.config[\"cusparse_include_dir\"]\n    copy_rules.append(\n        make_copy_files_rule(\n            repository_ctx,\n            name = \"cusparse-include\",\n            srcs = [\n                cusparse_include_path + \"/cusparse.h\",\n            ],\n            outs = [\n                \"cusparse/include/cusparse.h\",\n            ],\n        ),\n    )\n\n    curand_include_path = cuda_config.config[\"curand_include_dir\"]\n    copy_rules.append(\n        make_copy_files_rule(\n            repository_ctx,\n            name = \"curand-include\",\n            srcs = [\n                curand_include_path + \"/curand.h\",\n            ],\n            outs = [\n                \"curand/include/curand.h\",\n            ],\n        ),\n    )\n\n    check_cuda_libs_script = repository_ctx.path(\n        Label(\"//build_deps/gpus:check_cuda_libs.py\"),\n    )\n    cuda_libs = _find_libs(repository_ctx, check_cuda_libs_script, cuda_config)\n    cuda_lib_srcs = []\n    cuda_lib_outs = []\n    for path in cuda_libs.values():\n        cuda_lib_srcs.append(path)\n        cuda_lib_outs.append(\"cuda/lib/\" + _basename(repository_ctx, path))\n    copy_rules.append(\n        make_copy_files_rule(\n            
repository_ctx,\n            name = \"cuda-lib\",\n            srcs = cuda_lib_srcs,\n            outs = cuda_lib_outs,\n        ),\n    )\n\n    file_ext = \"\"\n    bin_files = (\n        [\"crt/link.stub\"] +\n        [f + file_ext for f in [\"bin2c\", \"fatbinary\", \"nvlink\", \"nvprune\"]]\n    )\n    copy_rules.append(\n        make_copy_files_rule(\n            repository_ctx,\n            name = \"cuda-bin\",\n            srcs = [\n                cuda_config.cuda_toolkit_path + \"/bin/\" + f\n                for f in bin_files\n            ],\n            outs = [\"cuda/bin/\" + f for f in bin_files],\n        ),\n    )\n\n    # Select the headers based on the cuDNN version (strip '64_' for Windows).\n    cudnn_headers = [\"cudnn.h\"]\n    if cuda_config.cudnn_version.rsplit(\"_\", 1)[-1] >= \"8\":\n        cudnn_headers += [\n            \"cudnn_backend.h\",\n            \"cudnn_adv_infer.h\",\n            \"cudnn_adv_train.h\",\n            \"cudnn_cnn_infer.h\",\n            \"cudnn_cnn_train.h\",\n            \"cudnn_ops_infer.h\",\n            \"cudnn_ops_train.h\",\n            \"cudnn_version.h\",\n        ]\n\n    cudnn_srcs = []\n    cudnn_outs = []\n    for header in cudnn_headers:\n        cudnn_srcs.append(cudnn_header_dir + \"/\" + header)\n        cudnn_outs.append(\"cudnn/include/\" + header)\n\n    copy_rules.append(\n        make_copy_files_rule(\n            repository_ctx,\n            name = \"cudnn-include\",\n            srcs = cudnn_srcs,\n            outs = cudnn_outs,\n        ),\n    )\n\n    # Set up BUILD file for cuda/\n    repository_ctx.template(\n        \"cuda/build_defs.bzl\",\n        tpl_paths[\"cuda:build_defs.bzl\"],\n        {\n            \"%{cuda_is_configured}\": \"True\",\n            \"%{cuda_extra_copts}\": _compute_cuda_extra_copts(\n                repository_ctx,\n                cuda_config.compute_capabilities,\n            ),\n            \"%{cuda_gpu_architectures}\": 
str(cuda_config.compute_capabilities),\n        },\n    )\n\n    cub_actual = \"@cub_archive//:cub\"\n    if int(cuda_config.cuda_version_major) >= 11:\n        cub_actual = \":cuda_headers\"\n\n    repository_ctx.template(\n        \"cuda/BUILD\",\n        tpl_paths[\"cuda:BUILD\"],\n        {\n            \"%{cuda_driver_lib}\": _basename(repository_ctx, cuda_libs[\"cuda\"]),\n            \"%{cudart_static_lib}\": _basename(repository_ctx, cuda_libs[\"cudart_static\"]),\n            \"%{cudart_static_linkopt}\": _cudart_static_linkopt(cuda_config.cpu_value),\n            \"%{cudart_lib}\": _basename(repository_ctx, cuda_libs[\"cudart\"]),\n            \"%{cublas_lib}\": _basename(repository_ctx, cuda_libs[\"cublas\"]),\n            \"%{cublasLt_lib}\": _basename(repository_ctx, cuda_libs[\"cublasLt\"]),\n            \"%{cusolver_lib}\": _basename(repository_ctx, cuda_libs[\"cusolver\"]),\n            \"%{cudnn_lib}\": _basename(repository_ctx, cuda_libs[\"cudnn\"]),\n            \"%{cufft_lib}\": _basename(repository_ctx, cuda_libs[\"cufft\"]),\n            \"%{curand_lib}\": _basename(repository_ctx, cuda_libs[\"curand\"]),\n            \"%{cupti_lib}\": _basename(repository_ctx, cuda_libs[\"cupti\"]),\n            \"%{cusparse_lib}\": _basename(repository_ctx, cuda_libs[\"cusparse\"]),\n            \"%{cub_actual}\": cub_actual,\n            \"%{copy_rules}\": \"\\n\".join(copy_rules),\n        },\n    )\n\n    tf_sysroot = _tf_sysroot(repository_ctx)\n\n    # Set up crosstool/\n    cc = find_cc(repository_ctx)\n    cc_fullpath = cc\n\n    host_compiler_includes = get_cxx_inc_directories(\n        repository_ctx,\n        cc_fullpath,\n        tf_sysroot,\n    )\n    cuda_defines = {}\n    cuda_defines[\"%{builtin_sysroot}\"] = tf_sysroot\n    cuda_defines[\"%{cuda_toolkit_path}\"] = \"\"\n    cuda_defines[\"%{compiler}\"] = \"unknown\"\n\n    host_compiler_prefix = get_host_environ(\n        repository_ctx,\n        _GCC_HOST_COMPILER_PREFIX,\n    )\n    if 
not host_compiler_prefix:\n        host_compiler_prefix = \"/usr/bin\"\n\n    cuda_defines[\"%{host_compiler_prefix}\"] = host_compiler_prefix\n    cuda_defines[\"%{linker_bin_path}\"] = host_compiler_prefix\n    cuda_defines[\"%{extra_no_canonical_prefixes_flags}\"] = \"\"\n    cuda_defines[\"%{unfiltered_compile_flags}\"] = \"\"\n\n    cuda_defines[\"%{host_compiler_path}\"] = \"crosstool_compiler_wrapper\"\n    cuda_defines[\"%{host_compiler_warnings}\"] = \"\"\n\n    # nvcc has the system include paths built in and will automatically\n    # search them; we cannot work around that, so we add the relevant cuda\n    # system paths to the allowed compiler specific include paths.\n    cuda_defines[\"%{cxx_builtin_include_directories}\"] = to_list_of_strings(\n        host_compiler_includes + _cuda_include_path(\n            repository_ctx,\n            cuda_config,\n        ) + [cupti_header_dir, cudnn_header_dir],\n    )\n\n    # For gcc, do not canonicalize system header paths; some versions of gcc\n    # pick the shortest possible path for system includes when creating the\n    # .d file - given that includes that are prefixed with \"../\" multiple\n    # time quickly grow longer than the root of the tree, this can lead to\n    # bazel's header check failing.\n    cuda_defines[\"%{extra_no_canonical_prefixes_flags}\"] = \"\\\"-fno-canonical-system-headers\\\"\"\n\n    file_ext = \"\"\n    nvcc_path = \"%s/nvcc%s\" % (cuda_config.config[\"cuda_binary_dir\"], file_ext)\n    cuda_defines[\"%{compiler_deps}\"] = \":crosstool_compiler\"\n\n    wrapper_defines = {\n        \"%{cpu_compiler}\": str(cc),\n        \"%{cuda_version}\": cuda_config.cuda_version,\n        \"%{nvcc_path}\": nvcc_path,\n        \"%{gcc_host_compiler_path}\": str(cc),\n    }\n    repository_ctx.template(\n        \"crosstool/crosstool_compiler_wrapper\",\n        tpl_paths[\"crosstool:crosstool_compiler_wrapper\"],\n        wrapper_defines,\n    )\n\n    verify_build_defines(cuda_defines)\n\n   
 # Only expand template variables in the BUILD file\n    repository_ctx.template(\n        \"crosstool/BUILD\",\n        tpl_paths[\"crosstool:BUILD\"],\n        cuda_defines,\n    )\n\n    # No templating of cc_toolchain_config - use attributes and templatize the\n    # BUILD file.\n    repository_ctx.template(\n        \"crosstool/cc_toolchain_config.bzl\",\n        tpl_paths[\"crosstool:cc_toolchain_config.bzl\"],\n        {},\n    )\n\n    # Set up cuda_config.h\n    repository_ctx.template(\n        \"cuda/cuda/cuda_config.h\",\n        tpl_paths[\"cuda:cuda_config.h\"],\n        {\n            \"%{cuda_version}\": cuda_config.cuda_version,\n            \"%{cudart_version}\": cuda_config.cudart_version,\n            \"%{cupti_version}\": cuda_config.cupti_version,\n            \"%{cublas_version}\": cuda_config.cublas_version,\n            \"%{cusolver_version}\": cuda_config.cusolver_version,\n            \"%{curand_version}\": cuda_config.curand_version,\n            \"%{cufft_version}\": cuda_config.cufft_version,\n            \"%{cusparse_version}\": cuda_config.cusparse_version,\n            \"%{cudnn_version}\": cuda_config.cudnn_version,\n            \"%{cuda_toolkit_path}\": cuda_config.cuda_toolkit_path,\n            \"%{cuda_compute_capabilities}\": \", \".join(\n                [cc.split(\"_\")[1] for cc in cuda_config.compute_capabilities],\n            ),\n        },\n    )\n\n    # Set up cuda_config.py, which is used by gen_build_info to provide\n    # static build environment info to the API\n    repository_ctx.template(\n        \"cuda/cuda/cuda_config.py\",\n        tpl_paths[\"cuda:cuda_config.py\"],\n        _py_tmpl_dict({\n            \"cuda_version\": cuda_config.cuda_version,\n            \"cudnn_version\": cuda_config.cudnn_version,\n            \"cuda_compute_capabilities\": cuda_config.compute_capabilities,\n            \"cpu_compiler\": str(cc),\n        }),\n    )\n\ndef _get_tensorrt_static_path(repository_ctx):\n    return 
get_host_environ(repository_ctx, _TENSORRT_STATIC_PATH, None)\n\ndef _create_local_tensorrt_repository(repository_ctx):\n    find_cuda_config_path = repository_ctx.path(\n        Label(\"//build_deps/gpus:find_cuda_config.py\"),\n    )\n    config = find_cuda_config(\n        repository_ctx,\n        find_cuda_config_path,\n        [\"tensorrt\"],\n    )\n    tensorrt_version = config[\"tensorrt_version\"]\n    cpu_value = get_cpu_value(repository_ctx)\n\n    # Copy the library and header files\n    libraries = [\n        lib_name(lib, cpu_value, tensorrt_version)\n        for lib in _TENSORRT_LIBS\n    ]\n    library_dir = config[\"tensorrt_library_dir\"] + \"/\"\n    headers = _get_tensorrt_headers(tensorrt_version)\n    include_dir = config[\"tensorrt_include_dir\"] + \"/\"\n    copy_rules = [\n        make_copy_files_rule(\n            repository_ctx,\n            name = \"tensorrt_lib\",\n            srcs = [library_dir + library for library in libraries],\n            outs = [\"tensorrt/lib/\" + library for library in libraries],\n        ),\n        make_copy_files_rule(\n            repository_ctx,\n            name = \"tensorrt_include\",\n            srcs = [include_dir + header for header in headers],\n            outs = [\"tensorrt/include/\" + header for header in headers],\n        ),\n    ]\n\n    tensorrt_static_path = _get_tensorrt_static_path(repository_ctx)\n    if tensorrt_static_path:\n        tensorrt_static_path = tensorrt_static_path + \"/\"\n        if _at_least_version(tensorrt_version, \"8\"):\n            raw_static_library_names = _TENSORRT_LIBS\n        else:\n            raw_static_library_names = _TENSORRT_LIBS + [\n                \"nvrtc\",\n                \"myelin_compiler\",\n                \"myelin_executor\",\n                \"myelin_pattern_library\",\n                \"myelin_pattern_runtime\",\n            ]\n\n        static_library_names = [\n            \"%s_static\" % name\n            for name in 
raw_static_library_names\n        ]\n        static_libraries = [\n            lib_name(lib, cpu_value, tensorrt_version, static = True)\n            for lib in static_library_names\n        ]\n        copy_rules = copy_rules + [\n            make_copy_files_rule(\n                repository_ctx,\n                name = \"tensorrt_static_lib\",\n                srcs = [\n                    tensorrt_static_path + library\n                    for library in static_libraries\n                ],\n                outs = [\n                    \"tensorrt/lib/\" + library\n                    for library in static_libraries\n                ],\n            ),\n        ]\n\n    tpl_paths = {\n        \"tensorrt/build_defs.bzl\": _tpl_path(repository_ctx, \"tensorrt:build_defs.bzl\"),\n        \"tensorrt/BUILD\": _tpl_path(repository_ctx, \"tensorrt:BUILD\"),\n        \"tensorrt/tensorrt_config.h\": _tpl_path(repository_ctx, \"tensorrt:tensorrt_config.h\"),\n        \"tensorrt/tensorrt_config.py\": _tpl_path(repository_ctx, \"tensorrt:tensorrt_config.py\"),\n    }\n\n    # Set up config file.\n    repository_ctx.template(\n        \"tensorrt/build_defs.bzl\",\n        tpl_paths[\"tensorrt/build_defs.bzl\"],\n        {\"%{if_tensorrt}\": \"if_true\"},\n    )\n\n    # Set up BUILD file.\n    repository_ctx.template(\n        \"tensorrt/BUILD\",\n        tpl_paths[\"tensorrt/BUILD\"],\n        {\n            \"%{copy_rules}\": \"\\n\".join(copy_rules),\n        },\n    )\n\n    # Set up tensorrt_config.h, which is used by\n    # tensorflow/stream_executor/dso_loader.cc.\n    repository_ctx.template(\n        \"tensorrt/tensorrt_config.h\",\n        tpl_paths[\"tensorrt/tensorrt_config.h\"],\n        {\"%{tensorrt_version}\": tensorrt_version},\n    )\n\n    # Set up tensorrt_config.py, which is used by gen_build_info to provide\n    # build environment info to the API\n    repository_ctx.template(\n        \"tensorrt/tensorrt_config.py\",\n        
tpl_paths[\"tensorrt/tensorrt_config.py\"],\n        _py_tmpl_dict({\n            \"tensorrt_version\": tensorrt_version,\n        }),\n    )\n\ndef _py_tmpl_dict(d):\n    return {\"%{cuda_config}\": str(d)}\n\n_CUDA_ENVIRONS = [\n    _GCC_HOST_COMPILER_PATH,\n    _GCC_HOST_COMPILER_PREFIX,\n    \"NEED_CUDA\",\n    _CUDA_TOOLKIT_PATH,\n    _CUDNN_INSTALL_PATH,\n    _CUDA_VERSION,\n    _CUDNN_VERSION,\n    _CUDA_COMPUTE_CAPABILITIES,\n    \"NVVMIR_LIBRARY_DIR\",\n    _PYTHON_BIN_PATH,\n    \"TMP\",\n    \"TMPDIR\",\n    \"CUDA_PATHS\",\n]\n\ncuda_configure = repository_rule(\n    implementation = _create_local_cuda_repository,\n    environ = _CUDA_ENVIRONS,\n)\n\n_TENSORRT_ENVIRONS = [\n    _TENSORRT_INSTALL_PATH,\n    _TENSORRT_VERSION,\n    _TENSORRT_STATIC_PATH,\n    \"CUDA_PATHS\",\n]\n\ntensorrt_configure = repository_rule(\n    implementation = _create_local_tensorrt_repository,\n    environ = _TENSORRT_ENVIRONS,\n)\n"
  },
  {
    "path": "build_deps/gpus/crosstool/BUILD",
    "content": ""
  },
  {
    "path": "build_deps/gpus/crosstool/BUILD.tpl",
    "content": "# This file is expanded from a template by cuda_configure.bzl\n# Update cuda_configure.bzl#verify_build_defines when adding new variables.\n\nload(\":cc_toolchain_config.bzl\", \"cc_toolchain_config\")\n\nlicenses([\"restricted\"])\n\npackage(default_visibility = [\"//visibility:public\"])\n\ntoolchain(\n    name = \"toolchain-linux-x86_64\",\n    exec_compatible_with = [\n        \"@platforms//os:linux\",\n        \"@platforms//cpu:x86_64\",\n    ],\n    target_compatible_with = [\n        \"@platforms//os:linux\",\n        \"@platforms//cpu:x86_64\",\n    ],\n    toolchain = \":cc-compiler-local\",\n    toolchain_type = \"@bazel_tools//tools/cpp:toolchain_type\",\n)\n\ncc_toolchain_suite(\n    name = \"toolchain\",\n    toolchains = {\n        \"local|compiler\": \":cc-compiler-local\",\n        \"darwin|compiler\": \":cc-compiler-darwin\",\n        \"arm\": \":cc-compiler-local\",\n        \"aarch64\": \":cc-compiler-local\",\n        \"k8\": \":cc-compiler-local\",\n        \"piii\": \":cc-compiler-local\",\n        \"ppc\": \":cc-compiler-local\",\n        \"darwin\": \":cc-compiler-darwin\",\n    },\n)\n\ncc_toolchain(\n    name = \"cc-compiler-local\",\n    all_files = \"%{compiler_deps}\",\n    compiler_files = \"%{compiler_deps}\",\n    ar_files = \"%{compiler_deps}\",\n    as_files = \"%{compiler_deps}\",\n    dwp_files = \":empty\",\n    linker_files = \"%{compiler_deps}\",\n    objcopy_files = \":empty\",\n    strip_files = \":empty\",\n    # To support linker flags that need to go to the start of command line\n    # we need the toolchain to support parameter files. 
Parameter files are\n    # last on the command line and contain all shared libraries to link, so all\n    # regular options will be left of them.\n    supports_param_files = 1,\n    toolchain_identifier = \"local_linux\",\n    toolchain_config = \":cc-compiler-local-config\",\n)\n\ncc_toolchain_config(\n    name = \"cc-compiler-local-config\",\n    cpu = \"local\",\n    builtin_include_directories = [%{cxx_builtin_include_directories}],\n    extra_no_canonical_prefixes_flags = [%{extra_no_canonical_prefixes_flags}],\n    host_compiler_path = \"%{host_compiler_path}\",\n    host_compiler_prefix = \"%{host_compiler_prefix}\",\n    host_compiler_warnings = [%{host_compiler_warnings}],\n    host_unfiltered_compile_flags = [%{unfiltered_compile_flags}],\n    linker_bin_path = \"%{linker_bin_path}\",\n    builtin_sysroot = \"%{builtin_sysroot}\",\n    cuda_path = \"%{cuda_toolkit_path}\",\n    compiler = \"%{compiler}\",\n)\n\ncc_toolchain(\n    name = \"cc-compiler-darwin\",\n    all_files = \"%{compiler_deps}\",\n    compiler_files = \"%{compiler_deps}\",\n    ar_files = \"%{compiler_deps}\",\n    as_files = \"%{compiler_deps}\",\n    dwp_files = \":empty\",\n    linker_files = \"%{compiler_deps}\",\n    objcopy_files = \":empty\",\n    strip_files = \":empty\",\n    supports_param_files = 0,\n    toolchain_identifier = \"local_darwin\",\n    toolchain_config = \":cc-compiler-local-darwin\",\n)\n\ncc_toolchain_config(\n    name = \"cc-compiler-local-darwin\",\n    cpu = \"darwin\",\n    builtin_include_directories = [%{cxx_builtin_include_directories}],\n    extra_no_canonical_prefixes_flags = [%{extra_no_canonical_prefixes_flags}],\n    host_compiler_path = \"%{host_compiler_path}\",\n    host_compiler_prefix = \"%{host_compiler_prefix}\",\n    host_compiler_warnings = [%{host_compiler_warnings}],\n    host_unfiltered_compile_flags = [%{unfiltered_compile_flags}],\n    linker_bin_path = \"%{linker_bin_path}\",\n)\n\n\nfilegroup(\n    name = \"empty\",\n    srcs = 
[],\n)\n\nfilegroup(\n    name = \"crosstool_compiler\",\n    srcs = [\"crosstool_compiler_wrapper\"],\n)\n"
  },
  {
    "path": "build_deps/gpus/crosstool/cc_toolchain_config.bzl.tpl",
    "content": "\"\"\"cc_toolchain_config rule for configuring CUDA toolchains on Linux, Mac, and Windows.\"\"\"\n\nload(\n    \"@bazel_tools//tools/cpp:cc_toolchain_config_lib.bzl\",\n    \"action_config\",\n    \"artifact_name_pattern\",\n    \"env_entry\",\n    \"env_set\",\n    \"feature\",\n    \"feature_set\",\n    \"flag_group\",\n    \"flag_set\",\n    \"tool\",\n    \"tool_path\",\n    \"variable_with_value\",\n    \"with_feature_set\",\n)\nload(\"@bazel_tools//tools/build_defs/cc:action_names.bzl\", \"ACTION_NAMES\")\n\ndef all_assembly_actions():\n    return [\n        ACTION_NAMES.assemble,\n        ACTION_NAMES.preprocess_assemble,\n    ]\n\ndef all_compile_actions():\n    return [\n        ACTION_NAMES.assemble,\n        ACTION_NAMES.c_compile,\n        ACTION_NAMES.cpp_compile,\n        ACTION_NAMES.cpp_header_parsing,\n        ACTION_NAMES.cpp_module_codegen,\n        ACTION_NAMES.cpp_module_compile,\n        ACTION_NAMES.linkstamp_compile,\n        ACTION_NAMES.preprocess_assemble,\n    ]\n\ndef all_c_compile_actions():\n    return [\n        ACTION_NAMES.c_compile,\n    ]\n\ndef all_cpp_compile_actions():\n    return [\n        ACTION_NAMES.cpp_compile,\n        ACTION_NAMES.cpp_header_parsing,\n        ACTION_NAMES.cpp_module_codegen,\n        ACTION_NAMES.cpp_module_compile,\n        ACTION_NAMES.linkstamp_compile,\n    ]\n\ndef all_preprocessed_actions():\n    return [\n        ACTION_NAMES.c_compile,\n        ACTION_NAMES.cpp_compile,\n        ACTION_NAMES.cpp_header_parsing,\n        ACTION_NAMES.cpp_module_codegen,\n        ACTION_NAMES.cpp_module_compile,\n        ACTION_NAMES.linkstamp_compile,\n        ACTION_NAMES.preprocess_assemble,\n    ]\n\ndef all_link_actions():\n    return [\n        ACTION_NAMES.cpp_link_executable,\n        ACTION_NAMES.cpp_link_dynamic_library,\n        ACTION_NAMES.cpp_link_nodeps_dynamic_library,\n    ]\n\ndef all_executable_link_actions():\n    return [\n        ACTION_NAMES.cpp_link_executable,\n    
]\n\ndef all_shared_library_link_actions():\n    return [\n        ACTION_NAMES.cpp_link_dynamic_library,\n        ACTION_NAMES.cpp_link_nodeps_dynamic_library,\n    ]\n\ndef all_archive_actions():\n    return [ACTION_NAMES.cpp_link_static_library]\n\ndef all_strip_actions():\n    return [ACTION_NAMES.strip]\n\ndef _library_to_link(flag_prefix, value, iterate = None):\n    return flag_group(\n        flags = [\n            \"{}%{{libraries_to_link.{}}}\".format(\n                flag_prefix,\n                iterate if iterate else \"name\",\n            ),\n        ],\n        iterate_over = (\"libraries_to_link.\" + iterate if iterate else None),\n        expand_if_equal = variable_with_value(\n            name = \"libraries_to_link.type\",\n            value = value,\n        ),\n    )\n\ndef _surround_static_library(prefix, suffix):\n    return [\n        flag_group(\n            flags = [prefix, \"%{libraries_to_link.name}\", suffix],\n            expand_if_true = \"libraries_to_link.is_whole_archive\",\n        ),\n        flag_group(\n            flags = [\"%{libraries_to_link.name}\"],\n            expand_if_false = \"libraries_to_link.is_whole_archive\",\n        ),\n    ]\n\ndef _prefix_static_library(prefix):\n    return [\n        flag_group(\n            flags = [\"%{libraries_to_link.name}\"],\n            expand_if_false = \"libraries_to_link.is_whole_archive\",\n        ),\n        flag_group(\n            flags = [prefix + \"%{libraries_to_link.name}\"],\n            expand_if_true = \"libraries_to_link.is_whole_archive\",\n        ),\n    ]\n\ndef _static_library_to_link(alwayslink_prefix, alwayslink_suffix = None):\n    if alwayslink_suffix:\n        flag_groups = _surround_static_library(alwayslink_prefix, alwayslink_suffix)\n    else:\n        flag_groups = _prefix_static_library(alwayslink_prefix)\n    return flag_group(\n        flag_groups = flag_groups,\n        expand_if_equal = variable_with_value(\n            name = 
\"libraries_to_link.type\",\n            value = \"static_library\",\n        ),\n    )\n\ndef _iterate_flag_group(iterate_over, flags = [], flag_groups = []):\n    return flag_group(\n        iterate_over = iterate_over,\n        expand_if_available = iterate_over,\n        flag_groups = flag_groups,\n        flags = flags,\n    )\n\ndef _libraries_to_link_group(flavour):\n    if flavour == \"linux\":\n        return _iterate_flag_group(\n            iterate_over = \"libraries_to_link\",\n            flag_groups = [\n                flag_group(\n                    flags = [\"-Wl,--start-lib\"],\n                    expand_if_equal = variable_with_value(\n                        name = \"libraries_to_link.type\",\n                        value = \"object_file_group\",\n                    ),\n                ),\n                _library_to_link(\"\", \"object_file_group\", \"object_files\"),\n                flag_group(\n                    flags = [\"-Wl,--end-lib\"],\n                    expand_if_equal = variable_with_value(\n                        name = \"libraries_to_link.type\",\n                        value = \"object_file_group\",\n                    ),\n                ),\n                _library_to_link(\"\", \"object_file\"),\n                _library_to_link(\"\", \"interface_library\"),\n                _static_library_to_link(\"-Wl,-whole-archive\", \"-Wl,-no-whole-archive\"),\n                _library_to_link(\"-l\", \"dynamic_library\"),\n                _library_to_link(\"-l:\", \"versioned_dynamic_library\"),\n            ],\n        )\n    elif flavour == \"darwin\":\n        return _iterate_flag_group(\n            iterate_over = \"libraries_to_link\",\n            flag_groups = [\n                _library_to_link(\"\", \"object_file_group\", \"object_files\"),\n                _library_to_link(\"\", \"object_file\"),\n                _library_to_link(\"\", \"interface_library\"),\n                
_static_library_to_link(\"-Wl,-force_load,\"),\n                _library_to_link(\"-l\", \"dynamic_library\"),\n                _library_to_link(\"-l:\", \"versioned_dynamic_library\"),\n            ],\n        )\n\ndef _action_configs_with_tool(path, actions):\n    return [\n        action_config(\n            action_name = name,\n            enabled = True,\n            tools = [tool(path = path)],\n        )\n        for name in actions\n    ]\n\ndef _action_configs(assembly_path, c_compiler_path, cc_compiler_path, archiver_path, linker_path, strip_path):\n    return _action_configs_with_tool(\n        assembly_path,\n        all_assembly_actions(),\n    ) + _action_configs_with_tool(\n        c_compiler_path,\n        all_c_compile_actions(),\n    ) + _action_configs_with_tool(\n        cc_compiler_path,\n        all_cpp_compile_actions(),\n    ) + _action_configs_with_tool(\n        archiver_path,\n        all_archive_actions(),\n    ) + _action_configs_with_tool(\n        linker_path,\n        all_link_actions(),\n    ) + _action_configs_with_tool(\n        strip_path,\n        all_strip_actions(),\n    )\n\ndef _tool_paths(cpu, ctx):\n    if cpu in [\"local\", \"darwin\"]:\n        return [\n            tool_path(name = \"gcc\", path = ctx.attr.host_compiler_path),\n            tool_path(name = \"ar\", path = ctx.attr.host_compiler_prefix + (\n                \"/ar\" if cpu == \"local\" else \"/libtool\"\n            )),\n            tool_path(name = \"compat-ld\", path = ctx.attr.host_compiler_prefix + \"/ld\"),\n            tool_path(name = \"cpp\", path = ctx.attr.host_compiler_prefix + \"/cpp\"),\n            tool_path(name = \"dwp\", path = ctx.attr.host_compiler_prefix + \"/dwp\"),\n            tool_path(name = \"gcov\", path = ctx.attr.host_compiler_prefix + \"/gcov\"),\n            tool_path(name = \"ld\", path = ctx.attr.host_compiler_prefix + \"/ld\"),\n            tool_path(name = \"nm\", path = ctx.attr.host_compiler_prefix + \"/nm\"),\n          
  tool_path(name = \"objcopy\", path = ctx.attr.host_compiler_prefix + \"/objcopy\"),\n            tool_path(name = \"objdump\", path = ctx.attr.host_compiler_prefix + \"/objdump\"),\n            tool_path(name = \"strip\", path = ctx.attr.host_compiler_prefix + \"/strip\"),\n        ]\n    else:\n        fail(\"Unreachable\")\n\ndef _sysroot_group():\n    return flag_group(\n        flags = [\"--sysroot=%{sysroot}\"],\n        expand_if_available = \"sysroot\",\n    )\n\ndef _no_canonical_prefixes_group(extra_flags):\n    return flag_group(\n        flags = [\n            \"-no-canonical-prefixes\",\n        ] + extra_flags,\n    )\n\ndef _cuda_set(cuda_path, actions):\n    if cuda_path:\n        return [flag_set(\n            actions = actions,\n            flag_groups = [\n                flag_group(\n                    flags = [\"--cuda-path=\" + cuda_path],\n                ),\n            ],\n        )]\n    else:\n        return []\n\ndef _nologo():\n    return flag_group(flags = [\"/nologo\"])\n\ndef _features(cpu, compiler, ctx):\n    if cpu in [\"local\", \"darwin\"]:\n        return [\n            feature(name = \"no_legacy_features\"),\n            feature(\n                name = \"all_compile_flags\",\n                enabled = True,\n                flag_sets = [\n                    flag_set(\n                        actions = all_compile_actions(),\n                        flag_groups = [\n                            flag_group(\n                                flags = [\"-MD\", \"-MF\", \"%{dependency_file}\"],\n                                expand_if_available = \"dependency_file\",\n                            ),\n                            flag_group(\n                                flags = [\"-gsplit-dwarf\"],\n                                expand_if_available = \"per_object_debug_info_file\",\n                            ),\n                        ],\n                    ),\n                    flag_set(\n                        
actions = all_preprocessed_actions(),\n                        flag_groups = [\n                            flag_group(\n                                flags = [\"-frandom-seed=%{output_file}\"],\n                                expand_if_available = \"output_file\",\n                            ),\n                            _iterate_flag_group(\n                                flags = [\"-D%{preprocessor_defines}\"],\n                                iterate_over = \"preprocessor_defines\",\n                            ),\n                            _iterate_flag_group(\n                                flags = [\"-include\", \"%{includes}\"],\n                                iterate_over = \"includes\",\n                            ),\n                            _iterate_flag_group(\n                                flags = [\"-iquote\", \"%{quote_include_paths}\"],\n                                iterate_over = \"quote_include_paths\",\n                            ),\n                            _iterate_flag_group(\n                                flags = [\"-I%{include_paths}\"],\n                                iterate_over = \"include_paths\",\n                            ),\n                            _iterate_flag_group(\n                                flags = [\"-isystem\", \"%{system_include_paths}\"],\n                                iterate_over = \"system_include_paths\",\n                            ),\n                            _iterate_flag_group(\n                                flags = [\"-F\", \"%{framework_include_paths}\"],\n                                iterate_over = \"framework_include_paths\",\n                            ),\n                        ],\n                    ),\n                    flag_set(\n                        actions = all_cpp_compile_actions(),\n                        flag_groups = [],\n                    ),\n                    flag_set(\n                        actions = all_compile_actions(),\n          
              flag_groups = [\n                            flag_group(\n                                flags = [\n                                    \"-Wno-builtin-macro-redefined\",\n                                    \"-D__DATE__=\\\"redacted\\\"\",\n                                    \"-D__TIMESTAMP__=\\\"redacted\\\"\",\n                                    \"-D__TIME__=\\\"redacted\\\"\",\n                                ],\n                            ),\n                            flag_group(\n                                flags = [\"-fPIC\"],\n                                expand_if_available = \"pic\",\n                            ),\n                            flag_group(\n                                flags = [\"-fPIE\"],\n                                expand_if_not_available = \"pic\",\n                            ),\n                            flag_group(\n                                flags = [\n                                    \"-U_FORTIFY_SOURCE\",\n                                    \"-D_FORTIFY_SOURCE=1\",\n                                    \"-fstack-protector\",\n                                    \"-Wall\",\n                                ] + ctx.attr.host_compiler_warnings + [\n                                    \"-fno-omit-frame-pointer\",\n                                ],\n                            ),\n                            _no_canonical_prefixes_group(\n                                ctx.attr.extra_no_canonical_prefixes_flags,\n                            ),\n                        ],\n                    ),\n                    flag_set(\n                        actions = all_compile_actions(),\n                        flag_groups = [flag_group(flags = [\"-DNDEBUG\"])],\n                        with_features = [with_feature_set(features = [\"disable-assertions\"])],\n                    ),\n                    flag_set(\n                        actions = all_compile_actions(),\n                        
flag_groups = [\n                            flag_group(\n                                flags = [\n                                    \"-g0\",\n                                    \"-O2\",\n                                    \"-ffunction-sections\",\n                                    \"-fdata-sections\",\n                                ],\n                            ),\n                        ],\n                        with_features = [with_feature_set(features = [\"opt\"])],\n                    ),\n                    flag_set(\n                        actions = all_compile_actions(),\n                        flag_groups = [flag_group(flags = [\"-g\"])],\n                        with_features = [with_feature_set(features = [\"dbg\"])],\n                    ),\n                ] + _cuda_set(\n                    ctx.attr.cuda_path,\n                    all_compile_actions(),\n                ) + [\n                    flag_set(\n                        actions = all_compile_actions(),\n                        flag_groups = [\n                            _iterate_flag_group(\n                                flags = [\"%{user_compile_flags}\"],\n                                iterate_over = \"user_compile_flags\",\n                            ),\n                            _sysroot_group(),\n                            flag_group(\n                                expand_if_available = \"source_file\",\n                                flags = [\"-c\", \"%{source_file}\"],\n                            ),\n                            flag_group(\n                                expand_if_available = \"output_assembly_file\",\n                                flags = [\"-S\"],\n                            ),\n                            flag_group(\n                                expand_if_available = \"output_preprocess_file\",\n                                flags = [\"-E\"],\n                            ),\n                            flag_group(\n       
                         expand_if_available = \"output_file\",\n                                flags = [\"-o\", \"%{output_file}\"],\n                            ),\n                        ],\n                    ),\n                ],\n            ),\n            feature(\n                name = \"all_archive_flags\",\n                enabled = True,\n                flag_sets = [\n                    flag_set(\n                        actions = all_archive_actions(),\n                        flag_groups = [\n                            flag_group(\n                                expand_if_available = \"linker_param_file\",\n                                flags = [\"@%{linker_param_file}\"],\n                            ),\n                            flag_group(flags = [\"rcsD\"]),\n                            flag_group(\n                                flags = [\"%{output_execpath}\"],\n                                expand_if_available = \"output_execpath\",\n                            ),\n                            flag_group(\n                                iterate_over = \"libraries_to_link\",\n                                flag_groups = [\n                                    flag_group(\n                                        flags = [\"%{libraries_to_link.name}\"],\n                                        expand_if_equal = variable_with_value(\n                                            name = \"libraries_to_link.type\",\n                                            value = \"object_file\",\n                                        ),\n                                    ),\n                                    flag_group(\n                                        flags = [\"%{libraries_to_link.object_files}\"],\n                                        iterate_over = \"libraries_to_link.object_files\",\n                                        expand_if_equal = variable_with_value(\n                                            name = 
\"libraries_to_link.type\",\n                                            value = \"object_file_group\",\n                                        ),\n                                    ),\n                                ],\n                                expand_if_available = \"libraries_to_link\",\n                            ),\n                        ],\n                    ),\n                ],\n            ),\n            feature(\n                name = \"all_link_flags\",\n                enabled = True,\n                flag_sets = [\n                    flag_set(\n                        actions = all_shared_library_link_actions(),\n                        flag_groups = [flag_group(flags = [\"-shared\"])],\n                    ),\n                    flag_set(\n                        actions = all_link_actions(),\n                        flag_groups = ([\n                            flag_group(flags = [\"-Wl,-no-as-needed\"])\n                        ] if cpu == \"local\" else []) + ([\n                            flag_group(flags = [\"-B\" + ctx.attr.linker_bin_path])\n                        ] if ctx.attr.linker_bin_path else []) + [\n                            flag_group(\n                                flags = [\"@%{linker_param_file}\"],\n                                expand_if_available = \"linker_param_file\",\n                            ),\n                            _iterate_flag_group(\n                                flags = [\"%{linkstamp_paths}\"],\n                                iterate_over = \"linkstamp_paths\",\n                            ),\n                            flag_group(\n                                flags = [\"-o\", \"%{output_execpath}\"],\n                                expand_if_available = \"output_execpath\",\n                            ),\n                            _iterate_flag_group(\n                                flags = [\"-L%{library_search_directories}\"],\n                                
iterate_over = \"library_search_directories\",\n                            ),\n                            _iterate_flag_group(\n                                iterate_over = \"runtime_library_search_directories\",\n                                flags = [\n                                    \"-Wl,-rpath,$ORIGIN/%{runtime_library_search_directories}\",\n                                ] if cpu == \"local\" else [\n                                    \"-Wl,-rpath,@loader_path/%{runtime_library_search_directories}\",\n                                ],\n                            ),\n                            _libraries_to_link_group(\"darwin\" if cpu == \"darwin\" else \"linux\"),\n                            _iterate_flag_group(\n                                flags = [\"%{user_link_flags}\"],\n                                iterate_over = \"user_link_flags\",\n                            ),\n                            flag_group(\n                                flags = [\"-Wl,--gdb-index\"],\n                                expand_if_available = \"is_using_fission\",\n                            ),\n                            flag_group(\n                                flags = [\"-Wl,-S\"],\n                                expand_if_available = \"strip_debug_symbols\",\n                            ),\n                            flag_group(flags = [\"-lc++\" if cpu == \"darwin\" else \"-lstdc++\"]),\n                            _no_canonical_prefixes_group(\n                                ctx.attr.extra_no_canonical_prefixes_flags,\n                            ),\n                        ],\n                    ),\n                    flag_set(\n                        actions = all_executable_link_actions(),\n                        flag_groups = [flag_group(flags = [\"-pie\"])],\n                    ),\n                ] + ([\n                    flag_set(\n                        actions = all_link_actions(),\n                        flag_groups = 
[flag_group(flags = [\n                            \"-Wl,-z,relro,-z,now\",\n                        ])],\n                    ),\n                ] if cpu == \"local\" else []) + ([\n                    flag_set(\n                        actions = all_link_actions(),\n                        flag_groups = [\n                            flag_group(flags = [\"-Wl,--gc-sections\"]),\n                            flag_group(\n                                flags = [\"-Wl,--build-id=md5\", \"-Wl,--hash-style=gnu\"],\n                            ),\n                        ],\n                    ),\n                ] if cpu == \"local\" else []) + ([\n                    flag_set(\n                        actions = all_link_actions(),\n                        flag_groups = [flag_group(flags = [\"-undefined\", \"dynamic_lookup\"])],\n                    ),\n                ] if cpu == \"darwin\" else []) + _cuda_set(\n                    ctx.attr.cuda_path,\n                    all_link_actions(),\n                ) + [\n                    flag_set(\n                        actions = all_link_actions(),\n                        flag_groups = [\n                            _sysroot_group(),\n                        ],\n                    ),\n                ],\n            ),\n            feature(name = \"disable-assertions\"),\n            feature(\n                name = \"opt\",\n                implies = [\"disable-assertions\"],\n            ),\n            feature(name = \"fastbuild\"),\n            feature(name = \"dbg\"),\n            feature(name = \"supports_dynamic_linker\", enabled = True),\n            feature(name = \"pic\", enabled = True),\n            feature(name = \"supports_pic\", enabled = True),\n            feature(name = \"has_configured_linker_path\", enabled = True),\n        ]\n    else:\n        fail(\"Unreachable\")\n\ndef _impl(ctx):\n    cpu = ctx.attr.cpu\n    compiler = ctx.attr.compiler\n\n    if (cpu == \"darwin\"):\n        
toolchain_identifier = \"local_darwin\"\n        target_cpu = \"darwin\"\n        target_libc = \"macosx\"\n        compiler = \"compiler\"\n        action_configs = _action_configs(\n            assembly_path = ctx.attr.host_compiler_path,\n            c_compiler_path = ctx.attr.host_compiler_path,\n            cc_compiler_path = ctx.attr.host_compiler_path,\n            archiver_path = ctx.attr.host_compiler_prefix + \"/libtool\",\n            linker_path = ctx.attr.host_compiler_path,\n            strip_path = ctx.attr.host_compiler_prefix + \"/strip\",\n        )\n        artifact_name_patterns = []\n    elif (cpu == \"local\"):\n        toolchain_identifier = \"local_linux\"\n        target_cpu = \"local\"\n        target_libc = \"local\"\n        action_configs = _action_configs(\n            assembly_path = ctx.attr.host_compiler_path,\n            c_compiler_path = ctx.attr.host_compiler_path,\n            cc_compiler_path = ctx.attr.host_compiler_path,\n            archiver_path = ctx.attr.host_compiler_prefix + \"/ar\",\n            linker_path = ctx.attr.host_compiler_path,\n            strip_path = ctx.attr.host_compiler_prefix + \"/strip\",\n        )\n        artifact_name_patterns = []\n    else:\n        fail(\"Unreachable\")\n\n    out = ctx.actions.declare_file(ctx.label.name)\n    ctx.actions.write(out, \"Fake executable\")\n    return [\n        cc_common.create_cc_toolchain_config_info(\n            ctx = ctx,\n            features = _features(cpu, compiler, ctx),\n            action_configs = action_configs,\n            artifact_name_patterns = artifact_name_patterns,\n            cxx_builtin_include_directories = ctx.attr.builtin_include_directories,\n            toolchain_identifier = toolchain_identifier,\n            host_system_name = \"local\",\n            target_system_name = \"local\",\n            target_cpu = target_cpu,\n            target_libc = target_libc,\n            compiler = compiler,\n            abi_version = 
\"local\",\n            abi_libc_version = \"local\",\n            tool_paths = _tool_paths(cpu, ctx),\n            make_variables = [],\n            builtin_sysroot = ctx.attr.builtin_sysroot,\n            cc_target_os = None,\n        ),\n        DefaultInfo(\n            executable = out,\n        ),\n    ]\n\ncc_toolchain_config = rule(\n    implementation = _impl,\n    attrs = {\n        \"cpu\": attr.string(mandatory = True, values = [\"darwin\", \"local\"]),\n        \"compiler\": attr.string(values = [\"unknown\"], default = \"unknown\"),\n        \"builtin_include_directories\": attr.string_list(),\n        \"extra_no_canonical_prefixes_flags\": attr.string_list(),\n        \"host_compiler_path\": attr.string(),\n        \"host_compiler_prefix\": attr.string(),\n        \"host_compiler_warnings\": attr.string_list(),\n        \"host_unfiltered_compile_flags\": attr.string_list(),\n        \"linker_bin_path\": attr.string(),\n        \"builtin_sysroot\": attr.string(),\n        \"cuda_path\": attr.string(),\n    },\n    provides = [CcToolchainConfigInfo],\n    executable = True,\n)\n"
  },
  {
    "path": "build_deps/gpus/crosstool/crosstool_compiler_wrapper.tpl",
    "content": "#!/usr/bin/env python\n# Copyright (c) 2023, NVIDIA CORPORATION.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\n\"\"\"Crosstool wrapper for compiling CUDA programs.\n\nSYNOPSIS:\n  crosstool_compiler_wrapper [options passed in by cc_library()\n                                or cc_binary() rule]\n\nDESCRIPTION:\n  This script is expected to be called by the cc_library() or cc_binary() bazel\n  rules. When the option \"-x cuda\" is present in the list of arguments passed\n  to this script, it invokes the nvcc CUDA compiler. Most arguments are passed\n  as is as a string to --compiler-options of nvcc. 
When \"-x cuda\" is not\n  present, this wrapper invokes hybrid_driver_is_not_gcc with the input\n  arguments as is.\n\"\"\"\n\n__author__ = 'keveman@google.com (Manjunath Kudlur)'\n\nimport os\nimport pipes\nimport re\nimport subprocess\nimport sys\nfrom argparse import ArgumentParser\n\n# Template values set by cuda_autoconf.\nCPU_COMPILER = ('%{cpu_compiler}')\nGCC_HOST_COMPILER_PATH = ('%{gcc_host_compiler_path}')\n\nNVCC_PATH = '%{nvcc_path}'\nPREFIX_DIR = os.path.dirname(GCC_HOST_COMPILER_PATH)\nNVCC_VERSION = '%{cuda_version}'\n\n\ndef Log(s):\n    print('gpus/crosstool: {0}'.format(s))\n\n\ndef GetOptionValue(argv, option):\n    \"\"\"Extract the list of values for option from the argv list.\n\n  Args:\n    argv: A list of strings, possibly the argv passed to main().\n    option: The option whose value to extract, with the leading '-'.\n\n  Returns:\n    A list of values, either directly following the option,\n    (eg., -opt val1 val2) or values collected from multiple occurrences of\n    the option (eg., -opt val1 -opt val2).\n  \"\"\"\n\n    parser = ArgumentParser()\n    parser.add_argument(option, nargs='*', action='append')\n    option = option.lstrip('-').replace('-', '_')\n    args, _ = parser.parse_known_args(argv)\n    if not args or not vars(args)[option]:\n        return []\n    else:\n        return sum(vars(args)[option], [])\n\n\ndef GetHostCompilerOptions(argv):\n    \"\"\"Collect the -isystem, -iquote, and --sysroot option values from argv.\n\n  Args:\n    argv: A list of strings, possibly the argv passed to main().\n\n  Returns:\n    The string that can be used as the --compiler-options to nvcc.\n  \"\"\"\n\n    parser = ArgumentParser()\n    parser.add_argument('-isystem', nargs='*', action='append')\n    parser.add_argument('-iquote', nargs='*', action='append')\n    parser.add_argument('--sysroot', nargs=1)\n    parser.add_argument('-g', nargs='*', action='append')\n    parser.add_argument('-fno-canonical-system-headers', 
action='store_true')\n    parser.add_argument('-no-canonical-prefixes', action='store_true')\n\n    args, _ = parser.parse_known_args(argv)\n\n    opts = ''\n\n    if args.isystem:\n        opts += ' -isystem ' + ' -isystem '.join(sum(args.isystem, []))\n    if args.iquote:\n        opts += ' -iquote ' + ' -iquote '.join(sum(args.iquote, []))\n    if args.g:\n        opts += ' -g' + ' -g'.join(sum(args.g, []))\n    if args.fno_canonical_system_headers:\n        opts += ' -fno-canonical-system-headers'\n    if args.no_canonical_prefixes:\n        opts += ' -no-canonical-prefixes'\n    if args.sysroot:\n        opts += ' --sysroot ' + args.sysroot[0]\n\n    return opts\n\n\ndef _update_options(nvcc_options):\n    if NVCC_VERSION in (\"7.0\", ):\n        return nvcc_options\n\n    update_options = {\"relaxed-constexpr\": \"expt-relaxed-constexpr\"}\n    return [\n        update_options[opt] if opt in update_options else opt\n        for opt in nvcc_options\n    ]\n\n\ndef GetNvccOptions(argv):\n    \"\"\"Collect the -nvcc_options values from argv.\n\n  Args:\n    argv: A list of strings, possibly the argv passed to main().\n\n  Returns:\n    The string that can be passed directly to nvcc.\n  \"\"\"\n\n    parser = ArgumentParser()\n    parser.add_argument('-nvcc_options', nargs='*', action='append')\n\n    args, _ = parser.parse_known_args(argv)\n\n    if args.nvcc_options:\n        options = _update_options(sum(args.nvcc_options, []))\n        return ' '.join(['--' + a for a in options])\n    return ''\n\n\ndef system(cmd):\n    \"\"\"Invokes cmd with os.system().\n\n  Args:\n    cmd: The command.\n\n  Returns:\n    The exit code if the process exited with exit() or -signal\n    if the process was terminated by a signal.\n  \"\"\"\n    retv = os.system(cmd)\n    if os.WIFEXITED(retv):\n        return os.WEXITSTATUS(retv)\n    else:\n        return -os.WTERMSIG(retv)\n\n\ndef InvokeNvcc(argv, log=False):\n    \"\"\"Call nvcc with arguments assembled from argv.\n\n  
Args:\n    argv: A list of strings, possibly the argv passed to main().\n    log: True if logging is requested.\n\n  Returns:\n    The return value of calling system('nvcc ' + args)\n  \"\"\"\n\n    host_compiler_options = GetHostCompilerOptions(argv)\n    nvcc_compiler_options = GetNvccOptions(argv)\n    opt_option = GetOptionValue(argv, '-O')\n    m_options = GetOptionValue(argv, '-m')\n    m_options = ''.join([' -m' + m for m in m_options if m in ['32', '64']])\n    include_options = GetOptionValue(argv, '-I')\n    out_file = GetOptionValue(argv, '-o')\n    depfiles = GetOptionValue(argv, '-MF')\n    defines = GetOptionValue(argv, '-D')\n    defines = ''.join([' -D' + define for define in defines])\n    undefines = GetOptionValue(argv, '-U')\n    undefines = ''.join([' -U' + define for define in undefines])\n    std_options = GetOptionValue(argv, '-std')\n    nvcc_allowed_std_options = [\"c++03\", \"c++11\", \"c++14\"]\n    nvcc_std_map = {}\n    if int(NVCC_VERSION.split('.')[0]) >= 11:\n        nvcc_std_map[\"c++1z\"] = \"c++17\"\n        nvcc_allowed_std_options += [\"c++17\", \"c++1z\"]\n    std_options = ''.join([\n        ' -std=' + (nvcc_std_map[define] if define in nvcc_std_map else define)\n        for define in std_options if define in nvcc_allowed_std_options\n    ][-1:])\n    fatbin_options = ''.join([\n        ' --fatbin-options=' + option\n        for option in GetOptionValue(argv, '-Xcuda-fatbinary')\n    ])\n\n    # The list of source files get passed after the -c option. 
I don't know of\n    # any other reliable way to just get the list of source files to be compiled.\n    src_files = GetOptionValue(argv, '-c')\n\n    # Pass -w through from host to nvcc, but don't do anything fancier with\n    # warnings-related flags, since they're not necessarily the same across\n    # compilers.\n    warning_options = ' -w' if '-w' in argv else ''\n\n    if len(src_files) == 0:\n        return 1\n    if len(out_file) != 1:\n        return 1\n\n    opt = (' -O2' if\n           (len(opt_option) > 0 and int(opt_option[0]) > 0) else ' -g')\n\n    includes = (' -I ' + ' -I '.join(include_options)\n                if len(include_options) > 0 else '')\n\n    # Unfortunately, there are other options that have -c prefix too.\n    # So allowing only those look like C/C++ files.\n    src_files = [\n        f for f in src_files\n        if re.search('\\.cpp$|\\.cc$|\\.c$|\\.cxx$|\\.C|\\.cu|\\.cuh$', f)\n    ]\n    srcs = ' '.join(src_files)\n    out = ' -o ' + out_file[0]\n\n    nvccopts = '-D_FORCE_INLINES '\n    capabilities_sm = set(GetOptionValue(argv, \"--cuda-gpu-arch\"))\n    capabilities_compute = set(GetOptionValue(argv, '--cuda-include-ptx'))\n    # When both \"code=sm_xy\" and \"code=compute_xy\" are requested for a single\n    # arch, they can be combined using \"code=xy,compute_xy\" which avoids a\n    # redundant PTX generation during compilation.\n    capabilities_both = capabilities_sm.intersection(capabilities_compute)\n    for capability in capabilities_both:\n        capability = capability[len('sm_'):]\n        nvccopts += r'-gencode=arch=compute_%s,code=\\\"sm_%s,compute_%s\\\" ' % (\n            capability, capability, capability)\n    for capability in capabilities_sm - capabilities_both:\n        capability = capability[len('sm_'):]\n        nvccopts += r'-gencode=arch=compute_%s,\\\"code=sm_%s\\\" ' % (capability,\n                                                                   capability)\n    for capability in 
capabilities_compute - capabilities_both:\n        capability = capability[len('sm_'):]\n        nvccopts += r'-gencode=arch=compute_%s,\\\"code=compute_%s\\\" ' % (\n            capability, capability)\n    nvccopts += nvcc_compiler_options\n    nvccopts += undefines\n    nvccopts += defines\n    nvccopts += std_options\n    nvccopts += m_options\n    nvccopts += warning_options\n    # Force C++17 dialect (note, everything in just one string!)\n    nvccopts += ' --std c++17 '\n    nvccopts += fatbin_options\n\n    if depfiles:\n        # Generate the dependency file\n        depfile = depfiles[0]\n        cmd = (NVCC_PATH + ' ' + nvccopts + ' --compiler-options \"' +\n               host_compiler_options + '\"' + ' --compiler-bindir=' +\n               GCC_HOST_COMPILER_PATH + ' -I .' + ' -x cu ' + opt + includes +\n               ' ' + srcs + ' -M -o ' + depfile)\n        if log:\n            Log(cmd)\n        exit_status = system(cmd)\n        if exit_status != 0:\n            return exit_status\n\n    cmd = (NVCC_PATH + ' ' + nvccopts + ' --compiler-options \"' +\n           host_compiler_options + ' -fPIC\"' + ' --compiler-bindir=' +\n           GCC_HOST_COMPILER_PATH + ' -I .' 
+ ' -x cu ' + opt + includes +\n           ' -c ' + srcs + out)\n\n    # TODO(zhengxq): for some reason, 'gcc' needs this help to find 'as'.\n    # Need to investigate and fix.\n    cmd = 'PATH=' + PREFIX_DIR + ':$PATH ' + cmd\n    if log:\n        Log(cmd)\n    return system(cmd)\n\n\ndef main():\n    parser = ArgumentParser()\n    parser.add_argument('-x', nargs=1)\n    parser.add_argument('--cuda_log', action='store_true')\n    args, leftover = parser.parse_known_args(sys.argv[1:])\n\n    if args.x and args.x[0] == 'cuda':\n        if args.cuda_log:\n            Log('-x cuda')\n        leftover = [pipes.quote(s) for s in leftover]\n        if args.cuda_log:\n            Log('using nvcc')\n        return InvokeNvcc(leftover, log=args.cuda_log)\n\n    # Strip our flags before passing through to the CPU compiler for files which\n    # are not -x cuda. We can't just pass 'leftover' because it also strips -x.\n    # We not only want to pass -x to the CPU compiler, but also keep it in its\n    # relative location in the argv list (the compiler is actually sensitive to\n    # this).\n    cpu_compiler_flags = [\n        flag for flag in sys.argv[1:] if not flag.startswith(('--cuda_log'))\n    ]\n\n    return subprocess.call([CPU_COMPILER] + cpu_compiler_flags)\n\n\nif __name__ == '__main__':\n    sys.exit(main())\n"
  },
  {
    "path": "build_deps/gpus/cuda/BUILD",
    "content": ""
  },
  {
    "path": "build_deps/gpus/cuda/BUILD.tpl",
    "content": "load(\":build_defs.bzl\", \"cuda_header_library\")\nload(\"@bazel_skylib//:bzl_library.bzl\", \"bzl_library\")\nload(\"@bazel_skylib//lib:selects.bzl\", \"selects\")\nload(\"@bazel_skylib//rules:common_settings.bzl\", \"bool_flag\")\n\nlicenses([\"restricted\"])  # MPL2, portions GPL v3, LGPL v3, BSD-like\n\npackage(default_visibility = [\"//visibility:public\"])\n\nbool_flag(\n    name = \"enable_cuda\",\n    build_setting_default = False,\n)\n\nconfig_setting(\n    name = \"is_cuda_enabled\",\n    flag_values = {\":enable_cuda\": \"True\"},\n)\n\n\n# Config setting whether built with CUDA support using nvcc.\n#\n# TODO(b/174244321), DEPRECATED: this target will be removed when all users\n# have been converted to :is_cuda_enabled (most) or :is_cuda_compiler_nvcc.\nselects.config_setting_group(\n    name = \"using_nvcc\",\n    match_all = [\n        \"//:is_cuda_enabled\",\n        \"//:is_cuda_compiler_nvcc\",\n    ],\n)\n\nconfig_setting(\n    name = \"_opt\",\n    values = {\"compilation_mode\": \"opt\"},\n    visibility = [\"//visibility:private\"],\n)\n\n# Provides CUDA headers for '#include \"third_party/gpus/cuda/include/cuda.h\"'\n# All clients including TensorFlow should use these directives.\ncuda_header_library(\n    name = \"cuda_headers\",\n    hdrs = [\n        \"cuda/cuda_config.h\",\n        \":cuda-include\",\n    ],\n    include_prefix = \"third_party/gpus\",\n    includes = [\n        \".\",  # required to include cuda/cuda/cuda_config.h as cuda/config.h\n        \"cuda/include\",\n    ],\n)\n\ncc_library(\n    name = \"cudart_static\",\n    srcs = [\"cuda/lib/%{cudart_static_lib}\"],\n    linkopts = [\n        \"-ldl\",\n        \"-lpthread\",\n        %{cudart_static_linkopt}\n    ],\n)\n\ncc_library(\n    name = \"cuda_driver\",\n    srcs = [\"cuda/lib/%{cuda_driver_lib}\"],\n)\n\ncc_library(\n    name = \"cudart\",\n    srcs = [\"cuda/lib/%{cudart_lib}\"],\n    data = [\"cuda/lib/%{cudart_lib}\"],\n    linkstatic = 
1,\n)\n\ncuda_header_library(\n    name = \"cublas_headers\",\n    hdrs = [\":cublas-include\"],\n    include_prefix = \"third_party/gpus/cuda/include\",\n    includes = [\"cublas/include\"],\n    strip_include_prefix = \"cublas/include\",\n    deps = [\":cuda_headers\"],\n)\n\ncuda_header_library(\n    name = \"cusolver_headers\",\n    hdrs = [\":cusolver-include\"],\n    include_prefix = \"third_party/gpus/cuda/include\",\n    includes = [\"cusolver/include\"],\n    strip_include_prefix = \"cusolver/include\",\n    deps = [\":cuda_headers\"],\n)\n\ncuda_header_library(\n    name = \"cufft_headers\",\n    hdrs = [\":cufft-include\"],\n    include_prefix = \"third_party/gpus/cuda/include\",\n    includes = [\"cufft/include\"],\n    strip_include_prefix = \"cufft/include\",\n    deps = [\":cuda_headers\"],\n)\n\ncuda_header_library(\n    name = \"cusparse_headers\",\n    hdrs = [\":cusparse-include\"],\n    include_prefix = \"third_party/gpus/cuda/include\",\n    includes = [\"cusparse/include\"],\n    strip_include_prefix = \"cusparse/include\",\n    deps = [\":cuda_headers\"],\n)\n\ncuda_header_library(\n    name = \"curand_headers\",\n    hdrs = [\":curand-include\"],\n    include_prefix = \"third_party/gpus/cuda/include\",\n    includes = [\"curand/include\"],\n    strip_include_prefix = \"curand/include\",\n    deps = [\":cuda_headers\"],\n)\n\ncc_library(\n    name = \"cublas\",\n    srcs = [\"cuda/lib/%{cublas_lib}\"],\n    data = [\"cuda/lib/%{cublas_lib}\"],\n    linkstatic = 1,\n)\n\ncc_library(\n    name = \"cublasLt\",\n    srcs = [\"cuda/lib/%{cublasLt_lib}\"],\n    data = [\"cuda/lib/%{cublasLt_lib}\"],\n    linkstatic = 1,\n)\n\ncc_library(\n    name = \"cusolver\",\n    srcs = [\"cuda/lib/%{cusolver_lib}\"],\n    data = [\"cuda/lib/%{cusolver_lib}\"],\n    linkopts = [\"-lgomp\"],\n    linkstatic = 1,\n)\n\ncc_library(\n    name = \"cudnn\",\n    srcs = [\"cuda/lib/%{cudnn_lib}\"],\n    data = [\"cuda/lib/%{cudnn_lib}\"],\n    linkstatic = 
1,\n)\n\ncc_library(\n    name = \"cudnn_header\",\n    hdrs = [\":cudnn-include\"],\n    include_prefix = \"third_party/gpus/cudnn\",\n    strip_include_prefix = \"cudnn/include\",\n    deps = [\":cuda_headers\"],\n)\n\ncc_library(\n    name = \"cufft\",\n    srcs = [\"cuda/lib/%{cufft_lib}\"],\n    data = [\"cuda/lib/%{cufft_lib}\"],\n    linkstatic = 1,\n)\n\ncc_library(\n    name = \"curand\",\n    srcs = [\"cuda/lib/%{curand_lib}\"],\n    data = [\"cuda/lib/%{curand_lib}\"],\n    linkstatic = 1,\n)\n\ncc_library(\n    name = \"cuda\",\n    deps = [\n        \":cublas\",\n        \":cublasLt\",\n        \":cuda_headers\",\n        \":cudart\",\n        \":cudnn\",\n        \":cufft\",\n        \":curand\",\n    ],\n)\n\nalias(\n    name = \"cub_headers\",\n    actual = \"%{cub_actual}\",\n)\n\ncuda_header_library(\n    name = \"cupti_headers\",\n    hdrs = [\":cuda-extras\"],\n    include_prefix = \"third_party/gpus\",\n    includes = [\"cuda/extras/CUPTI/include/\"],\n    deps = [\":cuda_headers\"],\n)\n\ncc_library(\n    name = \"cupti_dsos\",\n    data = [\"cuda/lib/%{cupti_lib}\"],\n)\n\ncc_library(\n    name = \"cusparse\",\n    srcs = [\"cuda/lib/%{cusparse_lib}\"],\n    data = [\"cuda/lib/%{cusparse_lib}\"],\n    linkopts = [\"-lgomp\"],\n    linkstatic = 1,\n)\n\ncc_library(\n    name = \"libdevice_root\",\n    data = [\":cuda-nvvm\"],\n)\n\nbzl_library(\n    name = \"build_defs_bzl\",\n    srcs = [\"build_defs.bzl\"],\n    deps = [\n        \"@bazel_skylib//lib:selects\",\n    ],\n)\n\npy_library(\n    name = \"cuda_config_py\",\n    srcs = [\"cuda/cuda_config.py\"],\n)\n\n%{copy_rules}\n"
  },
  {
    "path": "build_deps/gpus/cuda/build_defs.bzl.tpl",
    "content": "# Macros for building CUDA code.\ndef cuda_default_copts():\n    \"\"\"Default options for all CUDA compilations.\"\"\"\n    return [\n        \"-x\",\n        \"cuda\",\n        \"-DUSE_CUDA=1\",\n        \"-Xcuda-fatbinary=--compress-all\",\n    ] + %{cuda_extra_copts}\n\n\ndef cuda_gpu_architectures():\n    \"\"\"Returns a list of supported GPU architectures.\"\"\"\n    return %{cuda_gpu_architectures}\n\n\ndef cuda_header_library(name,\n                        hdrs,\n                        include_prefix=None,\n                        strip_include_prefix=None,\n                        deps=[],\n                        **kwargs):\n    \"\"\"Generates a cc_library containing both virtual and system include paths.\n\n    Generates both a header-only target with virtual includes plus the full\n    target without virtual includes. This works around the fact that bazel can't\n    mix 'includes' and 'include_prefix' in the same target.\"\"\"\n\n    native.cc_library(\n        name=name + \"_virtual\",\n        hdrs=hdrs,\n        include_prefix=include_prefix,\n        strip_include_prefix=strip_include_prefix,\n        deps=deps,\n        visibility=[\"//visibility:private\"],\n    )\n\n    native.cc_library(name=name,\n                      textual_hdrs=hdrs,\n                      deps=deps + [\":%s_virtual\" % name],\n                      **kwargs)\n\n\ndef cuda_cc_library(copts=[], **kwargs):\n    \"\"\"Wrapper over cc_library which adds default CUDA options.\"\"\"\n    native.cc_library(copts=cuda_default_copts() + copts, **kwargs)\n\n\ndef cuda_cc_binary(copts=[], **kwargs):\n    \"\"\"Wrapper over cc_library which adds default CUDA options.\"\"\"\n    native.cc_binary(copts=cuda_default_copts() + copts, **kwargs)\n\n\ndef cuda_cc_test(copts=[], **kwargs):\n    \"\"\"Wrapper over cc_test which adds default CUDA options.\"\"\"\n    native.cc_test(copts=copts, **kwargs)\n"
  },
  {
    "path": "build_deps/gpus/cuda/cuda_config.h.tpl",
    "content": "/*\n * Copyright (c) 2023, NVIDIA CORPORATION.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n#ifndef CUDA_CUDA_CONFIG_H_\n#define CUDA_CUDA_CONFIG_H_\n\n#define CUDA_VERSION \"%{cuda_version}\"\n#define CUDART_VERSION \"%{cudart_version}\"\n#define CUPTI_VERSION \"%{cupti_version}\"\n#define CUBLAS_VERSION \"%{cublas_version}\"\n#define CUSOLVER_VERSION \"%{cusolver_version}\"\n#define CURAND_VERSION \"%{curand_version}\"\n#define CUFFT_VERSION \"%{cufft_version}\"\n#define CUSPARSE_VERSION \"%{cusparse_version}\"\n#define CUDNN_VERSION \"%{cudnn_version}\"\n\n#define CUDA_TOOLKIT_PATH \"%{cuda_toolkit_path}\"\n\n#define CUDA_COMPUTE_CAPABILITIES %{cuda_compute_capabilities}\n\n#endif  // CUDA_CUDA_CONFIG_H_\n"
  },
  {
    "path": "build_deps/gpus/cuda/cuda_config.py.tpl",
    "content": "# Copyright (c) 2023, NVIDIA CORPORATION.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\nconfig = %{cuda_config}\n"
  },
  {
    "path": "build_deps/gpus/find_cuda_config.py",
    "content": "# Copyright (c) 2023, NVIDIA CORPORATION.\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\n\"\"\"Prints CUDA library and header directories and versions found on the system.\n\nThe script searches for CUDA library and header files on the system, inspects\nthem to determine their version and prints the configuration to stdout.\nThe paths to inspect and the required versions are specified through environment\nvariables. If no valid configuration is found, the script prints to stderr and\nreturns an error code.\n\nThe list of libraries to find is specified as arguments. Supported libraries are\nCUDA (includes cuBLAS), cuDNN, NCCL, and TensorRT.\n\nThe script takes a list of base directories specified by the CUDA_PATHS\nenvironment variable as comma-separated glob list. The script looks for headers\nand library files in a hard-coded set of subdirectories from these base paths.\nIf CUDA_PATHS is not specified, a OS specific default is used:\n\n  Linux:   /usr/local/cuda, /usr, and paths from 'ldconfig -p'.\n  Windows: CUDA_PATH environment variable, or\n           C:\\\\Program Files\\\\NVIDIA GPU Computing Toolkit\\\\CUDA\\\\*\n\nFor backwards compatibility, some libraries also use alternative base\ndirectories from other environment variables if they are specified. 
List of\nlibrary-specific environment variables:\n\n  Library   Version env variable  Additional base directories\n  ----------------------------------------------------------------\n  CUDA      CUDA_VERSION       CUDA_TOOLKIT_PATH\n  cuBLAS    CUBLAS_VERSION     CUDA_TOOLKIT_PATH\n  cuDNN     CUDNN_VERSION      CUDNN_INSTALL_PATH\n  NCCL      NCCL_VERSION       NCCL_INSTALL_PATH, NCCL_HDR_PATH\n  TensorRT  TENSORRT_VERSION   TENSORRT_INSTALL_PATH\n\nVersions environment variables can be of the form 'x' or 'x.y' to request a\nspecific version, empty or unspecified to accept any version.\n\nThe output of a found library is of the form:\ntf_<library>_version: x.y.z\ntf_<library>_header_dir: ...\ntf_<library>_library_dir: ...\n\"\"\"\n\nimport glob\nimport io\nimport os\nimport platform\nimport re\nimport subprocess\nimport sys\n\n# pylint: disable=g-import-not-at-top\ntry:\n    from shutil import which\nexcept ImportError:\n    from distutils.spawn import find_executable as which\n# pylint: enable=g-import-not-at-top\n\n\nclass ConfigError(Exception):\n    pass\n\n\ndef _is_linux():\n    return platform.system() == \"Linux\"\n\n\ndef _is_macos():\n    return platform.system() == \"Darwin\"\n\n\ndef _matches_version(actual_version, required_version):\n    \"\"\"Checks whether some version meets the requirements.\n\n      All elements of the required_version need to be present in the\n      actual_version.\n\n          required_version  actual_version  result\n          -----------------------------------------\n          1                 1.1             True\n          1.2               1               False\n          1.2               1.3             False\n                            1               True\n\n      Args:\n        required_version: The version specified by the user.\n        actual_version: The version detected from the CUDA installation.\n      Returns: Whether the actual version matches the required one.\n  \"\"\"\n    if actual_version is None:\n  
      return False\n\n    # Strip spaces from the versions.\n    actual_version = actual_version.strip()\n    required_version = required_version.strip()\n    return actual_version.startswith(required_version)\n\n\ndef _at_least_version(actual_version, required_version):\n    actual = [int(v) for v in actual_version.split(\".\")]\n    required = [int(v) for v in required_version.split(\".\")]\n    return actual >= required\n\n\ndef _get_header_version(path, name):\n    \"\"\"Returns preprocessor defines in C header file.\"\"\"\n    for line in io.open(path, \"r\", encoding=\"utf-8\").readlines():\n        match = re.match(\"\\s*#\\s*define %s\\s+(\\d+)\" % name, line)\n        if match:\n            return match.group(1)\n    return \"\"\n\n\ndef _cartesian_product(first, second):\n    \"\"\"Returns all path combinations of first and second.\"\"\"\n    return [os.path.join(f, s) for f in first for s in second]\n\n\ndef _get_ld_config_paths():\n    \"\"\"Returns all directories from 'ldconfig -p'.\"\"\"\n    if not _is_linux():\n        return []\n    ldconfig_path = which(\"ldconfig\") or \"/sbin/ldconfig\"\n    output = subprocess.check_output([ldconfig_path, \"-p\"])\n    pattern = re.compile(\".* => (.*)\")\n    result = set()\n    for line in output.splitlines():\n        try:\n            match = pattern.match(line.decode(\"ascii\"))\n        except UnicodeDecodeError:\n            match = False\n        if match:\n            result.add(os.path.dirname(match.group(1)))\n    return sorted(list(result))\n\n\ndef _get_default_cuda_paths(cuda_version):\n    if not cuda_version:\n        cuda_version = \"*\"\n    elif not \".\" in cuda_version:\n        cuda_version = cuda_version + \".*\"\n\n    return [\n        \"/usr/local/cuda-%s\" % cuda_version, \"/usr/local/cuda\", \"/usr\",\n        \"/usr/local/cudnn\"\n    ] + _get_ld_config_paths()\n\n\ndef _header_paths():\n    \"\"\"Returns hard-coded set of relative paths to look for header files.\"\"\"\n    return 
[\n        \"\",\n        \"include\",\n        \"include/cuda\",\n        \"include/*-linux-gnu\",\n        \"extras/CUPTI/include\",\n        \"include/cuda/CUPTI\",\n        \"local/cuda/extras/CUPTI/include\",\n    ]\n\n\ndef _library_paths():\n    \"\"\"Returns hard-coded set of relative paths to look for library files.\"\"\"\n    return [\n        \"\",\n        \"lib64\",\n        \"lib\",\n        \"lib/*-linux-gnu\",\n        \"lib/x64\",\n        \"extras/CUPTI/*\",\n        \"local/cuda/lib64\",\n        \"local/cuda/extras/CUPTI/lib64\",\n    ]\n\n\ndef _not_found_error(base_paths, relative_paths, filepattern):\n    base_paths = \"\".join(\n        [\"\\n        '%s'\" % path for path in sorted(base_paths)])\n    relative_paths = \"\".join(\n        [\"\\n        '%s'\" % path for path in relative_paths])\n    return ConfigError(\n        \"Could not find any %s in any subdirectory:%s\\nof:%s\\n\" %\n        (filepattern, relative_paths, base_paths))\n\n\ndef _find_file(base_paths, relative_paths, filepattern):\n    for path in _cartesian_product(base_paths, relative_paths):\n        for file in glob.glob(os.path.join(path, filepattern)):\n            return file\n    raise _not_found_error(base_paths, relative_paths, filepattern)\n\n\ndef _find_library(base_paths, library_name, required_version):\n    \"\"\"Returns first valid path to the requested library.\"\"\"\n    if _is_macos():\n        filepattern = \"%s*.dylib\" % (\".\".join([\"lib\" + library_name] +\n                                              required_version.split(\".\")[:1]))\n    else:\n        filepattern = \".\".join([\"lib\" + library_name, \"so\"] +\n                               required_version.split(\".\")[:1]) + \"*\"\n    return _find_file(base_paths, _library_paths(), filepattern)\n\n\ndef _find_versioned_file(base_paths, relative_paths, filepatterns,\n                         required_version, get_version):\n    \"\"\"Returns first valid path to a file that matches the 
requested version.\"\"\"\n    if type(filepatterns) not in [list, tuple]:\n        filepatterns = [filepatterns]\n    for path in _cartesian_product(base_paths, relative_paths):\n        for filepattern in filepatterns:\n            for file in glob.glob(os.path.join(path, filepattern)):\n                actual_version = get_version(file)\n                if _matches_version(actual_version, required_version):\n                    return file, actual_version\n    raise _not_found_error(\n        base_paths, relative_paths,\n        \", \".join(filepatterns) + \" matching version '%s'\" % required_version)\n\n\ndef _find_header(base_paths, header_name, required_version, get_version):\n    \"\"\"Returns first valid path to a header that matches the requested version.\"\"\"\n    return _find_versioned_file(base_paths, _header_paths(), header_name,\n                                required_version, get_version)\n\n\ndef _find_cuda_config(base_paths, required_version):\n\n    def get_header_version(path):\n        version = int(_get_header_version(path, \"CUDA_VERSION\"))\n        if not version:\n            return None\n        return \"%d.%d\" % (version // 1000, version % 1000 // 10)\n\n    cuda_header_path, header_version = _find_header(base_paths, \"cuda.h\",\n                                                    required_version,\n                                                    get_header_version)\n    cuda_version = header_version  # x.y, see above.\n\n    cuda_library_path = _find_library(base_paths, \"cudart\", cuda_version)\n\n    def get_nvcc_version(path):\n        pattern = \"Cuda compilation tools, release \\d+\\.\\d+, V(\\d+\\.\\d+\\.\\d+)\"\n        for line in subprocess.check_output([path, \"--version\"]).splitlines():\n            match = re.match(pattern, line.decode(\"ascii\"))\n            if match:\n                return match.group(1)\n        return None\n\n    nvcc_name = \"nvcc\"\n    nvcc_path, nvcc_version = 
_find_versioned_file(base_paths, [\n        \"\",\n        \"bin\",\n        \"local/cuda/bin\",\n    ], nvcc_name, cuda_version, get_nvcc_version)\n\n    nvvm_path = _find_file(base_paths, [\n        \"nvvm/libdevice\",\n        \"share/cuda\",\n        \"lib/nvidia-cuda-toolkit/libdevice\",\n        \"local/cuda/nvvm/libdevice\",\n    ], \"libdevice*.10.bc\")\n\n    cupti_header_path = _find_file(base_paths, _header_paths(), \"cupti.h\")\n    cupti_library_path = _find_library(base_paths, \"cupti\", required_version)\n\n    cuda_binary_dir = os.path.dirname(nvcc_path)\n    nvvm_library_dir = os.path.dirname(nvvm_path)\n\n    # XLA requires the toolkit path to find ptxas and libdevice.\n    # TODO(csigg): pass in both directories instead.\n    cuda_toolkit_paths = (\n        os.path.normpath(os.path.join(cuda_binary_dir, \"..\")),\n        os.path.normpath(os.path.join(nvvm_library_dir, \"../..\")),\n    )\n    if cuda_toolkit_paths[0] != cuda_toolkit_paths[1]:\n        raise ConfigError(\"Inconsistent CUDA toolkit path: %s vs %s\" %\n                          cuda_toolkit_paths)\n\n    return {\n        \"cuda_version\": cuda_version,\n        \"cuda_include_dir\": os.path.dirname(cuda_header_path),\n        \"cuda_library_dir\": os.path.dirname(cuda_library_path),\n        \"cuda_binary_dir\": cuda_binary_dir,\n        \"nvvm_library_dir\": nvvm_library_dir,\n        \"cupti_include_dir\": os.path.dirname(cupti_header_path),\n        \"cupti_library_dir\": os.path.dirname(cupti_library_path),\n        \"cuda_toolkit_path\": cuda_toolkit_paths[0],\n    }\n\n\ndef _find_cublas_config(base_paths, required_version, cuda_version):\n\n    if _at_least_version(cuda_version, \"10.1\"):\n\n        def get_header_version(path):\n            version = (v for v in (_get_header_version(path, name)\n                                   for name in (\"CUBLAS_VER_MAJOR\",\n                                                \"CUBLAS_VER_MINOR\",\n                                      
          \"CUBLAS_VER_PATCH\",\n                                                \"CUBLAS_VER_BUILD\")) if v != \"\")\n            return \".\".join(version)\n\n        header_path, header_version = _find_header(base_paths, \"cublas_api.h\",\n                                                   required_version,\n                                                   get_header_version)\n        # cuBLAS uses the major version only.\n        cublas_version = header_version.split(\".\")[0]\n\n    else:\n        # There is no version info available before CUDA 10.1, just find the file.\n        header_version = cuda_version\n        header_path = _find_file(base_paths, _header_paths(), \"cublas_api.h\")\n        # cuBLAS version is the same as CUDA version (x.y).\n        cublas_version = required_version\n\n    library_path = _find_library(base_paths, \"cublas\", cublas_version)\n\n    return {\n        \"cublas_version\": header_version,\n        \"cublas_include_dir\": os.path.dirname(header_path),\n        \"cublas_library_dir\": os.path.dirname(library_path),\n    }\n\n\ndef _find_cusolver_config(base_paths, required_version, cuda_version):\n\n    if _at_least_version(cuda_version, \"11.0\"):\n\n        def get_header_version(path):\n            version = (v for v in (_get_header_version(path, name)\n                                   for name in (\"CUSOLVER_VER_MAJOR\",\n                                                \"CUSOLVER_VER_MINOR\",\n                                                \"CUSOLVER_VER_PATCH\",\n                                                \"CUSOLVER_VER_BUILD\")) if v != \"\")\n            return \".\".join(version)\n\n        header_path, header_version = _find_header(base_paths,\n                                                   \"cusolver_common.h\",\n                                                   required_version,\n                                                   get_header_version)\n        cusolver_version = 
header_version.split(\".\")[0]\n\n    else:\n        header_version = cuda_version\n        header_path = _find_file(base_paths, _header_paths(),\n                                 \"cusolver_common.h\")\n        cusolver_version = required_version\n\n    library_path = _find_library(base_paths, \"cusolver\", cusolver_version)\n\n    return {\n        \"cusolver_version\": header_version,\n        \"cusolver_include_dir\": os.path.dirname(header_path),\n        \"cusolver_library_dir\": os.path.dirname(library_path),\n    }\n\n\ndef _find_curand_config(base_paths, required_version, cuda_version):\n\n    if _at_least_version(cuda_version, \"11.0\"):\n\n        def get_header_version(path):\n            version = (v for v in (_get_header_version(path, name)\n                                   for name in (\"CURAND_VER_MAJOR\",\n                                                \"CURAND_VER_MINOR\",\n                                                \"CURAND_VER_PATCH\",\n                                                \"CURAND_VER_BUILD\")) if v != \"\")\n            return \".\".join(version)\n\n        header_path, header_version = _find_header(base_paths, \"curand.h\",\n                                                   required_version,\n                                                   get_header_version)\n        curand_version = header_version.split(\".\")[0]\n\n    else:\n        header_version = cuda_version\n        header_path = _find_file(base_paths, _header_paths(), \"curand.h\")\n        curand_version = required_version\n\n    library_path = _find_library(base_paths, \"curand\", curand_version)\n\n    return {\n        \"curand_version\": header_version,\n        \"curand_include_dir\": os.path.dirname(header_path),\n        \"curand_library_dir\": os.path.dirname(library_path),\n    }\n\n\ndef _find_cufft_config(base_paths, required_version, cuda_version):\n\n    if _at_least_version(cuda_version, \"11.0\"):\n\n        def get_header_version(path):\n      
      version = (v for v in (_get_header_version(path, name)\n                                   for name in (\"CUFFT_VER_MAJOR\",\n                                                \"CUFFT_VER_MINOR\",\n                                                \"CUFFT_VER_PATCH\",\n                                                \"CUFFT_VER_BUILD\")) if v != \"\")\n            return \".\".join(version)\n\n        header_path, header_version = _find_header(base_paths, \"cufft.h\",\n                                                   required_version,\n                                                   get_header_version)\n        cufft_version = header_version.split(\".\")[0]\n\n    else:\n        header_version = cuda_version\n        header_path = _find_file(base_paths, _header_paths(), \"cufft.h\")\n        cufft_version = required_version\n\n    library_path = _find_library(base_paths, \"cufft\", cufft_version)\n\n    return {\n        \"cufft_version\": header_version,\n        \"cufft_include_dir\": os.path.dirname(header_path),\n        \"cufft_library_dir\": os.path.dirname(library_path),\n    }\n\n\ndef _find_cudnn_config(base_paths, required_version):\n\n    def get_header_version(path):\n        version = [\n            _get_header_version(path, name)\n            for name in (\"CUDNN_MAJOR\", \"CUDNN_MINOR\", \"CUDNN_PATCHLEVEL\")\n        ]\n        return \".\".join(version) if version[0] else None\n\n    header_path, header_version = _find_header(base_paths,\n                                               (\"cudnn.h\", \"cudnn_version.h\"),\n                                               required_version,\n                                               get_header_version)\n    cudnn_version = header_version.split(\".\")[0]\n\n    library_path = _find_library(base_paths, \"cudnn\", cudnn_version)\n\n    return {\n        \"cudnn_version\": cudnn_version,\n        \"cudnn_include_dir\": os.path.dirname(header_path),\n        \"cudnn_library_dir\": 
os.path.dirname(library_path),\n    }\n\n\ndef _find_cusparse_config(base_paths, required_version, cuda_version):\n\n    if _at_least_version(cuda_version, \"11.0\"):\n\n        def get_header_version(path):\n            version = (v for v in (_get_header_version(path, name)\n                                   for name in (\"CUSPARSE_VER_MAJOR\",\n                                                \"CUSPARSE_VER_MINOR\",\n                                                \"CUSPARSE_VER_PATCH\",\n                                                \"CUSPARSE_VER_BUILD\")) if v != \"\")\n            return \".\".join(version)\n\n        header_path, header_version = _find_header(base_paths, \"cusparse.h\",\n                                                   required_version,\n                                                   get_header_version)\n        cusparse_version = header_version.split(\".\")[0]\n\n    else:\n        header_version = cuda_version\n        header_path = _find_file(base_paths, _header_paths(), \"cusparse.h\")\n        cusparse_version = required_version\n\n    library_path = _find_library(base_paths, \"cusparse\", cusparse_version)\n\n    return {\n        \"cusparse_version\": header_version,\n        \"cusparse_include_dir\": os.path.dirname(header_path),\n        \"cusparse_library_dir\": os.path.dirname(library_path),\n    }\n\n\ndef _find_nccl_config(base_paths, required_version):\n\n    def get_header_version(path):\n        version = (_get_header_version(path, name)\n                   for name in (\"NCCL_MAJOR\", \"NCCL_MINOR\", \"NCCL_PATCH\"))\n        return \".\".join(version)\n\n    header_path, header_version = _find_header(base_paths, \"nccl.h\",\n                                               required_version,\n                                               get_header_version)\n    nccl_version = header_version.split(\".\")[0]\n\n    library_path = _find_library(base_paths, \"nccl\", nccl_version)\n\n    return {\n        
\"nccl_version\": nccl_version,\n        \"nccl_include_dir\": os.path.dirname(header_path),\n        \"nccl_library_dir\": os.path.dirname(library_path),\n    }\n\n\ndef _find_tensorrt_config(base_paths, required_version):\n\n    def get_header_version(path):\n        version = (_get_header_version(path, name)\n                   for name in (\"NV_TENSORRT_MAJOR\", \"NV_TENSORRT_MINOR\",\n                                \"NV_TENSORRT_PATCH\"))\n        # `version` is a generator object, so we convert it to a list before using\n        # it (muitiple times below).\n        version = list(version)\n        if not all(version):\n            # Versions not found, make _matches_version returns False.\n            return None\n        return \".\".join(version)\n\n    header_path, header_version = _find_header(base_paths, \"NvInferVersion.h\",\n                                               required_version,\n                                               get_header_version)\n\n    tensorrt_version = header_version.split(\".\")[0]\n    library_path = _find_library(base_paths, \"nvinfer\", tensorrt_version)\n\n    return {\n        \"tensorrt_version\": tensorrt_version,\n        \"tensorrt_include_dir\": os.path.dirname(header_path),\n        \"tensorrt_library_dir\": os.path.dirname(library_path),\n    }\n\n\ndef _list_from_env(env_name, default=[]):\n    \"\"\"Returns comma-separated list from environment variable.\"\"\"\n    if env_name in os.environ:\n        return os.environ[env_name].split(\",\")\n    return default\n\n\ndef _get_legacy_path(env_name, default=[]):\n    \"\"\"Returns a path specified by a legacy environment variable.\n\n  CUDNN_INSTALL_PATH, NCCL_INSTALL_PATH, TENSORRT_INSTALL_PATH set to\n  '/usr/lib/x86_64-linux-gnu' would previously find both library and header\n  paths. 
Detect those and return '/usr', otherwise forward to _list_from_env().\n  \"\"\"\n    if env_name in os.environ:\n        match = re.match(\"^(/[^/ ]*)+/lib/\\w+-linux-gnu/?$\",\n                         os.environ[env_name])\n        if match:\n            return [match.group(1)]\n    return _list_from_env(env_name, default)\n\n\ndef _normalize_path(path):\n    \"\"\"Returns normalized path, with forward slashes on Windows.\"\"\"\n    return os.path.realpath(path)\n\n\ndef find_cuda_config():\n    \"\"\"Returns a dictionary of CUDA library and header file paths.\"\"\"\n    libraries = [argv.lower() for argv in sys.argv[1:]]\n    cuda_version = os.environ.get(\"CUDA_VERSION\", \"\")\n    base_paths = _list_from_env(\"CUDA_PATHS\",\n                                _get_default_cuda_paths(cuda_version))\n    base_paths = [path for path in base_paths if os.path.exists(path)]\n\n    result = {}\n    if \"cuda\" in libraries:\n        cuda_paths = _list_from_env(\"CUDA_TOOLKIT_PATH\", base_paths)\n        result.update(_find_cuda_config(cuda_paths, cuda_version))\n\n        cuda_version = result[\"cuda_version\"]\n        cublas_paths = base_paths\n        if tuple(int(v) for v in cuda_version.split(\".\")) < (10, 1):\n            # Before CUDA 10.1, cuBLAS was in the same directory as the toolkit.\n            cublas_paths = cuda_paths\n        cublas_version = os.environ.get(\"CUBLAS_VERSION\", \"\")\n        result.update(\n            _find_cublas_config(cublas_paths, cublas_version, cuda_version))\n\n        cusolver_paths = base_paths\n        if tuple(int(v) for v in cuda_version.split(\".\")) < (11, 0):\n            cusolver_paths = cuda_paths\n        cusolver_version = os.environ.get(\"CUSOLVER_VERSION\", \"\")\n        result.update(\n            _find_cusolver_config(cusolver_paths, cusolver_version,\n                                  cuda_version))\n\n        curand_paths = base_paths\n        if tuple(int(v) for v in cuda_version.split(\".\")) < (11, 0):\n 
           curand_paths = cuda_paths\n        curand_version = os.environ.get(\"CURAND_VERSION\", \"\")\n        result.update(\n            _find_curand_config(curand_paths, curand_version, cuda_version))\n\n        cufft_paths = base_paths\n        if tuple(int(v) for v in cuda_version.split(\".\")) < (11, 0):\n            cufft_paths = cuda_paths\n        cufft_version = os.environ.get(\"CUFFT_VERSION\", \"\")\n        result.update(\n            _find_cufft_config(cufft_paths, cufft_version, cuda_version))\n\n        cusparse_paths = base_paths\n        if tuple(int(v) for v in cuda_version.split(\".\")) < (11, 0):\n            cusparse_paths = cuda_paths\n        cusparse_version = os.environ.get(\"CUSPARSE_VERSION\", \"\")\n        result.update(\n            _find_cusparse_config(cusparse_paths, cusparse_version,\n                                  cuda_version))\n\n    if \"cudnn\" in libraries:\n        cudnn_paths = _get_legacy_path(\"CUDNN_INSTALL_PATH\", base_paths)\n        cudnn_version = os.environ.get(\"CUDNN_VERSION\", \"\")\n        result.update(_find_cudnn_config(cudnn_paths, cudnn_version))\n\n    if \"nccl\" in libraries:\n        nccl_paths = _get_legacy_path(\"NCCL_INSTALL_PATH\", base_paths)\n        nccl_version = os.environ.get(\"NCCL_VERSION\", \"\")\n        result.update(_find_nccl_config(nccl_paths, nccl_version))\n\n    if \"tensorrt\" in libraries:\n        tensorrt_paths = _get_legacy_path(\"TENSORRT_INSTALL_PATH\", base_paths)\n        tensorrt_version = os.environ.get(\"TENSORRT_VERSION\", \"\")\n        result.update(_find_tensorrt_config(tensorrt_paths, tensorrt_version))\n\n    for k, v in result.items():\n        if k.endswith(\"_dir\") or k.endswith(\"_path\"):\n            result[k] = _normalize_path(v)\n\n    return result\n\n\ndef main():\n    try:\n        for key, value in sorted(find_cuda_config().items()):\n            print(\"%s: %s\" % (key, value))\n    except ConfigError as e:\n        sys.stderr.write(str(e) + 
'\\n')\n        sys.exit(1)\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "build_deps/remote_config/BUILD",
    "content": ""
  },
  {
    "path": "build_deps/remote_config/BUILD.tpl",
    "content": "# Each platform creates a constraint @<platform>//:platform_constraint that\n# is listed in its constraint_values; rule that want to select a specific\n# platform to run on can put @<platform>//:platform_constraing into their\n# exec_compatible_with attribute.\n# Toolchains can similarly be marked with target_compatible_with or\n# exec_compatible_with to bind them to this platform.\nconstraint_setting(\n    name = \"platform_setting\"\n)\n\nconstraint_value(\n    name = \"platform_constraint\",\n    constraint_setting = \":platform_setting\",\n    visibility = [\"//visibility:public\"],\n)\n\nplatform(\n    name = \"platform\",\n    visibility = [\"//visibility:public\"],\n    constraint_values = [\n        \"@platforms//cpu:%{cpu}\",\n        \"@platforms//os:%{platform}\",\n        \":platform_constraint\",\n    ],\n    exec_properties = %{exec_properties},\n)\n"
  },
  {
    "path": "build_deps/remote_config/common.bzl",
    "content": "\"\"\"Functions common across configure rules.\"\"\"\n\nBAZEL_SH = \"BAZEL_SH\"\nPYTHON_BIN_PATH = \"PYTHON_BIN_PATH\"\nPYTHON_LIB_PATH = \"PYTHON_LIB_PATH\"\nPYTHON_CONFIG_REPO = \"PYTHON_CONFIG_REPO\"\n\n\ndef auto_config_fail(msg):\n    \"\"\"Output failure message when auto configuration fails.\"\"\"\n    red = \"\\033[0;31m\"\n    no_color = \"\\033[0m\"\n    fail(\"%sConfiguration Error:%s %s\\n\" % (red, no_color, msg))\n\n\ndef which(repository_ctx, program_name, allow_failure=False):\n    \"\"\"Returns the full path to a program on the execution platform.\n\n    Args:\n      repository_ctx: the repository_ctx\n      program_name: name of the program on the PATH\n\n    Returns:\n      The full path to a program on the execution platform.\n    \"\"\"\n    out = execute(\n        repository_ctx,\n        [\"which\", program_name],\n        allow_failure=allow_failure,\n    ).stdout\n    if out != None:\n        out = out.replace(\"\\\\\", \"\\\\\\\\\").rstrip()\n    return out\n\n\ndef get_python_bin(repository_ctx):\n    \"\"\"Gets the python bin path.\n\n    Args:\n      repository_ctx: the repository_ctx\n\n    Returns:\n      The python bin path.\n    \"\"\"\n    python_bin = get_host_environ(repository_ctx, PYTHON_BIN_PATH)\n    if python_bin:\n        return python_bin\n\n    # First check for an explicit \"python3\"\n    python_bin = which(repository_ctx, \"python3\", True)\n    if python_bin:\n        return python_bin\n\n    # Some systems just call pythone3 \"python\"\n    python_bin = which(repository_ctx, \"python\", True)\n    if python_bin:\n        return python_bin\n\n    auto_config_fail(\n        \"Cannot find python in PATH, please make sure \" +\n        \"python is installed and add its directory in PATH, or --define \" +\n        \"%s='/something/else'.\\nPATH=%s\" % (\n            PYTHON_BIN_PATH,\n            get_environ(repository_ctx, \"PATH\"),\n        ))\n    return python_bin  # unreachable\n\n\ndef 
get_bash_bin(repository_ctx):\n    \"\"\"Gets the bash bin path.\n\n    Args:\n      repository_ctx: the repository_ctx\n\n    Returns:\n      The bash bin path.\n    \"\"\"\n    bash_bin = get_host_environ(repository_ctx, BAZEL_SH)\n    if bash_bin != None:\n        return bash_bin\n    bash_bin_path = which(repository_ctx, \"bash\")\n    if bash_bin_path == None:\n        auto_config_fail(\n            \"Cannot find bash in PATH, please make sure \" +\n            \"bash is installed and add its directory in PATH, or --define \" +\n            \"%s='/path/to/bash'.\\nPATH=%s\" % (\n                BAZEL_SH,\n                get_environ(repository_ctx, \"PATH\"),\n            ))\n    return bash_bin_path\n\n\ndef read_dir(repository_ctx, src_dir):\n    \"\"\"Returns a sorted list with all files in a directory.\n\n    Finds all files inside a directory, traversing subfolders and following\n    symlinks.\n\n    Args:\n      repository_ctx: the repository_ctx\n      src_dir: the directory to traverse\n\n    Returns:\n      A sorted list with all files in a directory.\n    \"\"\"\n    find_result = execute(\n        repository_ctx,\n        [\"find\", src_dir, \"-follow\", \"-type\", \"f\"],\n        allow_failure=True,\n    )\n    result = find_result.stdout\n    return sorted(result.splitlines())\n\n\ndef get_environ(repository_ctx, name, default_value=None):\n    \"\"\"Returns the value of an environment variable on the execution platform.\n\n    Args:\n      repository_ctx: the repository_ctx\n      name: the name of environment variable\n      default_value: the value to return if not set\n\n    Returns:\n      The value of the environment variable 'name' on the execution platform\n      or 'default_value' if it's not set.\n    \"\"\"\n    cmd = \"echo -n \\\"$%s\\\"\" % name\n    result = execute(\n        repository_ctx,\n        [get_bash_bin(repository_ctx), \"-c\", cmd],\n        allow_failure=True,\n    )\n    if len(result.stdout) == 0:\n        return 
default_value\n    return result.stdout\n\n\ndef get_host_environ(repository_ctx, name, default_value=None):\n    \"\"\"Returns the value of an environment variable on the host platform.\n\n    The host platform is the machine that Bazel runs on.\n\n    Args:\n      repository_ctx: the repository_ctx\n      name: the name of environment variable\n\n    Returns:\n      The value of the environment variable 'name' on the host platform.\n    \"\"\"\n    if name in repository_ctx.os.environ:\n        return repository_ctx.os.environ.get(name).strip()\n\n    if hasattr(repository_ctx.attr,\n               \"environ\") and name in repository_ctx.attr.environ:\n        return repository_ctx.attr.environ.get(name).strip()\n\n    return default_value\n\n\ndef get_cpu_value(repository_ctx):\n    \"\"\"Returns the name of the host operating system.\n\n    Args:\n      repository_ctx: The repository context.\n    Returns:\n      A string containing the name of the host operating system.\n    \"\"\"\n    result = raw_exec(repository_ctx, [\"uname\", \"-s\"])\n    return result.stdout.strip()\n\n\ndef execute(repository_ctx,\n            cmdline,\n            error_msg=None,\n            error_details=None,\n            allow_failure=False):\n    \"\"\"Executes an arbitrary shell command.\n\n    Args:\n      repository_ctx: the repository_ctx object\n      cmdline: list of strings, the command to execute\n      error_msg: string, a summary of the error if the command fails\n      error_details: string, details about the error or steps to fix it\n      allow_failure: bool, if True, an empty stdout result or output to stderr\n        is fine, otherwise either of these is an error\n    Returns:\n      The result of repository_ctx.execute(cmdline)\n    \"\"\"\n    result = raw_exec(repository_ctx, cmdline)\n    if (result.stderr or not result.stdout) and not allow_failure:\n        fail(\n            \"\\n\".join([\n                error_msg.strip()\n                if error_msg 
else \"Repository command failed\",\n                result.stderr.strip(),\n                error_details if error_details else \"\",\n            ]), )\n    return result\n\n\ndef raw_exec(repository_ctx, cmdline):\n    \"\"\"Executes a command via repository_ctx.execute() and returns the result.\n\n    This method is useful for debugging purposes. For example, to print all\n    commands executed as well as their return code.\n\n    Args:\n      repository_ctx: the repository_ctx\n      cmdline: the list of args\n\n    Returns:\n      The 'exec_result' of repository_ctx.execute().\n    \"\"\"\n    return repository_ctx.execute(cmdline)\n\n\ndef files_exist(repository_ctx, paths, bash_bin=None):\n    \"\"\"Checks which files in paths exists.\n\n    Args:\n      repository_ctx: the repository_ctx\n      paths: a list of paths\n      bash_bin: path to the bash interpreter\n\n    Returns:\n      Returns a list of Bool. True means that the path at the\n      same position in the paths list exists.\n    \"\"\"\n    if bash_bin == None:\n        bash_bin = get_bash_bin(repository_ctx)\n\n    cmd_tpl = \"[ -e \\\"%s\\\" ] && echo True || echo False\"\n    cmds = [cmd_tpl % path for path in paths]\n    cmd = \" ; \".join(cmds)\n\n    stdout = execute(repository_ctx, [bash_bin, \"-c\", cmd]).stdout.strip()\n    return [val == \"True\" for val in stdout.splitlines()]\n\n\ndef realpath(repository_ctx, path, bash_bin=None):\n    \"\"\"Returns the result of \"realpath path\".\n\n    Args:\n      repository_ctx: the repository_ctx\n      path: a path on the file system\n      bash_bin: path to the bash interpreter\n\n    Returns:\n      Returns the result of \"realpath path\"\n    \"\"\"\n    if bash_bin == None:\n        bash_bin = get_bash_bin(repository_ctx)\n\n    return execute(repository_ctx,\n                   [bash_bin, \"-c\", \"realpath \\\"%s\\\"\" % path]).stdout.strip()\n\n\ndef err_out(result):\n    \"\"\"Returns stderr if set, else stdout.\n\n    This function 
is a workaround for a bug in RBE where stderr is returned as stdout. Instead\n    of using result.stderr use err_out(result) instead.\n\n    Args:\n      result: the exec_result.\n\n    Returns:\n      The stderr if set, else stdout\n    \"\"\"\n    if len(result.stderr) == 0:\n        return result.stdout\n    return result.stderr\n\n\ndef config_repo_label(config_repo, target):\n    \"\"\"Construct a label from config_repo and target.\n\n    This function exists to ease the migration from preconfig to remote config. In preconfig\n    the *_CONFIG_REPO environ variables are set to packages in the main repo while in\n    remote config they will point to remote repositories.\n\n    Args:\n      config_repo: a remote repository or package.\n      target: a target\n    Returns:\n      A label constructed from config_repo and target.\n    \"\"\"\n    if config_repo.startswith(\"@\") and not config_repo.find(\"//\") > 0:\n        # remote config is being used.\n        return Label(config_repo + \"//\" + target)\n    elif target.startswith(\":\"):\n        return Label(config_repo + target)\n    else:\n        return Label(config_repo + \"/\" + target)\n"
  },
  {
    "path": "build_deps/remote_config/remote_platform_configure.bzl",
    "content": "\"\"\"Repository rule to create a platform for a docker image to be used with RBE.\"\"\"\n\n\ndef _remote_platform_configure_impl(repository_ctx):\n    platform = repository_ctx.attr.platform\n    if platform == \"local\":\n        os = repository_ctx.os.name.lower()\n        if os.startswith(\"mac os\"):\n            platform = \"osx\"\n        else:\n            platform = \"linux\"\n\n    cpu = \"x86_64\"\n    machine_type = repository_ctx.execute([\"bash\", \"-c\",\n                                           \"echo $MACHTYPE\"]).stdout\n    if (machine_type.startswith(\"ppc\") or machine_type.startswith(\"powerpc\")):\n        cpu = \"ppc\"\n    elif machine_type.startswith(\"s390x\"):\n        cpu = \"s390x\"\n    elif machine_type.startswith(\"aarch64\"):\n        cpu = \"aarch64\"\n    elif machine_type.startswith(\"arm64\"):\n        cpu = \"aarch64\"\n    elif machine_type.startswith(\"arm\"):\n        cpu = \"arm\"\n    elif machine_type.startswith(\"mips64\"):\n        cpu = \"mips64\"\n    elif machine_type.startswith(\"riscv64\"):\n        cpu = \"riscv64\"\n\n    exec_properties = repository_ctx.attr.platform_exec_properties\n\n    serialized_exec_properties = \"{\"\n    for k, v in exec_properties.items():\n        serialized_exec_properties += \"\\\"%s\\\" : \\\"%s\\\",\" % (k, v)\n    serialized_exec_properties += \"}\"\n\n    repository_ctx.template(\n        \"BUILD\",\n        Label(\"//remote_config:BUILD.tpl\"),\n        {\n            \"%{platform}\": platform,\n            \"%{exec_properties}\": serialized_exec_properties,\n            \"%{cpu}\": cpu,\n        },\n    )\n\n\nremote_platform_configure = repository_rule(\n    implementation=_remote_platform_configure_impl,\n    attrs={\n        \"platform_exec_properties\": attr.string_dict(mandatory=True),\n        \"platform\": attr.string(default=\"linux\", values=[\"linux\", \"local\"]),\n    },\n)\n"
  },
  {
    "path": "cmake/modules/ClangFormat.cmake",
    "content": "# Copyright Tomas Zeman 2018.\n# Distributed under the Boost Software License, Version 1.0.\n# (See accompanying file LICENSE_1_0.txt or copy at\n# http://www.boost.org/LICENSE_1_0.txt)\n\nfunction(clangformat_setup clangformat_srcs)\n\n  if(NOT CLANGFORMAT_EXECUTABLE)\n    set(CLANGFORMAT_EXECUTABLE clang-format)\n  endif()\n\n  if(NOT EXISTS ${CLANGFORMAT_EXECUTABLE})\n    find_program(clangformat_executable_tmp ${CLANGFORMAT_EXECUTABLE})\n    if(clangformat_executable_tmp)\n      set(CLANGFORMAT_EXECUTABLE ${clangformat_executable_tmp})\n      unset(clangformat_executable_tmp)\n    else()\n      message(FATAL_ERROR \"ClangFormat: ${CLANGFORMAT_EXECUTABLE} not found! Aborting\")\n    endif()\n  endif()\n\n  foreach(clangformat_src ${clangformat_srcs})\n    get_filename_component(clangformat_src ${clangformat_src} ABSOLUTE)\n    list(APPEND clangformat_srcs_tmp ${clangformat_src})\n  endforeach()\n  set(clangformat_srcs \"${clangformat_srcs_tmp}\")\n  unset(clangformat_srcs_tmp)\n\n  add_custom_target(${PROJECT_NAME}_clangformat ALL\n                    COMMAND ${CLANGFORMAT_EXECUTABLE}\n                            -style=file\n                            -i\n                            ${clangformat_srcs}\n                    COMMENT \"Formating with ${CLANGFORMAT_EXECUTABLE} ...\")\n\nendfunction()\n"
  },
  {
    "path": "docs/Makefile",
    "content": "# Minimal makefile for Sphinx documentation\n#\n\n# You can set these variables from the command line, and also\n# from the environment for the first two.\nSPHINXOPTS    ?=\nSPHINXBUILD   ?= sphinx-build\nSOURCEDIR     = source\nBUILDDIR      = build\n\n# Put it first so that \"make\" without argument is like \"make help\".\nhelp:\n\t@$(SPHINXBUILD) -M help \"$(SOURCEDIR)\" \"$(BUILDDIR)\" $(SPHINXOPTS) $(O)\n\n.PHONY: help Makefile clean\n\nclean:\n\trm -rf source/api source/README.md source/CONTRIBUTING.md\n\t@$(SPHINXBUILD) -M clean \"$(SOURCEDIR)\" \"$(BUILDDIR)\" $(SPHINXOPTS) $(O)\n\n# Catch-all target: route all unknown targets to Sphinx using the new\n# \"make mode\" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).\n%: Makefile\n\t@$(SPHINXBUILD) -M $@ \"$(SOURCEDIR)\" \"$(BUILDDIR)\" $(SPHINXOPTS) $(O)\n"
  },
  {
    "path": "docs/README.md",
    "content": "# Documentation\n\nThis folder contains the scripts necessary to build the documentation for HierarchicalKV.\nYou can view the generated [HierarchicalKV documentation](https://nvidia-merlin.github.io/HierarchicalKV/master/README.html).\n\n## Contributing to Docs\n\nFollow the instructions below to be able to build the docs.\n\n1. Install required documentation tools and extensions:\n\n```shell\nsudo apt-get install doxygen\npip install -r docs/requirements-doc.txt\n```\n\n2. Build the documentation:\n\n`make -C docs clean html`\n\nThe preceding command runs Sphinx in your shell and outputs to build/html/index.html.\n\nThe build process for HierarchicalKV is unique among the Merlin projects because it\nuses Doxygen, Breathe, and Exhale to create API documentation from the C++ source.\n\n## Preview the changes\n\nView docs web page by opening the HTML in your browser.\nRun the following command from the root of the repository:\n\n```bash\npython -m http.server 8000 --directory docs/build/html\n```\n\nAfterward, open a web browser and access `https://localhost:8000`.\n\nCheck that your edits formatted correctly and read well.\n\n## Decisions\n\n### Rebuild the documentation on GitHub Pages\n\nThe `.github/workflows/docs-sched-rebuild.yaml` file rebuilds the documentation\nfor the `master` branch and the six most recent tags.  The job runs daily,\nbut you can trigger it manually by going to the following URL and clicking\nthe *Run workflow* button.\n\n<https://github.com/NVIDIA-Merlin/HierarchicalKV/actions/workflows/docs-sched-rebuild.yaml>\n\n### Source management: README and index files\n\n* To preserve Sphinx's expectation that all source files are child files and directories\n  of the `docs/source` directory, other content, such as the `README.md` file is\n  copied to the source directory. 
You can determine which directories and files are copied by\n  viewing `docs/source/conf.py` and looking for the `copydirs_additional_dirs` list.\n  Directories are specified relative to the Sphinx source directory, `docs/source`.\n\n* One consequence of the preceding bullet is that any change to the original files,\n  such as adding or removing a topic, requires a similar change to the `docs/source/toc.yaml`\n  file.  Updating the `docs/source/toc.yaml` file is not automatic.\n\n* Because the GitHub browsing expectation is that a `README.md` file is rendered when you\n  browse a directory, when a directory is copied, the `README.md` file is renamed to\n  `index.md` to meet the HTML web server expectation of locating an `index.html` file\n  in a directory.\n\n### Adding links\n\nTIP: When adding a link to a method or any heading that has underscores in it, repeat\nthe underscores in the link even though they are converted to hyphens in the HTML.\n\nRefer to the following examples:\n\n* `../somefile.md#2heading-with-spaces-and_underscore_separated_words-too`\n* `./otherfile.md#save_params_to_files-method`\n\n#### Docs-to-docs links\n\nThere is no concern for the GitHub browsing experience for files in the `docs/source/` directory.\nYou can use a relative path for the link.  For example--both the `README.md` file and the\n`CONTRIBUTING.md` file are copied to `docs/source`. Because they are are both in the same\ndirectory, you could add a link to a heading in the `README.md` file like this:\n\n```markdown\nTo build HierarchicalKV from scratch, refer to\n[How to Build](./README.md#how-to-build) in the `README` file.\n```\n\nWhen Sphinx renders the link, the `.md` file suffix is replaced with `.html`.\n\n#### Docs-to-repository links\n\nSome files that we publish as docs, such as the `CONTRIBUTING.md` file, refer readers to files\nthat are not published as docs. 
For example, we currently do not publish the `STYLE_GUIDE.md`\nfile.\n\nTo refer a reader to the `STYLE_GUIDE.md`, a README, or program, state that the link is to\nthe repository:\n\n```markdown\n## Coding Style\nRefer to the [Style Guide](http://github.com/NVIDIA-Merlin/HierarchicalKV/STYLE_GUIDE.md)\nin the GitHub repository for more details.\n```\n\nThe idea is to let a reader know that following the link&mdash;whether from an HTML docs page or\nfrom browsing GitHub&mdash;results in viewing our repository on GitHub.\n\n"
  },
  {
    "path": "docs/make.bat",
    "content": "@ECHO OFF\n\npushd %~dp0\n\nREM Command file for Sphinx documentation\n\nif \"%SPHINXBUILD%\" == \"\" (\n\tset SPHINXBUILD=sphinx-build\n)\nset SOURCEDIR=source\nset BUILDDIR=build\n\nif \"%1\" == \"\" goto help\n\n%SPHINXBUILD% >NUL 2>NUL\nif errorlevel 9009 (\n\techo.\n\techo.The 'sphinx-build' command was not found. Make sure you have Sphinx\n\techo.installed, then set the SPHINXBUILD environment variable to point\n\techo.to the full path of the 'sphinx-build' executable. Alternatively you\n\techo.may add the Sphinx directory to PATH.\n\techo.\n\techo.If you don't have Sphinx installed, grab it from\n\techo.http://sphinx-doc.org/\n\texit /b 1\n)\n\n%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%\ngoto end\n\n:help\n%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%\n\n:end\npopd\n"
  },
  {
    "path": "docs/requirements-doc.txt",
    "content": "# packages necessary to run tests and push PRs\n# assumes requirements for nvtabular logic are already installed\n\nwheel\n\n# docs\nSphinx<3.6\njinja2<3.1\nmarkupsafe==2.0.1\nsphinx-multiversion@git+https://github.com/mikemckiernan/sphinx-multiversion.git\nsphinxcontrib-copydirs@git+https://github.com/mikemckiernan/sphinxcontrib-copydirs.git\nsphinx-external-toc<0.4\nsphinx_rtd_theme\nnatsort<8.2\nmyst-nb\nmarkdown-it-py\nlinkify-it-py\n\n# C++\nexhale<0.4\n"
  },
  {
    "path": "docs/source/_static/.gitkeep",
    "content": ""
  },
  {
    "path": "docs/source/_static/css/banner.css",
    "content": ".wy-nav-content {\n    margin: 0;\n    background: #fcfcfc;\n    padding-top: 40px;\n}\n\n.wy-side-nav-search {\n    display: block;\n    width: 300px;\n    padding: .809em;\n    padding-top: 0.809em;\n    margin-bottom: .809em;\n    z-index: 200;\n    background-color: #2980b9;\n    text-align: center;\n    color: #fcfcfc;\n    padding-top: 40px;\n}\n\ndiv.banner {\n    position: fixed;\n    top: 10px;\n    left: 20px;\n    margin: 0;\n    z-index: 1000;\n    width: 1050px;\n    text-align: center;\n}\n\np.banner {\n  border-radius: 4px;\n  color: #004831;\n  background: #76b900;\n}"
  },
  {
    "path": "docs/source/_static/css/custom.css",
    "content": "dl.cpp > dt > span.pre { padding-right: 2px; }\n\n/* dl.cpp > dt > a > span.pre { padding-right: 2px; } */\n\ndl > dt > em > span.pre { padding-right: 0px; padding-left: 2px; }\n\ndl > dt > code.sig-name > span.pre { padding-left: 2px; }\n\nfooter div p {\n  font-size: 80%;\n}\n\nfooter div p a {\n  color: var(--small-font-color);\n}\n\nfooter div p a:hover {\n  color: var(--small-font-color);\n}\n"
  },
  {
    "path": "docs/source/_templates/footer.html",
    "content": "{% extends '!footer.html' %}\n{% block contentinfo %}\n{{ super() }}\n<p>\n<a href=\"https://www.nvidia.com/en-us/about-nvidia/privacy-policy/\" target=\"_blank\">Privacy Policy</a> |\n<a href=\"https://www.nvidia.com/en-us/about-nvidia/privacy-center/\" target=\"_blank\">Manage My Privacy</a> |\n<a href=\"https://www.nvidia.com/en-us/preferences/start/\" target=\"_blank\">Do Not Sell or Share My Data</a> |\n<a href=\"https://www.nvidia.com/en-us/about-nvidia/terms-of-service/\" target=\"_blank\">Terms of Service</a> |\n<a href=\"https://www.nvidia.com/en-us/about-nvidia/accessibility/\" target=\"_blank\">Accessibility</a> |\n<a href=\"https://www.nvidia.com/en-us/about-nvidia/company-policies/\" target=\"_blank\">Corporate Policies</a> |\n<a href=\"https://www.nvidia.com/en-us/product-security/\" target=\"_blank\">Product Security</a> |\n<a href=\"https://www.nvidia.com/en-us/contact/\" target=\"_blank\">Contact</a>\n</p>\n{% endblock %}"
  },
  {
    "path": "docs/source/_templates/versions.html",
    "content": "{%- if current_version %}\n<div class=\"rst-versions\" data-toggle=\"rst-versions\" role=\"note\" aria-label=\"versions\">\n  <span class=\"rst-current-version\" data-toggle=\"rst-current-version\">\n    <span class=\"fa fa-book\"> Other Versions</span>\n    v: {{ current_version.name }}\n    <span class=\"fa fa-caret-down\"></span>\n  </span>\n  <div class=\"rst-other-versions\">\n    {%- if versions.tags %}\n    <dl>\n      <dt>Tags</dt>\n      {%- for item in versions.tags %}\n      <dd><a href=\"{{ item.url }}\">{{ item.name }}</a></dd>\n      {%- endfor %}\n    </dl>\n    {%- endif %}\n    {%- if versions.branches %}\n    <dl>\n      <dt>Branches</dt>\n      {%- for item in versions.branches %}\n      <dd><a href=\"{{ item.url }}\">{{ item.name }}</a></dd>\n      {%- endfor %}\n    </dl>\n    {%- endif %}\n  </div>\n</div>\n{%- endif %}\n"
  },
  {
    "path": "docs/source/conf.py",
    "content": "\"\"\"\n Copyright (c) 2021, NVIDIA CORPORATION.\n\n Licensed under the Apache License, Version 2.0 (the \"License\");\n you may not use this file except in compliance with the License.\n You may obtain a copy of the License at\n\n     http://www.apache.org/licenses/LICENSE-2.0\n\n Unless required by applicable law or agreed to in writing, software\n distributed under the License is distributed on an \"AS IS\" BASIS,\n WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n See the License for the specific language governing permissions and\n limitations under the License.\n\"\"\"\n\n# Configuration file for the Sphinx documentation builder.\n#\n# This file only contains a selection of the most common options. For a full\n# list see the documentation:\n# https://www.sphinx-doc.org/en/master/usage/configuration.html\n\n# -- Path setup --------------------------------------------------------------\n\n# If extensions (or modules to document with autodoc) are in another directory,\n# add these directories to sys.path here. If the directory is relative to the\n# documentation root, use os.path.abspath to make it absolute, like shown here.\n#\nimport os\nimport subprocess\nimport sys\n\nfrom datetime import datetime\nfrom natsort import natsorted\n\nsys.path.insert(0, os.path.abspath(\"../..\"))\n\nrepodir = os.path.abspath(os.path.join(__file__, r\"../../..\"))\ngitdir = os.path.join(repodir, r\".git\")\n\n# -- Project information -----------------------------------------------------\n\nyear_range = \"2022\"\nyear_now = str(datetime.now().year)\nif year_range != year_now:\n    year_range = year_range + chr(8211) + year_now\n\nproject = 'Merlin Key-Value Storage'\ncopyright = year_range + \", NVIDIA\"\nauthor = 'NVIDIA'\n\n# -- General configuration ---------------------------------------------------\n\n# Add any Sphinx extension module names here, as strings. 
They can be\n# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom\n# ones.\nextensions = [\n    \"myst_nb\",\n    \"sphinx_external_toc\",\n    \"sphinx_rtd_theme\",\n    \"sphinx.ext.autodoc\",\n    \"sphinx.ext.autosummary\",\n    \"sphinx.ext.coverage\",\n    \"sphinx.ext.githubpages\",\n    \"sphinx.ext.napoleon\",\n    \"sphinx.ext.viewcode\",\n    \"sphinx.ext.intersphinx\",\n    \"sphinx_multiversion\",\n    \"sphinxcontrib.copydirs\",\n    \"breathe\",\n    \"exhale\",\n]\n\n# MyST configuration settings\nexternal_toc_path = \"toc.yaml\"\nmyst_enable_extensions = [\n    \"deflist\",\n    \"html_image\",\n    \"linkify\",\n    \"replacements\",\n    \"tasklist\",\n    \"dollarmath\",\n]\nmyst_linkify_fuzzy_links = False\nmyst_heading_anchors = 4\nnb_execution_mode = \"off\"\n\n# Some documents are RST and include `.. toctree::` directives.\nsuppress_warnings = [\"etoc.toctree\", \"myst.header\", \"misc.highlighting_failure\"]\n\n# Add any paths that contain templates here, relative to this directory.\ntemplates_path = ['_templates']\n\n# List of patterns, relative to source directory, that match files and\n# directories to ignore when looking for source files.\n# This pattern also affects html_static_path and html_extra_path.\nexclude_patterns = []\n\n\n# -- Options for HTML output -------------------------------------------------\n\n# The theme to use for HTML and HTML Help pages.  
See the documentation for\n# a list of builtin themes.\n#\nhtml_theme = \"sphinx_rtd_theme\"\nhtml_theme_options = {\n    \"navigation_depth\": 2,\n    \"analytics_id\": \"G-NVJ1Y1YJHK\",\n}\nhtml_show_sourcelink = False\nhtml_show_sphinx = False\n\n# Whitelist pattern for tags (set to None to ignore all tags)\n# Determine if Sphinx is reading conf.py from the checked out\n# repo (a Git repo) vs SMV reading conf.py from an archive of the repo\n# at a commit (not a Git repo).\nif os.path.exists(gitdir):\n    tag_refs = (\n        subprocess.check_output([\"git\", \"tag\", \"-l\", \"v*\"]).decode(\"utf-8\").split()\n    )\n    tag_refs = natsorted(tag_refs)[-6:]\n    smv_tag_whitelist = r\"^(\" + r\"|\".join(tag_refs) + r\")$\"\nelse:\n    # SMV is reading conf.py from a Git archive of the repo at a specific commit.\n    smv_tag_whitelist = r\"^v.*$\"\n\n# Only include main branch for now\nsmv_branch_whitelist = \"^master$\"\n\nsmv_refs_override_suffix = \"-docs\"\n\nhtml_sidebars = {\"**\": [\"versions.html\"]}\nhtml_baseurl = \"https://nvidia-merlin.github.io/HierarchicalKV/master\"\n\nhtml_static_path = [ '_static' ]\nhtml_css_files = [ \"css/custom.css\", \"css/banner.css\" ]\n\n# Add any paths that contain custom static files (such as style sheets) here,\n# relative to this directory. 
They are copied after the builtin static files,\n# so a file named \"default.css\" will overwrite the builtin \"default.css\".\n\nsource_suffix = [\".rst\", \".md\"]\n\nbreathe_projects = {\n    \"HierarchicalKV\": \"/tmp/doxygen/xml\"\n}\nbreathe_default_project = \"HierarchicalKV\"\n\nexhale_args = {\n    \"containmentFolder\": \"./api\",\n    \"rootFileName\": \"index.rst\",\n    \"doxygenStripFromPath\": \"../../include\",\n    \"rootFileTitle\": \"HierarchicalKV C++ API Documentation\",\n    \"fullApiSubSectionTitle\": \"Complete HierarchicalKV API\",\n    \"createTreeView\": False,\n    \"exhaleExecutesDoxygen\": True,\n    \"exhaleDoxygenStdin\": \"\"\"\n      FILE_PATTERNS = *.h *.cuh\n      RECURSIVE = NO\n      EXTENSION_MAPPING = cuh=C++\n      HIDE_UNDOC_CLASSES = YES\n      HIDE_FRIEND_COMPOUNDS = YES\n      SORT_MEMBERS_CTORS_1ST = YES\n      SHOW_USED_FILES = NO\n      SHOW_FILES = NO\n      SHOW_NAMESPACES = NO\n      INPUT = ../../include\n      INPUT_ENCODING = UTF-8\n      \"\"\",\n}\n\ncopydirs_additional_dirs = [\n    \"../../CONTRIBUTING.md\",\n    \"../../README.md\",\n]\ncopydirs_file_rename = {\n    \"README.md\": \"index.md\",\n}\n"
  },
  {
    "path": "docs/source/index.rst",
    "content": "Merlin Key-Value Storage\n========================\n\nMerlin Key-Value Storage is an open source library that provides hierarchical key-value storage using on-GPU high-bandwidth memory (HBM) and host RAM.\n\nFor more information, see the `Introduction <README.html>`_.\n\nRelated Resources\n-----------------\n\nMerlin Key-Value Storage GitHub Repository\n  `<https://github.com/NVIDIA-Merlin/HierarchicalKV>`_\n\nAbout Merlin\n  Merlin is the overarching project that brings together the Merlin projects.\n  See the `documentation <https://nvidia-merlin.github.io/Merlin/main/README.html>`_\n  or the `repository <https://github.com/NVIDIA-Merlin/Merlin>`_ on GitHub.\n\nDeveloper website for Merlin\n  More information about Merlin is available at our developer website:\n  `<https://developer.nvidia.com/nvidia-merlin>`_.\n"
  },
  {
    "path": "docs/source/toc.yaml",
    "content": "root: index\nsubtrees:\n  - caption: Contents\n    entries:\n      - file: README.md\n        title: Introduction\n      - file: api/index.rst\n        title: API Documentation\n      - file: CONTRIBUTING.md\n        title: Contributing to HierarchicalKV\n\n\n# The multi-modal data example uses several notebooks to demonstrate how to use multi-modal data (text and images)\n# to provide movie recommendations based on the MovieLens 25M dataset.\n\n# .. toctree::\n#    :maxdepth: 1\n\n"
  },
  {
    "path": "include/BUILD",
    "content": "load(\"@local_config_cuda//cuda:build_defs.bzl\", \"cuda_cc_library\")\n\ncuda_cc_library(\n    name = \"merlin_localfile\",\n    hdrs = [\n        \"merlin_localfile.hpp\",\n    ],\n    visibility = [\n        \"//visibility:public\",\n    ],\n    deps = [\n        \"//include/merlin\",\n        \"@local_config_cuda//cuda\",\n    ],\n)\n\ncuda_cc_library(\n    name = \"merlin_hashtable\",\n    hdrs = [\n        \"merlin_hashtable.cuh\",\n    ],\n    visibility = [\n        \"//visibility:public\",\n    ],\n    deps = [\n        \"//include/merlin\",\n        \"@local_config_cuda//cuda\",\n    ],\n)\n"
  },
  {
    "path": "include/merlin/BUILD",
    "content": "load(\"@local_config_cuda//cuda:build_defs.bzl\", \"cuda_cc_library\")\n\ncuda_cc_library(\n    name = \"types_and_utils\",\n    srcs = [\n    ],\n    hdrs = [\n        \"types.cuh\",\n        \"utils.cuh\",\n    ],\n    visibility = [\n        \"//visibility:public\",\n    ],\n    deps = [\n        \"@local_config_cuda//cuda\",\n    ],\n)\n\ncuda_cc_library(\n    name = \"merlin\",\n    srcs = [\n    ],\n    hdrs = [\n        \"allocator.cuh\",\n        \"array_kernels.cuh\",\n        \"core_kernels.cuh\",\n        \"debug.hpp\",\n        \"flexible_buffer.cuh\",\n        \"group_lock.cuh\",\n        \"memory_pool.cuh\",\n        \"optimizers.cuh\",\n    ],\n    visibility = [\n        \"//visibility:public\",\n    ],\n    deps = [\n        \"//include/merlin:types_and_utils\",\n        \"//include/merlin/core_kernels\",\n        \"@local_config_cuda//cuda\",\n    ],\n)\n"
  },
  {
    "path": "include/merlin/allocator.cuh",
    "content": "/*\n * Copyright (c) 2022, NVIDIA CORPORATION.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n#pragma once\n\n#include <stdlib.h>\n#include <thrust/device_malloc_allocator.h>\n#include \"debug.hpp\"\n#include \"utils.cuh\"\n\nnamespace nv {\nnamespace merlin {\n\nenum MemoryType {\n  Device,   // HBM\n  Pinned,   // Pinned Host Memory\n  Host,     // Host Memory\n  Managed,  // Pageable Host Memory(Not required)\n};\n\n/* This abstract class defines the allocator APIs required by HKV.\n   Any of the customized allocators should inherit from it.\n */\nclass BaseAllocator {\n public:\n  BaseAllocator(const BaseAllocator&) = delete;\n  BaseAllocator(BaseAllocator&&) = delete;\n\n  BaseAllocator& operator=(const BaseAllocator&) = delete;\n  BaseAllocator& operator=(BaseAllocator&&) = delete;\n\n  BaseAllocator() = default;\n  virtual ~BaseAllocator() = default;\n\n  virtual void alloc(const MemoryType type, void** ptr, size_t size,\n                     unsigned int pinned_flags = cudaHostAllocDefault) = 0;\n\n  virtual void alloc_async(const MemoryType type, void** ptr, size_t size,\n                           cudaStream_t stream) = 0;\n\n  virtual void free(const MemoryType type, void* ptr) = 0;\n\n  virtual void free_async(const MemoryType type, void* ptr,\n                          cudaStream_t stream) = 0;\n};\n\nclass DefaultAllocator : public virtual BaseAllocator {\n public:\n  DefaultAllocator() {};\n  
~DefaultAllocator() override {};\n\n  void alloc(const MemoryType type, void** ptr, size_t size,\n             unsigned int pinned_flags = cudaHostAllocDefault) override {\n    switch (type) {\n      case MemoryType::Device:\n        CUDA_CHECK(cudaMalloc(ptr, size));\n        break;\n      case MemoryType::Pinned:\n        CUDA_CHECK(cudaMallocHost(ptr, size, pinned_flags));\n        break;\n      case MemoryType::Host:\n        *ptr = std::malloc(size);\n        break;\n    }\n    return;\n  }\n\n  void alloc_async(const MemoryType type, void** ptr, size_t size,\n                   cudaStream_t stream) override {\n    if (type == MemoryType::Device) {\n      CUDA_CHECK(cudaMallocAsync(ptr, size, stream));\n    } else {\n      MERLIN_CHECK(false,\n                   \"[DefaultAllocator] alloc_async is only support for \"\n                   \"MemoryType::Device!\");\n    }\n    return;\n  }\n\n  void free(const MemoryType type, void* ptr) override {\n    if (ptr == nullptr) {\n      return;\n    }\n    switch (type) {\n      case MemoryType::Pinned:\n        CUDA_CHECK(cudaFreeHost(ptr));\n        break;\n      case MemoryType::Device:\n        CUDA_CHECK(cudaFree(ptr));\n        break;\n      case MemoryType::Host:\n        std::free(ptr);\n        break;\n    }\n    return;\n  }\n\n  void free_async(const MemoryType type, void* ptr,\n                  cudaStream_t stream) override {\n    if (ptr == nullptr) {\n      return;\n    }\n\n    if (type == MemoryType::Device) {\n      CUDA_CHECK(cudaFreeAsync(ptr, stream));\n    } else {\n      MERLIN_CHECK(false,\n                   \"[DefaultAllocator] free_async is only support for \"\n                   \"MemoryType::Device!\");\n    }\n  }\n};\n\ntemplate <typename T>\nstruct ThrustAllocator : thrust::device_malloc_allocator<T> {\n public:\n  typedef thrust::device_malloc_allocator<T> super_t;\n  typedef typename super_t::pointer pointer;\n  typedef typename super_t::size_type size_type;\n\n public:\n  pointer 
allocate(size_type n) {\n    void* ptr = nullptr;\n    MERLIN_CHECK(\n        allocator_ != nullptr,\n        \"[ThrustAllocator] set_allocator should be called in advance!\");\n    allocator_->alloc(MemoryType::Device, &ptr, sizeof(T) * n);\n    return pointer(reinterpret_cast<T*>(ptr));\n  }\n\n  void deallocate(pointer p, size_type n) {\n    MERLIN_CHECK(\n        allocator_ != nullptr,\n        \"[ThrustAllocator] set_allocator should be called in advance!\");\n    allocator_->free(MemoryType::Device, reinterpret_cast<void*>(p.get()));\n  }\n\n  void set_allocator(BaseAllocator* allocator) { allocator_ = allocator; }\n\n public:\n  BaseAllocator* allocator_ = nullptr;\n};\n\n}  // namespace merlin\n}  // namespace nv\n"
  },
  {
    "path": "include/merlin/array_kernels.cuh",
    "content": "/*\n * Copyright (c) 2023, NVIDIA CORPORATION.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http:///www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n#pragma once\n\n#include <cooperative_groups.h>\n#include \"cuda_runtime.h\"\n#include \"thrust/device_vector.h\"\n#include \"thrust/execution_policy.h\"\n#include \"thrust/scan.h\"\n#include \"types.cuh\"\n#include \"utils.cuh\"\n\nnamespace nv {\nnamespace merlin {\n\ntemplate <typename K>\n__global__ void keys_not_empty(const K* keys, bool* masks, size_t n) {\n  int tid = blockIdx.x * blockDim.x + threadIdx.x;\n  if (tid < n) {\n    masks[tid] = keys[tid] != EMPTY_KEY;\n  }\n}\n\ntemplate <typename Tidx, int TILE_SIZE = 8>\n__global__ void gpu_cell_count(const bool* masks, Tidx* offsets, size_t n,\n                               size_t* n_existed) {\n  int tid = blockIdx.x * blockDim.x + threadIdx.x;\n  auto g = cg::tiled_partition<TILE_SIZE>(cg::this_thread_block());\n  int rank = g.thread_rank();\n  bool is_existed = false;\n  if (tid < n) {\n    if (masks[tid]) {\n      is_existed = true;\n    }\n  }\n  unsigned int vote = g.ballot(is_existed);\n  int g_ones = __popc((int)vote);\n  if (rank == 0 && tid < n) {\n    offsets[tid / TILE_SIZE] = static_cast<Tidx>(g_ones);\n    atomicAdd(static_cast<uint64_t*>(n_existed), static_cast<uint64_t>(g_ones));\n  }\n}\n\ntemplate <typename K, typename V, typename S, typename Tidx, int TILE_SIZE = 8>\n__global__ void gpu_select_kvm_kernel(const bool* masks, size_t n,\n                      
                const Tidx* offsets, K* __restrict keys,\n                                      V* __restrict values,\n                                      S* __restrict scores, const size_t dim) {\n  int tid = blockIdx.x * blockDim.x + threadIdx.x;\n  auto g = cg::tiled_partition<TILE_SIZE>(cg::this_thread_block());\n  int rank = g.thread_rank();\n\n  bool is_existed = false;\n  if (tid < n) {\n    if (masks[tid]) {\n      is_existed = true;\n    }\n  }\n  unsigned int vote = g.ballot(is_existed);\n  unsigned int r_vote = __brev(vote) >> (32 - TILE_SIZE);\n  K empty_key = (K)EMPTY_KEY;\n\n  if (tid < n) {\n    r_vote = r_vote >> (TILE_SIZE - rank - 1);\n    if (masks[tid]) {\n      int prefix_n = __popc(r_vote) - 1;\n      Tidx bias = offsets[tid / TILE_SIZE] + static_cast<Tidx>(prefix_n);\n\n      if (bias == tid) return;\n\n      K target_key = 0;\n      AtomicKey<K>* atomic_key = reinterpret_cast<AtomicKey<K>*>(keys) + bias;\n      while (target_key != empty_key) {\n        target_key = empty_key;\n        atomic_key->compare_exchange_weak(target_key, keys[tid],\n                                          cuda::std::memory_order_relaxed,\n                                          cuda::std::memory_order_relaxed);\n      }\n      if (scores) scores[bias] = scores[tid];\n      for (size_t j = 0; j < dim; j++) {\n        values[dim * bias + j] = values[dim * tid + j];\n      }\n      atomic_key = reinterpret_cast<AtomicKey<K>*>(keys) + tid;\n      atomic_key->store(empty_key, cuda::std::memory_order_relaxed);\n    }\n  }\n}\n\ntemplate <typename K, typename V, typename S, typename Tidx, int TILE_SIZE = 8>\nvoid gpu_boolean_mask(size_t grid_size, size_t block_size, const bool* masks,\n                      size_t n, size_t* n_evicted, Tidx* offsets,\n                      K* __restrict keys, V* __restrict values,\n                      S* __restrict scores, size_t dim, cudaStream_t stream) {\n  size_t n_offsets = (n + TILE_SIZE - 1) / TILE_SIZE;\n  
gpu_cell_count<Tidx, TILE_SIZE>\n      <<<grid_size, block_size, 0, stream>>>(masks, offsets, n, n_evicted);\n#if THRUST_VERSION >= 101600\n  auto policy = thrust::cuda::par_nosync.on(stream);\n#else\n  auto policy = thrust::cuda::par.on(stream);\n#endif\n  thrust::device_ptr<Tidx> d_src(offsets);\n  thrust::device_ptr<Tidx> d_dest(offsets);\n  thrust::exclusive_scan(policy, d_src, d_src + n_offsets, d_dest);\n  gpu_select_kvm_kernel<K, V, S, Tidx, TILE_SIZE>\n      <<<grid_size, block_size, 0, stream>>>(masks, n, offsets, keys, values,\n                                             scores, dim);\n}\n\n}  // namespace merlin\n}  // namespace nv\n"
  },
  {
    "path": "include/merlin/core_kernels/BUILD",
    "content": "load(\"@local_config_cuda//cuda:build_defs.bzl\", \"cuda_cc_library\")\n\ncuda_cc_library(\n    name = \"core_kernels\",\n    srcs = [],\n    hdrs = glob([\n        \"**/*.cuh\",\n    ]),\n    visibility = [\n        \"//visibility:public\",\n    ],\n    deps = [\n        \"//include/merlin:types_and_utils\",\n        \"@local_config_cuda//cuda\",\n    ],\n)\n"
  },
  {
    "path": "include/merlin/core_kernels/accum_or_assign.cuh",
    "content": "/*\n * Copyright (c) 2023, NVIDIA CORPORATION.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http:///www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n#pragma once\n\n#include \"kernel_utils.cuh\"\n\nnamespace nv {\nnamespace merlin {\n\ntemplate <class V, uint32_t TILE_SIZE = 4>\n__device__ __forceinline__ void accum_or_assign_vector(\n    cg::thread_block_tile<TILE_SIZE> const& g, const V* delta_or_val, V* dst,\n    const bool is_accum, const size_t dim) {\n  for (auto i = g.thread_rank(); i < dim; i += g.size()) {\n    if (is_accum) {\n      dst[i] += delta_or_val[i];\n    } else {\n      dst[i] = delta_or_val[i];\n    }\n  }\n}\n\n/* Write the values of delta_or_val into the table. If the key[i] is already in\n   the table indicted be @exists[i], a @delta_or_val[i] will be added to the the\n   existing value. 
If the key does not exist, the value @delta_or_val[i] will be\n   assigned to the address @dst[i].\n\n   `delta_or_val`: will be treated as val and accumulating should be executed.\n   `dst`: A pointer of pointer to V which should be on HBM,\n          but each value (a pointer of V) could point to a\n          memory on HBM or HMEM.\n   `existed`: If the keys existed before this kernel is executed.\n   `status`: The existence status for each key when the kernel is being\n   executed.\n\n   `N`: number of vectors needed to be written.\n*/\ntemplate <class K, class V, class S>\n__global__ void write_with_accum_kernel(const V* __restrict delta_or_val,\n                                        V** __restrict dst,\n                                        const bool* __restrict existed,\n                                        const bool* __restrict status,\n                                        const int* __restrict src_offset,\n                                        const size_t dim, size_t N) {\n  size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x;\n\n  for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) {\n    int vec_index = int(t / dim);\n    int dim_index = t % dim;\n\n    if (dst[vec_index] != nullptr &&\n        existed[src_offset[vec_index]] == status[src_offset[vec_index]]) {\n      if (status[src_offset[vec_index]]) {\n        dst[vec_index][dim_index] +=\n            delta_or_val[src_offset[vec_index] * dim + dim_index];\n      } else {\n        dst[vec_index][dim_index] =\n            delta_or_val[src_offset[vec_index] * dim + dim_index];\n      }\n    }\n  }\n}\n\n/*\n * update with IO operation. 
This kernel is\n * usually used for the pure HBM mode for better performance.\n */\ntemplate <class K, class V, class S, int Strategy, uint32_t TILE_SIZE = 4>\n__global__ void accum_or_assign_kernel_with_io(\n    const Table<K, V, S>* __restrict table, const size_t bucket_max_size,\n    const size_t buckets_num, const size_t dim, const K* __restrict keys,\n    const V* __restrict value_or_deltas, const S* __restrict scores,\n    const bool* __restrict accum_or_assigns, const S global_epoch,\n    const size_t N) {\n  auto g = cg::tiled_partition<TILE_SIZE>(cg::this_thread_block());\n  int* buckets_size = table->buckets_size;\n\n  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;\n\n  for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N;\n       t += blockDim.x * gridDim.x) {\n    int key_pos = -1;\n    size_t key_idx = t / TILE_SIZE;\n\n    const K insert_key = keys[key_idx];\n\n    if (IS_RESERVED_KEY<K>(insert_key)) continue;\n\n    const S insert_score =\n        ScoreFunctor::desired_when_missed(scores, key_idx, global_epoch);\n\n    const V* insert_value = value_or_deltas + key_idx * dim;\n    const bool is_accum = accum_or_assigns[key_idx];\n\n    size_t bkt_idx = 0;\n    size_t start_idx = 0;\n    int src_lane = -1;\n    K evicted_key;\n\n    Bucket<K, V, S>* bucket =\n        get_key_position<K>(table->buckets, insert_key, bkt_idx, start_idx,\n                            buckets_num, bucket_max_size);\n\n    OccupyResult occupy_result{OccupyResult::INITIAL};\n    const int bucket_size = buckets_size[bkt_idx];\n    do {\n      if (bucket_size < bucket_max_size) {\n        occupy_result = find_and_lock_when_vacant<K, V, S, TILE_SIZE>(\n            g, bucket, insert_key, insert_score, evicted_key, start_idx,\n            key_pos, src_lane, bucket_max_size);\n      } else {\n        start_idx = (start_idx / TILE_SIZE) * TILE_SIZE;\n        occupy_result = find_and_lock_when_full<K, V, S, TILE_SIZE,\n                                                
ScoreFunctor::LOCK_MEM_ORDER,\n                                                ScoreFunctor::UNLOCK_MEM_ORDER>(\n            g, bucket, insert_key, insert_score, evicted_key, start_idx,\n            key_pos, src_lane, bucket_max_size);\n      }\n\n      occupy_result = g.shfl(occupy_result, src_lane);\n    } while (occupy_result == OccupyResult::CONTINUE);\n\n    if (occupy_result == OccupyResult::REFUSED) continue;\n\n    if ((is_accum && occupy_result != OccupyResult::DUPLICATE) ||\n        (!is_accum && occupy_result == OccupyResult::DUPLICATE)) {\n      if (g.thread_rank() == src_lane) {\n        if (occupy_result == OccupyResult::OCCUPIED_EMPTY) {\n          evicted_key = static_cast<K>(EMPTY_KEY);\n        }\n        if (occupy_result == OccupyResult::OCCUPIED_RECLAIMED) {\n          evicted_key = static_cast<K>(RECLAIM_KEY);\n        }\n        if (occupy_result == OccupyResult::DUPLICATE) {\n          evicted_key = insert_key;\n        }\n        (bucket->keys(key_pos))\n            ->store(evicted_key, ScoreFunctor::UNLOCK_MEM_ORDER);\n      }\n      g.sync();\n      continue;\n    }\n    if ((occupy_result == OccupyResult::OCCUPIED_EMPTY ||\n         occupy_result == OccupyResult::OCCUPIED_RECLAIMED) &&\n        g.thread_rank() == src_lane) {\n      atomicAdd(&(buckets_size[bkt_idx]), 1);\n    }\n\n    accum_or_assign_vector<V, TILE_SIZE>(\n        g, insert_value, bucket->vectors + key_pos * dim, is_accum, dim);\n\n    if (g.thread_rank() == src_lane) {\n      ScoreFunctor::update(bucket, key_pos, scores, key_idx, insert_score,\n                           (occupy_result != OccupyResult::DUPLICATE));\n      bucket->digests(key_pos)[0] = get_digest<K>(insert_key);\n      (bucket->keys(key_pos))\n          ->store(insert_key, ScoreFunctor::UNLOCK_MEM_ORDER);\n    }\n  }\n}\n\ntemplate <typename K, typename V, typename S, int Strategy>\nstruct SelectAccumOrAssignKernelWithIO {\n  static void execute_kernel(\n      const float& load_factor, const int& 
block_size,\n      const size_t bucket_max_size, const size_t buckets_num, const size_t dim,\n      cudaStream_t& stream, const size_t& n,\n      const Table<K, V, S>* __restrict table, const K* __restrict keys,\n      const V* __restrict value_or_deltas, const S* __restrict scores,\n      const bool* __restrict accum_or_assigns, const S global_epoch) {\n    if (load_factor <= 0.75) {\n      const unsigned int tile_size = 4;\n      const size_t N = n * tile_size;\n      const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);\n      accum_or_assign_kernel_with_io<K, V, S, Strategy, tile_size>\n          <<<grid_size, block_size, 0, stream>>>(\n              table, bucket_max_size, buckets_num, dim, keys, value_or_deltas,\n              scores, accum_or_assigns, global_epoch, N);\n    } else {\n      const unsigned int tile_size = 32;\n      const size_t N = n * tile_size;\n      const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);\n      accum_or_assign_kernel_with_io<K, V, S, Strategy, tile_size>\n          <<<grid_size, block_size, 0, stream>>>(\n              table, bucket_max_size, buckets_num, dim, keys, value_or_deltas,\n              scores, accum_or_assigns, global_epoch, N);\n    }\n    return;\n  }\n};\n\ntemplate <class K, class V, class S, int Strategy, uint32_t TILE_SIZE = 4>\n__global__ void accum_or_assign_kernel(\n    const Table<K, V, S>* __restrict table, const size_t bucket_max_size,\n    const size_t buckets_num, const size_t dim, const K* __restrict keys,\n    V** __restrict value_or_deltas, const S* __restrict scores,\n    const bool* __restrict accum_or_assigns, int* __restrict src_offset,\n    bool* __restrict founds, const S global_epoch, size_t N) {\n  auto g = cg::tiled_partition<TILE_SIZE>(cg::this_thread_block());\n  int* buckets_size = table->buckets_size;\n\n  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;\n\n  for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N;\n       t += blockDim.x * gridDim.x) {\n    
int key_pos = -1;\n    size_t key_idx = t / TILE_SIZE;\n\n    const K insert_key = keys[key_idx];\n\n    if (IS_RESERVED_KEY<K>(insert_key)) continue;\n\n    const S insert_score =\n        ScoreFunctor::desired_when_missed(scores, key_idx, global_epoch);\n\n    const bool is_accum = accum_or_assigns[key_idx];\n\n    size_t bkt_idx = 0;\n    size_t start_idx = 0;\n    int src_lane = -1;\n    K evicted_key;\n\n    Bucket<K, V, S>* bucket =\n        get_key_position<K>(table->buckets, insert_key, bkt_idx, start_idx,\n                            buckets_num, bucket_max_size);\n\n    if (g.thread_rank() == 0) {\n      *(src_offset + key_idx) = key_idx;\n    }\n\n    OccupyResult occupy_result{OccupyResult::INITIAL};\n    const int bucket_size = buckets_size[bkt_idx];\n    do {\n      if (bucket_size < bucket_max_size) {\n        occupy_result = find_and_lock_when_vacant<K, V, S, TILE_SIZE>(\n            g, bucket, insert_key, insert_score, evicted_key, start_idx,\n            key_pos, src_lane, bucket_max_size);\n      } else {\n        start_idx = (start_idx / TILE_SIZE) * TILE_SIZE;\n        occupy_result = find_and_lock_when_full<K, V, S, TILE_SIZE,\n                                                ScoreFunctor::LOCK_MEM_ORDER,\n                                                ScoreFunctor::UNLOCK_MEM_ORDER>(\n            g, bucket, insert_key, insert_score, evicted_key, start_idx,\n            key_pos, src_lane, bucket_max_size);\n      }\n\n      occupy_result = g.shfl(occupy_result, src_lane);\n    } while (occupy_result == OccupyResult::CONTINUE);\n\n    if (occupy_result == OccupyResult::REFUSED) continue;\n\n    if ((is_accum && occupy_result != OccupyResult::DUPLICATE) ||\n        (!is_accum && occupy_result == OccupyResult::DUPLICATE)) {\n      if (g.thread_rank() == src_lane) {\n        if (occupy_result == OccupyResult::OCCUPIED_EMPTY) {\n          evicted_key = static_cast<K>(EMPTY_KEY);\n        }\n        if (occupy_result == 
OccupyResult::OCCUPIED_RECLAIMED) {\n          evicted_key = static_cast<K>(RECLAIM_KEY);\n        }\n        if (occupy_result == OccupyResult::DUPLICATE) {\n          evicted_key = insert_key;\n        }\n\n        (bucket->keys(key_pos))\n            ->store(evicted_key, ScoreFunctor::UNLOCK_MEM_ORDER);\n      }\n      g.sync();\n      continue;\n    }\n\n    if ((occupy_result == OccupyResult::OCCUPIED_EMPTY ||\n         occupy_result == OccupyResult::OCCUPIED_RECLAIMED) &&\n        g.thread_rank() == src_lane) {\n      atomicAdd(&(buckets_size[bkt_idx]), 1);\n    }\n\n    if (g.thread_rank() == src_lane) {\n      *(value_or_deltas + key_idx) = (bucket->vectors + key_pos * dim);\n      *(founds + key_idx) = is_accum;\n      bucket->digests(key_pos)[0] = get_digest<K>(insert_key);\n      ScoreFunctor::update(bucket, key_pos, scores, key_idx, insert_score,\n                           (occupy_result != OccupyResult::DUPLICATE));\n      (bucket->keys(key_pos))\n          ->store(insert_key, ScoreFunctor::UNLOCK_MEM_ORDER);\n    }\n  }\n}\n\n}  // namespace merlin\n}  // namespace nv"
  },
  {
    "path": "include/merlin/core_kernels/contains.cuh",
    "content": "/*\n * Copyright (c) 2023, NVIDIA CORPORATION.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http:///www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n#pragma once\n\n#include \"kernel_utils.cuh\"\n\nnamespace nv {\nnamespace merlin {\n\ntemplate <typename K = uint64_t, typename V = float, typename S = uint64_t>\nstruct ContainsKernelParams {\n  ContainsKernelParams(Bucket<K, V, S>* __restrict buckets_,\n                       size_t buckets_num_, uint32_t dim_,\n                       const K* __restrict keys_, bool* __restrict founds_,\n                       size_t n_)\n      : buckets(buckets_),\n        buckets_num(buckets_num_),\n        dim(dim_),\n        keys(keys_),\n        founds(founds_),\n        n(n_) {}\n  Bucket<K, V, S>* __restrict buckets;\n  size_t buckets_num;\n  uint32_t dim;\n  const K* __restrict keys;\n  bool* __restrict founds;\n  size_t n;\n};\n\n// Using 32 threads to deal with one key\ntemplate <typename K = uint64_t, typename V = float, typename S = uint64_t>\n__global__ void contains_kernel_pipeline(Bucket<K, V, S>* buckets,\n                                         const size_t buckets_num,\n                                         const int dim,\n                                         const K* __restrict keys,\n                                         bool* __restrict founds, size_t n) {\n  constexpr int GROUP_SIZE = 32;\n  constexpr int RESERVE = 16;\n  constexpr int BLOCK_SIZE = 128;\n  constexpr int BUCKET_SIZE = 128;\n  constexpr int 
GROUP_NUM = BLOCK_SIZE / GROUP_SIZE;\n  constexpr int DIGEST_SPAN = BUCKET_SIZE / 4;\n\n  __shared__ int sm_target_digests[BLOCK_SIZE];\n  __shared__ K sm_target_keys[BLOCK_SIZE];\n  __shared__ K* sm_keys_ptr[BLOCK_SIZE];\n  // Reuse\n  int* sm_counts = sm_target_digests;\n\n  // Double buffer\n  __shared__ uint32_t sm_probing_digests[2][GROUP_NUM * DIGEST_SPAN];\n  __shared__ K sm_possible_keys[2][GROUP_NUM * RESERVE];\n\n  // Initialization\n  auto g = cg::tiled_partition<GROUP_SIZE>(cg::this_thread_block());\n  int groupID = threadIdx.x / GROUP_SIZE;\n  int rank = g.thread_rank();\n  int key_idx_base = (blockIdx.x * blockDim.x) + groupID * GROUP_SIZE;\n  if (key_idx_base >= n) return;\n  int loop_num =\n      (n - key_idx_base) < GROUP_SIZE ? (n - key_idx_base) : GROUP_SIZE;\n  if (rank < loop_num) {\n    int idx_block = groupID * GROUP_SIZE + rank;\n    K target_key = keys[key_idx_base + rank];\n    sm_target_keys[idx_block] = target_key;\n    const K hashed_key = Murmur3HashDevice(target_key);\n    const uint8_t target_digest = static_cast<uint8_t>(hashed_key >> 32);\n    sm_target_digests[idx_block] = static_cast<uint32_t>(target_digest);\n    int global_idx = hashed_key % (buckets_num * BUCKET_SIZE);\n    int bkt_idx = global_idx / BUCKET_SIZE;\n    Bucket<K, V, S>* bucket = buckets + bkt_idx;\n    __pipeline_memcpy_async(sm_keys_ptr + idx_block, bucket->keys_addr(),\n                            sizeof(K*));\n    __pipeline_commit();\n  }\n  __pipeline_wait_prior(0);\n\n  // Pipeline loading\n  uint8_t* digests_ptr =\n      reinterpret_cast<uint8_t*>(sm_keys_ptr[groupID * GROUP_SIZE]) -\n      BUCKET_SIZE;\n  __pipeline_memcpy_async(sm_probing_digests[0] + groupID * DIGEST_SPAN + rank,\n                          digests_ptr + rank * 4, sizeof(uint32_t));\n  __pipeline_commit();\n  __pipeline_commit();\n\n  for (int i = 0; i < loop_num; i++) {\n    int key_idx_block = groupID * GROUP_SIZE + i;\n\n    /* Step1: prefetch all digests in one bucket */\n    if ((i 
+ 1) < loop_num) {\n      uint8_t* digests_ptr =\n          reinterpret_cast<uint8_t*>(sm_keys_ptr[key_idx_block + 1]) -\n          BUCKET_SIZE;\n      __pipeline_memcpy_async(\n          sm_probing_digests[diff_buf(i)] + groupID * DIGEST_SPAN + rank,\n          digests_ptr + rank * 4, sizeof(uint32_t));\n    }\n    __pipeline_commit();\n\n    /* Step2: check digests and load possible keys */\n    uint32_t target_digest = sm_target_digests[key_idx_block];\n    uint32_t target_digests = __byte_perm(target_digest, target_digest, 0x0000);\n    sm_counts[key_idx_block] = 0;\n    __pipeline_wait_prior(2);\n    uint32_t probing_digests =\n        sm_probing_digests[same_buf(i)][groupID * DIGEST_SPAN + rank];\n    uint32_t find_result_ = __vcmpeq4(probing_digests, target_digests);\n    uint32_t find_result = 0;\n    if ((find_result_ & 0x01) != 0) find_result |= 0x01;\n    if ((find_result_ & 0x0100) != 0) find_result |= 0x02;\n    if ((find_result_ & 0x010000) != 0) find_result |= 0x04;\n    if ((find_result_ & 0x01000000) != 0) find_result |= 0x08;\n    int find_number = __popc(find_result);\n    int group_base = 0;\n    if (find_number > 0) {\n      group_base = atomicAdd(sm_counts + key_idx_block, find_number);\n    }\n    bool gt_reserve = (group_base + find_number) > RESERVE;\n    int gt_vote = g.ballot(gt_reserve);\n    K* key_ptr = sm_keys_ptr[key_idx_block];\n    if (gt_vote == 0) {\n      do {\n        int digest_idx = __ffs(find_result) - 1;\n        if (digest_idx >= 0) {\n          find_result &= (find_result - 1);\n          int key_pos = rank * 4 + digest_idx;\n          __pipeline_memcpy_async(\n              sm_possible_keys[same_buf(i)] + (groupID * RESERVE + group_base),\n              key_ptr + key_pos, sizeof(K));\n          group_base += 1;\n        } else {\n          break;\n        }\n      } while (true);\n    } else {\n      K target_key = sm_target_keys[key_idx_block];\n      sm_counts[key_idx_block] = 0;\n      int found_vote = 0;\n      bool 
found = false;\n      do {\n        int digest_idx = __ffs(find_result) - 1;\n        if (digest_idx >= 0) {\n          find_result &= (find_result - 1);\n          int key_pos = rank * 4 + digest_idx;\n          K possible_key = key_ptr[key_pos];\n          if (possible_key == target_key) {\n            found = true;\n            sm_counts[key_idx_block] = 1;\n            sm_possible_keys[same_buf(i)][groupID * RESERVE] = possible_key;\n          }\n        }\n        found_vote = g.ballot(found);\n        if (found_vote) {\n          break;\n        }\n        found_vote = digest_idx >= 0;\n      } while (g.any(found_vote));\n    }\n    __pipeline_commit();\n\n    /* Step3: check possible keys, and prefetch the value and score */\n    if (i > 0) {\n      key_idx_block -= 1;\n      int key_idx_grid = blockIdx.x * blockDim.x + key_idx_block;\n      K target_key = sm_target_keys[key_idx_block];\n      int possible_num = sm_counts[key_idx_block];\n      __pipeline_wait_prior(2);\n      bool found_flag = false;\n      if (rank < possible_num) {\n        K possible_key =\n            sm_possible_keys[diff_buf(i)][groupID * RESERVE + rank];\n        if (possible_key == target_key) {\n          found_flag = true;\n        }\n      }\n      int found_vote = g.ballot(found_flag);\n      founds[key_idx_grid] = (found_vote > 0);\n    }\n  }  // End loop\n\n  /* Pipeline emptying: step3, i = loop_num */\n  {\n    int key_idx_block = groupID * GROUP_SIZE + (loop_num - 1);\n    int key_idx_grid = blockIdx.x * blockDim.x + key_idx_block;\n    K target_key = sm_target_keys[key_idx_block];\n    int possible_num = sm_counts[key_idx_block];\n    __pipeline_wait_prior(0);\n    bool found_flag = false;\n    if (rank < possible_num) {\n      K possible_key =\n          sm_possible_keys[diff_buf(loop_num)][groupID * RESERVE + rank];\n      if (target_key == possible_key) {\n        found_flag = true;\n      }\n    }\n    int found_vote = g.ballot(found_flag);\n    founds[key_idx_grid] = 
(found_vote > 0);\n  }\n\n}  // End function\n\ntemplate <typename K, typename V, typename S>\nstruct LaunchPipelineContains {\n  static void launch_kernel(ContainsKernelParams<K, V, S>& params,\n                            cudaStream_t& stream) {\n    constexpr int BLOCK_SIZE = 128;\n    // Using 32 threads to deal with one key\n    contains_kernel_pipeline<K, V, S>\n        <<<(params.n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(\n            params.buckets, params.buckets_num, params.dim, params.keys,\n            params.founds, params.n);\n  }\n};\n\ntemplate <typename K, typename V, typename S = uint64_t,\n          typename ArchTag = Sm80>\nstruct SelectPipelineContainsKernel {\n  static void select_kernel(ContainsKernelParams<K, V, S>& params,\n                            cudaStream_t& stream) {\n    LaunchPipelineContains<K, V, S>::launch_kernel(params, stream);\n  }\n};\n\ntemplate <class K, class V, class S, uint32_t TILE_SIZE = 4>\n__global__ void contains_kernel(const Table<K, V, S>* __restrict table,\n                                Bucket<K, V, S>* buckets,\n                                const size_t bucket_max_size,\n                                const size_t buckets_num, const size_t dim,\n                                const K* __restrict keys,\n                                bool* __restrict found, size_t N) {\n  int* buckets_size = table->buckets_size;\n\n  auto g = cg::tiled_partition<TILE_SIZE>(cg::this_thread_block());\n  int rank = g.thread_rank();\n\n  for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N;\n       t += blockDim.x * gridDim.x) {\n    int key_idx = t / TILE_SIZE;\n\n    const K find_key = keys[key_idx];\n    if (IS_RESERVED_KEY<K>(find_key)) continue;\n\n    int key_pos = -1;\n    int src_lane = -1;\n    size_t bkt_idx = 0;\n    size_t start_idx = 0;\n\n    Bucket<K, V, S>* bucket = get_key_position<K>(\n        buckets, find_key, bkt_idx, start_idx, buckets_num, bucket_max_size);\n\n    const int 
bucket_size = buckets_size[bkt_idx];\n    if (bucket_size >= bucket_max_size) {\n      start_idx = (start_idx / TILE_SIZE) * TILE_SIZE;\n    }\n\n    OccupyResult occupy_result{OccupyResult::INITIAL};\n    occupy_result = find_without_lock<K, V, S, TILE_SIZE>(\n        g, bucket, find_key, start_idx, key_pos, src_lane, bucket_max_size);\n\n    if (rank == src_lane) {\n      *(found + key_idx) = (occupy_result == OccupyResult::DUPLICATE);\n    }\n  }\n}\n\ntemplate <typename K, typename V, typename S>\nstruct SelectContainsKernel {\n  static void execute_kernel(const float& load_factor, const int& block_size,\n                             const size_t bucket_max_size,\n                             const size_t buckets_num, const size_t dim,\n                             cudaStream_t& stream, const size_t& n,\n                             const Table<K, V, S>* __restrict table,\n                             Bucket<K, V, S>* buckets, const K* __restrict keys,\n                             bool* __restrict found) {\n    if (load_factor <= 0.75) {\n      const unsigned int tile_size = 4;\n      const size_t N = n * tile_size;\n      const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);\n      contains_kernel<K, V, S, tile_size><<<grid_size, block_size, 0, stream>>>(\n          table, buckets, bucket_max_size, buckets_num, dim, keys, found, N);\n    } else {\n      const unsigned int tile_size = 16;\n      const size_t N = n * tile_size;\n      const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);\n      contains_kernel<K, V, S, tile_size><<<grid_size, block_size, 0, stream>>>(\n          table, buckets, bucket_max_size, buckets_num, dim, keys, found, N);\n    }\n    return;\n  }\n};\n\n}  // namespace merlin\n}  // namespace nv"
  },
  {
    "path": "include/merlin/core_kernels/dual_bucket_lookup.cuh",
    "content": "/*\n * Copyright (c) 2024, NVIDIA CORPORATION.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n#pragma once\n\n#include \"dual_bucket_utils.cuh\"\n#include \"kernel_utils.cuh\"\n\nnamespace nv {\nnamespace merlin {\n\n/**\n * Dual-bucket pipeline lookup kernel (sequential two-bucket search).\n *\n * For each key, computes (b1, b2) via high/low 32-bit split of Murmur3 hash.\n * First probes b1; if not found, probes b2.\n * Uses dual_bucket_digest (bit[56:63]) to avoid digest collision with b2\n * addressing.\n *\n * Architecture: Based on lookup_kernel_with_io_pipeline_v1 with 32 threads\n * per key, 128-thread blocks, 128-slot buckets. 
4-stage IO pipeline\n * (prefetch digests -> digest match + key load -> key verify + value prefetch\n * -> value writeback).\n */\ntemplate <class K, class V, class S, class VecV,\n          typename CopyScore = CopyScoreEmpty<S, K, 128>,\n          typename CopyValue = CopyValueTwoGroup<VecV, 32>,\n          typename FoundFunctor = FoundFunctorV1<K>, int VALUE_BUF = 56>\n__global__ void dual_bucket_pipeline_lookup_kernel_with_io(\n    Bucket<K, V, S>* buckets, const int32_t* __restrict__ buckets_size,\n    const size_t buckets_num, const int dim, const K* __restrict keys,\n    VecV* __restrict values, S* __restrict scores, FoundFunctor found_functor,\n    size_t n) {\n  constexpr int GROUP_SIZE = 32;\n  constexpr int RESERVE = 16;\n  constexpr int BLOCK_SIZE = 128;\n  constexpr int BUCKET_SIZE = 128;\n  constexpr int GROUP_NUM = BLOCK_SIZE / GROUP_SIZE;\n  constexpr int DIGEST_SPAN = BUCKET_SIZE / 4;\n\n  using BUCKET = Bucket<K, V, S>;\n\n  // Shared memory declarations.\n  __shared__ int sm_target_digests[BLOCK_SIZE];\n  __shared__ K sm_target_keys[BLOCK_SIZE];\n  __shared__ K* sm_keys_ptr1[BLOCK_SIZE];       // b1 bucket keys ptr\n  __shared__ K* sm_keys_ptr2[BLOCK_SIZE];       // b2 bucket keys ptr\n  __shared__ VecV* sm_values_ptr1[BLOCK_SIZE];  // b1 values ptr\n  __shared__ VecV* sm_values_ptr2[BLOCK_SIZE];  // b2 values ptr\n  __shared__ S sm_target_scores[BLOCK_SIZE];\n  // Reuse sm_target_digests\n  int* sm_counts = sm_target_digests;\n  int* sm_founds = sm_counts;\n  // Double buffer\n  __shared__ uint32_t sm_probing_digests[2][GROUP_NUM * DIGEST_SPAN];\n  __shared__ K sm_possible_keys[2][GROUP_NUM * RESERVE];\n  __shared__ int sm_possible_pos[2][GROUP_NUM * RESERVE];\n  __shared__ VecV sm_vector[2][GROUP_NUM][VALUE_BUF];\n\n  // Initialization.\n  auto g = cg::tiled_partition<GROUP_SIZE>(cg::this_thread_block());\n  int groupID = threadIdx.x / GROUP_SIZE;\n  int rank = g.thread_rank();\n  int key_idx_base = (blockIdx.x * blockDim.x) + groupID * 
GROUP_SIZE;\n  if (key_idx_base >= n) return;\n  int loop_num =\n      (n - key_idx_base) < GROUP_SIZE ? (n - key_idx_base) : GROUP_SIZE;\n\n  // Phase 1: Initialize per-key data (hash, digest, bucket pointers).\n  // Save digest in register to avoid recomputing Murmur3 hash in Pass 2\n  // (sm_target_digests is aliased with sm_counts/sm_founds and gets\n  // corrupted during Pass 1).\n  uint32_t reg_target_digest = 0;\n  if (rank < loop_num) {\n    int idx_block = groupID * GROUP_SIZE + rank;\n    K target_key = keys[key_idx_base + rank];\n    sm_target_keys[idx_block] = target_key;\n    const K hashed_key = Murmur3HashDevice(target_key);\n\n    // Dual-bucket digest: bit[56:63]\n    const uint8_t target_digest =\n        static_cast<uint8_t>(static_cast<uint64_t>(hashed_key) >> 56);\n    reg_target_digest = static_cast<uint32_t>(target_digest);\n    sm_target_digests[idx_block] = reg_target_digest;\n\n    // Dual-bucket positions (centralized in dual_bucket_utils.cuh).\n    size_t bkt_idx1, bkt_idx2;\n    get_dual_bucket_indices<K>(hashed_key, buckets_num, bkt_idx1, bkt_idx2);\n\n    BUCKET* bucket1 = buckets + bkt_idx1;\n    BUCKET* bucket2 = buckets + bkt_idx2;\n    sm_keys_ptr1[idx_block] = reinterpret_cast<K*>(bucket1->keys(0));\n    sm_keys_ptr2[idx_block] = reinterpret_cast<K*>(bucket2->keys(0));\n    __pipeline_memcpy_async(sm_values_ptr1 + idx_block, &(bucket1->vectors),\n                            sizeof(VecV*));\n    __pipeline_commit();\n    __pipeline_memcpy_async(sm_values_ptr2 + idx_block, &(bucket2->vectors),\n                            sizeof(VecV*));\n  }\n  __pipeline_wait_prior(0);\n\n  // The pipeline lookup below is written out inline once per bucket (there\n  // is no shared helper): all keys are first pipelined through bucket b1,\n  // then keys that missed are pipelined through bucket b2.\n\n  // --- PASS 1: Search bucket b1 ---\n  // Pipeline loading for b1.\n  {\n    uint8_t* digests_ptr =\n        reinterpret_cast<uint8_t*>(sm_keys_ptr1[groupID * 
GROUP_SIZE]) -\n        BUCKET_SIZE;\n    __pipeline_memcpy_async(\n        sm_probing_digests[0] + groupID * DIGEST_SPAN + rank,\n        digests_ptr + rank * 4, sizeof(uint32_t));\n  }\n  __pipeline_commit();\n  __pipeline_commit();\n  __pipeline_commit();\n\n  for (int i = 0; i < loop_num; i++) {\n    int key_idx_block = groupID * GROUP_SIZE + i;\n\n    // Step1: prefetch digests for next key's b1 bucket.\n    if ((i + 1) < loop_num) {\n      uint8_t* digests_ptr =\n          reinterpret_cast<uint8_t*>(sm_keys_ptr1[key_idx_block + 1]) -\n          BUCKET_SIZE;\n      __pipeline_memcpy_async(\n          sm_probing_digests[diff_buf(i)] + groupID * DIGEST_SPAN + rank,\n          digests_ptr + rank * 4, sizeof(uint32_t));\n    }\n    __pipeline_commit();\n\n    // Step2: check digests and load possible keys.\n    uint32_t target_digest = sm_target_digests[key_idx_block];\n    uint32_t target_digests_vec =\n        __byte_perm(target_digest, target_digest, 0x0000);\n    sm_counts[key_idx_block] = 0;\n    __pipeline_wait_prior(3);\n    uint32_t probing_digests =\n        sm_probing_digests[same_buf(i)][groupID * DIGEST_SPAN + rank];\n    uint32_t find_result_ = __vcmpeq4(probing_digests, target_digests_vec);\n    uint32_t find_result = 0;\n    if ((find_result_ & 0x01) != 0) find_result |= 0x01;\n    if ((find_result_ & 0x0100) != 0) find_result |= 0x02;\n    if ((find_result_ & 0x010000) != 0) find_result |= 0x04;\n    if ((find_result_ & 0x01000000) != 0) find_result |= 0x08;\n    int find_number = __popc(find_result);\n    int group_base = 0;\n    if (find_number > 0) {\n      group_base = atomicAdd(sm_counts + key_idx_block, find_number);\n    }\n    bool gt_reserve = (group_base + find_number) > RESERVE;\n    int gt_vote = g.ballot(gt_reserve);\n    K* key_ptr = sm_keys_ptr1[key_idx_block];\n    if (gt_vote == 0) {\n      do {\n        int digest_idx = __ffs(find_result) - 1;\n        if (digest_idx >= 0) {\n          find_result &= (find_result - 1);\n          
int key_pos = rank * 4 + digest_idx;\n          sm_possible_pos[same_buf(i)][groupID * RESERVE + group_base] =\n              key_pos;\n          __pipeline_memcpy_async(\n              sm_possible_keys[same_buf(i)] + (groupID * RESERVE + group_base),\n              key_ptr + key_pos, sizeof(K));\n          group_base += 1;\n        } else {\n          break;\n        }\n      } while (true);\n    } else {\n      K target_key = sm_target_keys[key_idx_block];\n      sm_counts[key_idx_block] = 0;\n      int found_vote = 0;\n      bool found = false;\n      do {\n        int digest_idx = __ffs(find_result) - 1;\n        if (digest_idx >= 0) {\n          find_result &= (find_result - 1);\n          int key_pos = rank * 4 + digest_idx;\n          K possible_key = key_ptr[key_pos];\n          if (possible_key == target_key) {\n            found = true;\n            sm_counts[key_idx_block] = 1;\n            sm_possible_pos[same_buf(i)][groupID * RESERVE] = key_pos;\n            sm_possible_keys[same_buf(i)][groupID * RESERVE] = possible_key;\n          }\n        }\n        found_vote = g.ballot(found);\n        if (found_vote) break;\n        found_vote = digest_idx >= 0;\n      } while (g.any(found_vote));\n    }\n    __pipeline_commit();\n\n    // Step3: verify keys, prefetch values.\n    if (i > 0) {\n      int prev_block = groupID * GROUP_SIZE + i - 1;\n      K target_key = sm_target_keys[prev_block];\n      int possible_num = sm_counts[prev_block];\n      sm_founds[prev_block] = 0;\n      S* score_ptr = CopyScore::get_base_ptr(sm_keys_ptr1, prev_block);\n      VecV* value_ptr = sm_values_ptr1[prev_block];\n      __pipeline_wait_prior(3);\n      int key_pos;\n      bool found_flag = false;\n      if (rank < possible_num) {\n        K possible_key =\n            sm_possible_keys[diff_buf(i)][groupID * RESERVE + rank];\n        key_pos = sm_possible_pos[diff_buf(i)][groupID * RESERVE + rank];\n        if (possible_key == target_key) {\n          found_flag = true;\n   
       CopyScore::ldg_sts(sm_target_scores + prev_block,\n                             score_ptr + key_pos);\n        }\n      }\n      int found_vote = g.ballot(found_flag);\n      if (found_vote) {\n        VecV* v_dst = sm_vector[diff_buf(i)][groupID];\n        sm_founds[prev_block] = 1;\n        int src_lane = __ffs(found_vote) - 1;\n        int target_pos = g.shfl(key_pos, src_lane);\n        VecV* v_src = value_ptr + target_pos * dim;\n        CopyValue::ldg_sts(rank, v_dst, v_src, dim);\n      }\n    }\n    __pipeline_commit();\n\n    // Step4: write back value and score.\n    if (i > 1) {\n      int wb_block = groupID * GROUP_SIZE + i - 2;\n      int key_idx_grid = blockIdx.x * blockDim.x + wb_block;\n      VecV* v_src = sm_vector[same_buf(i)][groupID];\n      VecV* v_dst = values + key_idx_grid * dim;\n      int found_flag = sm_founds[wb_block];\n      __pipeline_wait_prior(3);\n      if (found_flag > 0) {\n        S score_ = CopyScore::lgs(sm_target_scores + wb_block);\n        CopyValue::lds_stg(rank, v_dst, v_src, dim);\n        CopyScore::stg(scores + key_idx_grid, score_);\n      }\n    }\n  }\n\n  // Pipeline emptying for b1: step3 for last key.\n  {\n    int key_idx_block = groupID * GROUP_SIZE + (loop_num - 1);\n    K target_key = sm_target_keys[key_idx_block];\n    int possible_num = sm_counts[key_idx_block];\n    sm_founds[key_idx_block] = 0;\n    S* score_ptr = CopyScore::get_base_ptr(sm_keys_ptr1, key_idx_block);\n    VecV* value_ptr = sm_values_ptr1[key_idx_block];\n    __pipeline_wait_prior(1);\n    int key_pos;\n    bool found_flag = false;\n    if (rank < possible_num) {\n      key_pos = sm_possible_pos[diff_buf(loop_num)][groupID * RESERVE + rank];\n      K possible_key =\n          sm_possible_keys[diff_buf(loop_num)][groupID * RESERVE + rank];\n      if (target_key == possible_key) {\n        found_flag = true;\n        CopyScore::ldg_sts(sm_target_scores + key_idx_block,\n                           score_ptr + key_pos);\n      }\n    
}\n    int found_vote = g.ballot(found_flag);\n    if (found_vote) {\n      sm_founds[key_idx_block] = 1;\n      int src_lane = __ffs(found_vote) - 1;\n      int target_pos = g.shfl(key_pos, src_lane);\n      VecV* v_src = value_ptr + target_pos * dim;\n      VecV* v_dst = sm_vector[diff_buf(loop_num)][groupID];\n      CopyValue::ldg_sts(rank, v_dst, v_src, dim);\n    }\n  }\n  __pipeline_commit();\n\n  // Pipeline emptying: step4 for second-to-last key.\n  if (loop_num > 1) {\n    int key_idx_block = groupID * GROUP_SIZE + loop_num - 2;\n    int key_idx_grid = blockIdx.x * blockDim.x + key_idx_block;\n    VecV* v_src = sm_vector[same_buf(loop_num)][groupID];\n    VecV* v_dst = values + key_idx_grid * dim;\n    int found_flag = sm_founds[key_idx_block];\n    __pipeline_wait_prior(1);\n    if (found_flag > 0) {\n      S score_ = CopyScore::lgs(sm_target_scores + key_idx_block);\n      CopyValue::lds_stg(rank, v_dst, v_src, dim);\n      CopyScore::stg(scores + key_idx_grid, score_);\n    }\n  }\n\n  // Pipeline emptying: step4 for last key.\n  {\n    int key_idx_block = groupID * GROUP_SIZE + loop_num - 1;\n    int key_idx_grid = blockIdx.x * blockDim.x + key_idx_block;\n    VecV* v_src = sm_vector[same_buf(loop_num + 1)][groupID];\n    VecV* v_dst = values + key_idx_grid * dim;\n    int found_flag = sm_founds[key_idx_block];\n    __pipeline_wait_prior(0);\n    if (found_flag > 0) {\n      S score_ = CopyScore::lgs(sm_target_scores + key_idx_block);\n      CopyValue::lds_stg(rank, v_dst, v_src, dim);\n      CopyScore::stg(scores + key_idx_grid, score_);\n    }\n  }\n\n  // Finalize b1 pass and record found status.\n  // Keys found in b1 are marked. 
Unfound keys need b2 search.\n  if (rank < loop_num) {\n    int key_idx_block = groupID * GROUP_SIZE + rank;\n    int key_idx_grid = blockIdx.x * blockDim.x + key_idx_block;\n    // Only write found for b1 hits; b2 pass will handle misses.\n    if (sm_founds[key_idx_block] > 0) {\n      found_functor(key_idx_grid, sm_target_keys[key_idx_block], true);\n    }\n  }\n\n  // --- PASS 2: Search bucket b2 for keys not found in b1 ---\n  // Check for unfound keys. If all were found in b1, skip b2 entirely.\n  int any_unfound = 0;\n  if (rank < loop_num) {\n    int key_idx_block = groupID * GROUP_SIZE + rank;\n    if (sm_founds[key_idx_block] == 0) {\n      any_unfound = 1;\n    }\n  }\n  any_unfound = g.any(any_unfound);\n  if (!any_unfound) return;\n\n  // Save b1 found flags (sm_founds will be reused).\n  // We use a simple approach: store per-thread found flag in register.\n  int b1_found = 0;\n  if (rank < loop_num) {\n    b1_found = sm_founds[groupID * GROUP_SIZE + rank];\n  }\n\n  // Restore digests from registers saved during Phase 1 init.\n  // sm_target_digests was aliased with sm_counts/sm_founds and corrupted\n  // during Pass 1.  
Using the register avoids recomputing Murmur3 hash.\n  if (rank < loop_num) {\n    int idx_block = groupID * GROUP_SIZE + rank;\n    sm_target_digests[idx_block] = reg_target_digest;\n  }\n  __syncwarp();\n\n  // Pipeline loading for b2.\n  {\n    uint8_t* digests_ptr =\n        reinterpret_cast<uint8_t*>(sm_keys_ptr2[groupID * GROUP_SIZE]) -\n        BUCKET_SIZE;\n    __pipeline_memcpy_async(\n        sm_probing_digests[0] + groupID * DIGEST_SPAN + rank,\n        digests_ptr + rank * 4, sizeof(uint32_t));\n  }\n  __pipeline_commit();\n  __pipeline_commit();\n  __pipeline_commit();\n\n  for (int i = 0; i < loop_num; i++) {\n    int key_idx_block = groupID * GROUP_SIZE + i;\n    // Check if this key was already found in b1.\n    int skip = g.shfl(b1_found, i);\n\n    // Step1: prefetch digests for next key's b2 bucket.\n    if ((i + 1) < loop_num) {\n      uint8_t* digests_ptr =\n          reinterpret_cast<uint8_t*>(sm_keys_ptr2[key_idx_block + 1]) -\n          BUCKET_SIZE;\n      __pipeline_memcpy_async(\n          sm_probing_digests[diff_buf(i)] + groupID * DIGEST_SPAN + rank,\n          digests_ptr + rank * 4, sizeof(uint32_t));\n    }\n    __pipeline_commit();\n\n    // Step2: check digests and load possible keys (skip if found in b1).\n    // Read digest BEFORE zeroing sm_counts (they alias sm_target_digests).\n    uint32_t target_digest = sm_target_digests[key_idx_block];\n    sm_counts[key_idx_block] = 0;\n    if (!skip) {\n      uint32_t target_digests_vec =\n          __byte_perm(target_digest, target_digest, 0x0000);\n      __pipeline_wait_prior(3);\n      uint32_t probing_digests =\n          sm_probing_digests[same_buf(i)][groupID * DIGEST_SPAN + rank];\n      uint32_t find_result_ = __vcmpeq4(probing_digests, target_digests_vec);\n      uint32_t find_result = 0;\n      if ((find_result_ & 0x01) != 0) find_result |= 0x01;\n      if ((find_result_ & 0x0100) != 0) find_result |= 0x02;\n      if ((find_result_ & 0x010000) != 0) find_result |= 0x04;\n      
if ((find_result_ & 0x01000000) != 0) find_result |= 0x08;\n      int find_number = __popc(find_result);\n      int group_base = 0;\n      if (find_number > 0) {\n        group_base = atomicAdd(sm_counts + key_idx_block, find_number);\n      }\n      bool gt_reserve = (group_base + find_number) > RESERVE;\n      int gt_vote = g.ballot(gt_reserve);\n      K* key_ptr = sm_keys_ptr2[key_idx_block];\n      if (gt_vote == 0) {\n        do {\n          int digest_idx = __ffs(find_result) - 1;\n          if (digest_idx >= 0) {\n            find_result &= (find_result - 1);\n            int key_pos = rank * 4 + digest_idx;\n            sm_possible_pos[same_buf(i)][groupID * RESERVE + group_base] =\n                key_pos;\n            __pipeline_memcpy_async(sm_possible_keys[same_buf(i)] +\n                                        (groupID * RESERVE + group_base),\n                                    key_ptr + key_pos, sizeof(K));\n            group_base += 1;\n          } else {\n            break;\n          }\n        } while (true);\n      } else {\n        K target_key = sm_target_keys[key_idx_block];\n        sm_counts[key_idx_block] = 0;\n        int found_vote = 0;\n        bool found = false;\n        do {\n          int digest_idx = __ffs(find_result) - 1;\n          if (digest_idx >= 0) {\n            find_result &= (find_result - 1);\n            int key_pos = rank * 4 + digest_idx;\n            K possible_key = key_ptr[key_pos];\n            if (possible_key == target_key) {\n              found = true;\n              sm_counts[key_idx_block] = 1;\n              sm_possible_pos[same_buf(i)][groupID * RESERVE] = key_pos;\n              sm_possible_keys[same_buf(i)][groupID * RESERVE] = possible_key;\n            }\n          }\n          found_vote = g.ballot(found);\n          if (found_vote) break;\n          found_vote = digest_idx >= 0;\n        } while (g.any(found_vote));\n      }\n    } else {\n      __pipeline_wait_prior(3);\n    }\n    
__pipeline_commit();\n\n    // Step3: verify keys and prefetch values from b2.\n    if (i > 0) {\n      int prev_block = groupID * GROUP_SIZE + i - 1;\n      int prev_skip = g.shfl(b1_found, i - 1);\n      if (!prev_skip) {\n        K target_key = sm_target_keys[prev_block];\n        // Read count BEFORE zeroing (sm_counts aliases sm_founds).\n        int possible_num = sm_counts[prev_block];\n        sm_founds[prev_block] = 0;\n        S* score_ptr = CopyScore::get_base_ptr(sm_keys_ptr2, prev_block);\n        VecV* value_ptr = sm_values_ptr2[prev_block];\n        __pipeline_wait_prior(3);\n        int key_pos;\n        bool found_flag = false;\n        if (rank < possible_num) {\n          K possible_key =\n              sm_possible_keys[diff_buf(i)][groupID * RESERVE + rank];\n          key_pos = sm_possible_pos[diff_buf(i)][groupID * RESERVE + rank];\n          if (possible_key == target_key) {\n            found_flag = true;\n            CopyScore::ldg_sts(sm_target_scores + prev_block,\n                               score_ptr + key_pos);\n          }\n        }\n        int found_vote = g.ballot(found_flag);\n        if (found_vote) {\n          VecV* v_dst = sm_vector[diff_buf(i)][groupID];\n          sm_founds[prev_block] = 1;\n          int src_lane = __ffs(found_vote) - 1;\n          int target_pos = g.shfl(key_pos, src_lane);\n          VecV* v_src = value_ptr + target_pos * dim;\n          CopyValue::ldg_sts(rank, v_dst, v_src, dim);\n        }\n      } else {\n        __pipeline_wait_prior(3);\n      }\n    }\n    __pipeline_commit();\n\n    // Step4: write back values from b2.\n    if (i > 1) {\n      int wb_block = groupID * GROUP_SIZE + i - 2;\n      int prev_skip = g.shfl(b1_found, i - 2);\n      if (!prev_skip) {\n        int key_idx_grid = blockIdx.x * blockDim.x + wb_block;\n        VecV* v_src = sm_vector[same_buf(i)][groupID];\n        VecV* v_dst = values + key_idx_grid * dim;\n        int found_flag = sm_founds[wb_block];\n        
__pipeline_wait_prior(3);\n        if (found_flag > 0) {\n          S score_ = CopyScore::lgs(sm_target_scores + wb_block);\n          CopyValue::lds_stg(rank, v_dst, v_src, dim);\n          CopyScore::stg(scores + key_idx_grid, score_);\n        }\n      } else {\n        __pipeline_wait_prior(3);\n      }\n    }\n  }\n\n  // Pipeline emptying for b2: step3 for last key.\n  {\n    int key_idx_block = groupID * GROUP_SIZE + (loop_num - 1);\n    int last_skip = g.shfl(b1_found, loop_num - 1);\n    if (!last_skip) {\n      K target_key = sm_target_keys[key_idx_block];\n      // Read count BEFORE zeroing (sm_counts aliases sm_founds).\n      int possible_num = sm_counts[key_idx_block];\n      sm_founds[key_idx_block] = 0;\n      S* score_ptr = CopyScore::get_base_ptr(sm_keys_ptr2, key_idx_block);\n      VecV* value_ptr = sm_values_ptr2[key_idx_block];\n      __pipeline_wait_prior(1);\n      int key_pos;\n      bool found_flag = false;\n      if (rank < possible_num) {\n        key_pos = sm_possible_pos[diff_buf(loop_num)][groupID * RESERVE + rank];\n        K possible_key =\n            sm_possible_keys[diff_buf(loop_num)][groupID * RESERVE + rank];\n        if (target_key == possible_key) {\n          found_flag = true;\n          CopyScore::ldg_sts(sm_target_scores + key_idx_block,\n                             score_ptr + key_pos);\n        }\n      }\n      int found_vote = g.ballot(found_flag);\n      if (found_vote) {\n        sm_founds[key_idx_block] = 1;\n        int src_lane = __ffs(found_vote) - 1;\n        int target_pos = g.shfl(key_pos, src_lane);\n        VecV* v_src = value_ptr + target_pos * dim;\n        VecV* v_dst = sm_vector[diff_buf(loop_num)][groupID];\n        CopyValue::ldg_sts(rank, v_dst, v_src, dim);\n      }\n    } else {\n      __pipeline_wait_prior(1);\n    }\n  }\n  __pipeline_commit();\n\n  // Pipeline emptying: step4 for second-to-last key.\n  if (loop_num > 1) {\n    int key_idx_block = groupID * GROUP_SIZE + loop_num - 2;\n    int 
prev_skip = g.shfl(b1_found, loop_num - 2);\n    if (!prev_skip) {\n      int key_idx_grid = blockIdx.x * blockDim.x + key_idx_block;\n      VecV* v_src = sm_vector[same_buf(loop_num)][groupID];\n      VecV* v_dst = values + key_idx_grid * dim;\n      int found_flag = sm_founds[key_idx_block];\n      __pipeline_wait_prior(1);\n      if (found_flag > 0) {\n        S score_ = CopyScore::lgs(sm_target_scores + key_idx_block);\n        CopyValue::lds_stg(rank, v_dst, v_src, dim);\n        CopyScore::stg(scores + key_idx_grid, score_);\n      }\n    } else {\n      __pipeline_wait_prior(1);\n    }\n  }\n\n  // Pipeline emptying: step4 for last key.\n  {\n    int key_idx_block = groupID * GROUP_SIZE + loop_num - 1;\n    int last_skip = g.shfl(b1_found, loop_num - 1);\n    if (!last_skip) {\n      int key_idx_grid = blockIdx.x * blockDim.x + key_idx_block;\n      VecV* v_src = sm_vector[same_buf(loop_num + 1)][groupID];\n      VecV* v_dst = values + key_idx_grid * dim;\n      int found_flag = sm_founds[key_idx_block];\n      __pipeline_wait_prior(0);\n      if (found_flag > 0) {\n        S score_ = CopyScore::lgs(sm_target_scores + key_idx_block);\n        CopyValue::lds_stg(rank, v_dst, v_src, dim);\n        CopyScore::stg(scores + key_idx_grid, score_);\n      }\n    } else {\n      __pipeline_wait_prior(0);\n    }\n  }\n\n  // Finalize b2 pass: report found for keys found in b2.\n  if (rank < loop_num) {\n    int key_idx_block = groupID * GROUP_SIZE + rank;\n    int key_idx_grid = blockIdx.x * blockDim.x + key_idx_block;\n    if (b1_found == 0) {\n      // Key was not found in b1; report b2 result.\n      found_functor(key_idx_grid, sm_target_keys[key_idx_block],\n                    sm_founds[key_idx_block] > 0);\n    }\n  }\n}\n\n// --- Kernel Launchers ---\n\ntemplate <typename K, typename V, typename S, typename CopyScore, typename VecV,\n          uint32_t ValueBufSize>\nstruct LaunchDualBucketLookupV1 {\n  template <template <typename, typename, typename> 
typename LookupKernelParams>\n  static void launch_kernel(LookupKernelParams<K, V, S>& params,\n                            const int32_t* buckets_size, cudaStream_t& stream) {\n    constexpr int BLOCK_SIZE = 128;\n    constexpr int GROUP_SIZE = 32;\n    params.dim = params.dim * sizeof(V) / sizeof(VecV);\n    constexpr uint32_t VecSize = ValueBufSize / sizeof(VecV);\n    if (params.dim > (GROUP_SIZE * 2)) {\n      using CopyValue = CopyValueMultipleGroup<VecV, GROUP_SIZE>;\n      dual_bucket_pipeline_lookup_kernel_with_io<\n          K, V, S, VecV, CopyScore, CopyValue, decltype(params.found_functor),\n          VecSize>\n          <<<(params.n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(\n              params.buckets, buckets_size, params.buckets_num, params.dim,\n              params.keys, reinterpret_cast<VecV*>(params.values),\n              params.scores, params.found_functor, params.n);\n    } else if (params.dim > GROUP_SIZE) {\n      using CopyValue = CopyValueTwoGroup<VecV, GROUP_SIZE>;\n      dual_bucket_pipeline_lookup_kernel_with_io<\n          K, V, S, VecV, CopyScore, CopyValue, decltype(params.found_functor),\n          VecSize>\n          <<<(params.n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(\n              params.buckets, buckets_size, params.buckets_num, params.dim,\n              params.keys, reinterpret_cast<VecV*>(params.values),\n              params.scores, params.found_functor, params.n);\n    } else {\n      using CopyValue = CopyValueOneGroup<VecV, GROUP_SIZE>;\n      dual_bucket_pipeline_lookup_kernel_with_io<\n          K, V, S, VecV, CopyScore, CopyValue, decltype(params.found_functor),\n          VecSize>\n          <<<(params.n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(\n              params.buckets, buckets_size, params.buckets_num, params.dim,\n              params.keys, reinterpret_cast<VecV*>(params.values),\n              params.scores, params.found_functor, params.n);\n    }\n  
}\n};\n\n// --- Kernel Selector ---\n\ntemplate <typename K, typename V, typename S = uint64_t,\n          typename ArchTag = Sm80>\nstruct SelectDualBucketLookupKernel {\n  using ValueBufConfig = LookupValueBufConfig<ArchTag>;\n\n  static inline uint32_t max_value_size() {\n    return ValueBufConfig::size_pipeline_v1;\n  }\n\n  template <template <typename, typename, typename> typename LookupKernelParams>\n  static void select_kernel(LookupKernelParams<K, V, S>& params,\n                            const int32_t* buckets_size, cudaStream_t& stream) {\n    constexpr int BUCKET_SIZE = 128;\n    constexpr uint32_t buf_size_v1 = ValueBufConfig::size_pipeline_v1;\n\n    uint32_t total_value_size = static_cast<uint32_t>(params.dim * sizeof(V));\n\n    // For dual-bucket lookup, we use v1 kernel (32 threads/key) only.\n    if (params.scores == nullptr) {\n      using CopyScore = CopyScoreEmpty<S, K, BUCKET_SIZE>;\n      if (total_value_size % sizeof(float4) == 0) {\n        using VecV = float4;\n        LaunchDualBucketLookupV1<K, V, S, CopyScore, VecV,\n                                 buf_size_v1>::launch_kernel(params,\n                                                             buckets_size,\n                                                             stream);\n      } else if (total_value_size % sizeof(float2) == 0) {\n        using VecV = float2;\n        LaunchDualBucketLookupV1<K, V, S, CopyScore, VecV,\n                                 buf_size_v1>::launch_kernel(params,\n                                                             buckets_size,\n                                                             stream);\n      } else if (total_value_size % sizeof(float) == 0) {\n        using VecV = float;\n        LaunchDualBucketLookupV1<K, V, S, CopyScore, VecV,\n                                 buf_size_v1>::launch_kernel(params,\n                                                             buckets_size,\n                                                         
    stream);\n      } else if (total_value_size % sizeof(uint16_t) == 0) {\n        using VecV = uint16_t;\n        LaunchDualBucketLookupV1<K, V, S, CopyScore, VecV,\n                                 buf_size_v1>::launch_kernel(params,\n                                                             buckets_size,\n                                                             stream);\n      } else {\n        using VecV = uint8_t;\n        LaunchDualBucketLookupV1<K, V, S, CopyScore, VecV,\n                                 buf_size_v1>::launch_kernel(params,\n                                                             buckets_size,\n                                                             stream);\n      }\n    } else {\n      using CopyScore = CopyScoreByPassCache<S, K, BUCKET_SIZE>;\n      if (total_value_size % sizeof(float4) == 0) {\n        using VecV = float4;\n        LaunchDualBucketLookupV1<K, V, S, CopyScore, VecV,\n                                 buf_size_v1>::launch_kernel(params,\n                                                             buckets_size,\n                                                             stream);\n      } else if (total_value_size % sizeof(float2) == 0) {\n        using VecV = float2;\n        LaunchDualBucketLookupV1<K, V, S, CopyScore, VecV,\n                                 buf_size_v1>::launch_kernel(params,\n                                                             buckets_size,\n                                                             stream);\n      } else if (total_value_size % sizeof(float) == 0) {\n        using VecV = float;\n        LaunchDualBucketLookupV1<K, V, S, CopyScore, VecV,\n                                 buf_size_v1>::launch_kernel(params,\n                                                             buckets_size,\n                                                             stream);\n      } else if (total_value_size % sizeof(uint16_t) == 0) {\n        using VecV = uint16_t;\n        
LaunchDualBucketLookupV1<K, V, S, CopyScore, VecV,\n                                 buf_size_v1>::launch_kernel(params,\n                                                             buckets_size,\n                                                             stream);\n      } else {\n        using VecV = uint8_t;\n        LaunchDualBucketLookupV1<K, V, S, CopyScore, VecV,\n                                 buf_size_v1>::launch_kernel(params,\n                                                             buckets_size,\n                                                             stream);\n      }\n    }\n  }\n};\n\n}  // namespace merlin\n}  // namespace nv\n"
  },
  {
    "path": "include/merlin/core_kernels/dual_bucket_upsert.cuh",
    "content": "/*\n * Copyright (c) 2024, NVIDIA CORPORATION.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n#pragma once\n\n#include \"dual_bucket_utils.cuh\"\n#include \"kernel_utils.cuh\"\n\nnamespace nv {\nnamespace merlin {\n\n/**\n * Dual-bucket pipeline upsert kernel — True Two-Choice.\n *\n * Implements dual-bucket insert_or_assign with three distinct phases:\n *   Phase 0: DUPLICATE detection in BOTH buckets (no empty-slot occupation)\n *   Phase 1: D1 Two-Choice load-balance — compare bucket sizes, insert into\n *            the emptier bucket first, fallback to the other\n *   Phase 2: D2 score-eviction — when both buckets full, evict the entry\n *            with the global minimum score across both buckets\n *\n * Key invariant: DUPLICATE search completes in BOTH buckets before any\n * empty-slot insertion attempt. 
This ensures correct insert_or_assign\n * semantics (no spurious duplicates across buckets).\n *\n * Concurrent model: pure slot-level CAS (no per-bucket Mutex).\n * Constraint: unique_key=true (caller guarantees no duplicate keys in batch).\n *\n * Based on pipeline_upsert_kernel_with_io architecture:\n * - 32 threads per key (GROUP_SIZE)\n * - 128-thread blocks\n * - 128-slot buckets\n * - 4-stage software pipeline\n */\ntemplate <class K, class V, class S, class VecV, int BLOCK_SIZE = 128,\n          int Strategy = 0>\n__global__ void dual_bucket_pipeline_upsert_kernel_with_io(\n    Bucket<K, V, S>* __restrict__ buckets, int32_t* __restrict__ buckets_size,\n    const uint64_t buckets_num, const uint32_t dim, const K* __restrict__ keys,\n    const VecV* __restrict__ values, const S* __restrict__ scores, uint64_t n,\n    const S global_epoch) {\n  constexpr uint32_t BUCKET_SIZE = 128;\n  constexpr uint32_t GROUP_SIZE = 32;\n  constexpr uint32_t Comp_LEN = sizeof(VecD_Comp) / sizeof(D);\n  constexpr uint32_t Load_LEN = sizeof(VecD_Load) / sizeof(D);\n  constexpr uint32_t Load_LEN_S = sizeof(byte16) / sizeof(S);\n\n  using BUCKET = Bucket<K, V, S>;\n  using CopyValue = CopyValueMultipleGroup<VecV, GROUP_SIZE>;\n  using SMM = SharedMemoryManager_Pipeline_Upsert<K, V, S, VecV, BLOCK_SIZE,\n                                                  GROUP_SIZE, BUCKET_SIZE>;\n  using ScoreFunctor_ = ScoreFunctor<K, V, S, Strategy>;\n\n  __shared__ extern __align__(alignof(byte16)) byte smem[];\n\n  auto g = cg::tiled_partition<GROUP_SIZE>(cg::this_thread_block());\n  uint32_t tx = threadIdx.x;\n  uint32_t kv_idx = blockIdx.x * blockDim.x + tx;\n  K key{static_cast<K>(EMPTY_KEY)};\n  VecD_Comp target_digests;\n  K* bucket_keys_ptr1{nullptr};\n  K* bucket_keys_ptr2{nullptr};\n  VecV* bucket_values_ptr2{nullptr};\n  int* bucket_size_ptr2{nullptr};\n  OccupyResult occupy_result{OccupyResult::INITIAL};\n  uint32_t key_pos = 0;\n  uint32_t key_pos2 = 0;  // b2 start position 
(independent from b1)\n  int target_bucket = 1;  // 1 = b1, 2 = b2\n\n  if (kv_idx < n) {\n    key = keys[kv_idx];\n    if (scores != nullptr) {\n      S* sm_param_scores = SMM::param_scores(smem);\n      __pipeline_memcpy_async(sm_param_scores + tx, scores + kv_idx, sizeof(S));\n    }\n    if (!IS_RESERVED_KEY<K>(key)) {\n      const K hashed_key = Murmur3HashDevice(key);\n      // Dual-bucket digest from bit[56:63].\n      target_digests = dual_bucket_digests_from_hashed<K>(hashed_key);\n\n      // Dual-bucket indices (centralized in dual_bucket_utils.cuh).\n      size_t bkt_idx1, bkt_idx2;\n      get_dual_bucket_indices<K>(hashed_key, buckets_num, bkt_idx1, bkt_idx2);\n\n      // b1 setup (stored in SMM shared memory).\n      const uint32_t lo = static_cast<uint32_t>(hashed_key);\n      uint64_t global_idx1 =\n          static_cast<uint64_t>(lo % (buckets_num * BUCKET_SIZE));\n      key_pos = get_start_position(global_idx1, BUCKET_SIZE);\n\n      // b2 start position from high 32 bits (independent from b1).\n      const uint32_t hi =\n          static_cast<uint32_t>(static_cast<uint64_t>(hashed_key) >> 32);\n      uint64_t global_idx2 =\n          static_cast<uint64_t>(hi % (buckets_num * BUCKET_SIZE));\n      key_pos2 = get_start_position(global_idx2, BUCKET_SIZE);\n\n      int** sm_buckets_size_ptr = SMM::buckets_size_ptr(smem);\n      sm_buckets_size_ptr[tx] = buckets_size + bkt_idx1;\n\n      BUCKET* bucket1 = buckets + bkt_idx1;\n      bucket_keys_ptr1 = reinterpret_cast<K*>(bucket1->keys(0));\n      VecV** sm_bucket_values_ptr = SMM::bucket_values_ptr(smem);\n      __pipeline_memcpy_async(sm_bucket_values_ptr + tx, &(bucket1->vectors),\n                              sizeof(VecV*));\n\n      // b2 setup (stored in registers, broadcast via warp shuffle).\n      BUCKET* bucket2 = buckets + bkt_idx2;\n      bucket_keys_ptr2 = reinterpret_cast<K*>(bucket2->keys(0));\n      bucket_values_ptr2 = reinterpret_cast<VecV*>(bucket2->vectors);\n      bucket_size_ptr2 = 
buckets_size + bkt_idx2;\n    } else {\n      occupy_result = OccupyResult::ILLEGAL;\n    }\n  } else {\n    occupy_result = OccupyResult::ILLEGAL;\n  }\n\n  uint32_t rank = g.thread_rank();\n  uint32_t groupID = threadIdx.x / GROUP_SIZE;\n\n  // =========== Main pipeline loop (processes one key per iteration)\n  // =========== True Two-Choice algorithm for each key i in the warp:\n  //   Phase 0: DUPLICATE detection in BOTH b1 and b2 (no empty occupation)\n  //   Phase 1: D1 Two-Choice — compare bucket sizes, try emptier bucket first\n  //   Phase 2: D2 score-eviction when both buckets are full\n\n  auto occupy_result_next = g.shfl(occupy_result, 0);\n  auto keys_ptr_next = g.shfl(bucket_keys_ptr1, 0);\n\n  // Prefetch b1 digests for first key.\n  if (occupy_result_next == OccupyResult::INITIAL) {\n    D* sm_bucket_digests = SMM::bucket_digests(smem, groupID, 0);\n    D* dst = sm_bucket_digests + rank * Load_LEN;\n    D* src = BUCKET::digests(keys_ptr_next, BUCKET_SIZE, rank * Load_LEN);\n    if (rank * Load_LEN < BUCKET_SIZE) {\n      __pipeline_memcpy_async(dst, src, sizeof(VecD_Load));\n    }\n  }\n  __pipeline_commit();\n  __pipeline_commit();\n  __pipeline_commit();\n\n  for (int32_t i = 0; i < GROUP_SIZE; i++) {\n    // === Step 1: Prefetch b1 digests for next key ===\n    if (i + 1 < GROUP_SIZE) {\n      auto occupy_result_next = g.shfl(occupy_result, i + 1);\n      auto keys_ptr_next = g.shfl(bucket_keys_ptr1, i + 1);\n      if (occupy_result_next == OccupyResult::INITIAL) {\n        D* sm_bucket_digests = SMM::bucket_digests(smem, groupID, diff_buf(i));\n        D* dst = sm_bucket_digests + rank * Load_LEN;\n        D* src = BUCKET::digests(keys_ptr_next, BUCKET_SIZE, rank * Load_LEN);\n        if (rank * Load_LEN < BUCKET_SIZE) {\n          __pipeline_memcpy_async(dst, src, sizeof(VecD_Load));\n        }\n      }\n    }\n    __pipeline_commit();\n\n    // === Step 2: Three-phase True Two-Choice probe ===\n    auto occupy_result_cur = 
g.shfl(occupy_result, i);\n    if (occupy_result_cur == OccupyResult::INITIAL) {\n      uint32_t tx_cur = groupID * GROUP_SIZE + i;\n      int** sm_buckets_size_ptr = SMM::buckets_size_ptr(smem);\n      auto bucket_size_ptr1 = sm_buckets_size_ptr[tx_cur];\n      K key_cur = g.shfl(key, i);\n      auto target_digests_cur = g.shfl(target_digests, i);\n      auto start_pos_cur = g.shfl(key_pos, i);\n      auto keys_ptr_cur = g.shfl(bucket_keys_ptr1, i);\n\n      // b2 info for key i (shuffled from owning thread).\n      auto keys_ptr2_cur = g.shfl(bucket_keys_ptr2, i);\n      auto bsize_ptr2_cur = reinterpret_cast<int*>(static_cast<uintptr_t>(\n          g.shfl(static_cast<unsigned long long>(\n                     reinterpret_cast<uintptr_t>(bucket_size_ptr2)),\n                 i)));\n      auto start_pos2_cur = g.shfl(key_pos2, i);\n\n      __pipeline_wait_prior(3);\n      D* digest_src = SMM::bucket_digests(smem, groupID, same_buf(i));\n\n      // b1 probe offset (from b1's hash).\n      uint32_t start_offset = start_pos_cur / Comp_LEN;\n      uint32_t probe_offset =\n          Comp_LEN * ((start_offset + rank) & (GROUP_SIZE - 1));\n      VecD_Comp probe_digests =\n          *reinterpret_cast<VecD_Comp*>(digest_src + probe_offset);\n      uint32_t cmp_result = __vcmpeq4(probe_digests, target_digests_cur);\n      cmp_result &= 0x01010101;\n\n      // b2 probe offset (from b2's independent hash).\n      uint32_t start_offset2 = start_pos2_cur / Comp_LEN;\n      uint32_t b2_probe_offset =\n          Comp_LEN * ((start_offset2 + rank) & (GROUP_SIZE - 1));\n      // Load b2 digests (synchronous read).\n      D* b2_digests_ptr = BUCKET::digests(keys_ptr2_cur, BUCKET_SIZE, 0);\n      VecD_Comp b2_probe_digests =\n          *reinterpret_cast<VecD_Comp*>(b2_digests_ptr + b2_probe_offset);\n      uint32_t b2_cmp = __vcmpeq4(b2_probe_digests, target_digests_cur);\n      b2_cmp &= 0x01010101;\n\n      // ============================================================\n      // 
Phase 0: DUPLICATE detection in BOTH buckets\n      // ============================================================\n\n      // --- Phase 0a: DUPLICATE scan in b1 ---\n      uint32_t possible_pos = 0;\n      bool result = false;\n      {\n        uint32_t cmp_copy = cmp_result;\n        do {\n          if (cmp_copy == 0) break;\n          int32_t index = (__ffs(cmp_copy) - 1) >> 3;\n          cmp_copy &= (cmp_copy - 1);\n          possible_pos = probe_offset + index;\n          auto current_key = BUCKET::keys(keys_ptr_cur, possible_pos);\n          K expected_key = key_cur;\n          result = current_key->compare_exchange_strong(\n              expected_key, static_cast<K>(LOCKED_KEY),\n              cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);\n        } while (!result);\n      }\n\n      uint32_t found_vote = g.ballot(result);\n      if (found_vote) {\n        // DUPLICATE found in b1 -> update in place.\n        int32_t src_lane = __ffs(found_vote) - 1;\n        possible_pos = g.shfl(possible_pos, src_lane);\n        if (rank == i) {\n          occupy_result = OccupyResult::DUPLICATE;\n          key_pos = possible_pos;\n          target_bucket = 1;\n          S* sm_param_scores = SMM::param_scores(smem);\n          // Note: desired_when_missed is intentionally used here for\n          // DUPLICATE keys.  For kCustomized strategy the actual score\n          // semantics are determined by update_with_digest, which\n          // overwrites the score unconditionally.  
The naming is\n          // inherited from the single-bucket API and does not imply\n          // \"key was absent\".\n          S score = ScoreFunctor_::desired_when_missed(sm_param_scores, tx,\n                                                       global_epoch);\n          D digest = get_dual_bucket_digest<K>(key);\n          ScoreFunctor_::update_with_digest(bucket_keys_ptr1, key_pos,\n                                            sm_param_scores, tx, score,\n                                            BUCKET_SIZE, digest, false);\n        }\n      }\n\n      // --- Phase 0b: DUPLICATE scan in b2 (only if not found in b1) ---\n      occupy_result_cur = g.shfl(occupy_result, i);\n      if (occupy_result_cur == OccupyResult::INITIAL) {\n        result = false;\n        possible_pos = 0;\n        {\n          uint32_t cmp_copy = b2_cmp;\n          do {\n            if (cmp_copy == 0) break;\n            int32_t index = (__ffs(cmp_copy) - 1) >> 3;\n            cmp_copy &= (cmp_copy - 1);\n            possible_pos = b2_probe_offset + index;\n            auto current_key = BUCKET::keys(keys_ptr2_cur, possible_pos);\n            K expected_key = key_cur;\n            result = current_key->compare_exchange_strong(\n                expected_key, static_cast<K>(LOCKED_KEY),\n                cuda::std::memory_order_acquire,\n                cuda::std::memory_order_relaxed);\n          } while (!result);\n        }\n\n        found_vote = g.ballot(result);\n        if (found_vote) {\n          // DUPLICATE found in b2.\n          int32_t src_lane = __ffs(found_vote) - 1;\n          possible_pos = g.shfl(possible_pos, src_lane);\n          if (rank == i) {\n            occupy_result = OccupyResult::DUPLICATE;\n            key_pos = possible_pos;\n            target_bucket = 2;\n            S* sm_param_scores = SMM::param_scores(smem);\n            // See Phase 0a comment: desired_when_missed is used for\n            // DUPLICATE keys; actual semantics governed by\n            
// update_with_digest.\n            S score = ScoreFunctor_::desired_when_missed(sm_param_scores, tx,\n                                                         global_epoch);\n            D digest = get_dual_bucket_digest<K>(key);\n            ScoreFunctor_::update_with_digest(bucket_keys_ptr2, key_pos,\n                                              sm_param_scores, tx, score,\n                                              BUCKET_SIZE, digest, false);\n          }\n        }\n      }\n\n      // ============================================================\n      // Phase 1: D1 Two-Choice load-balanced EMPTY insertion\n      // ============================================================\n      occupy_result_cur = g.shfl(occupy_result, i);\n      if (occupy_result_cur == OccupyResult::INITIAL) {\n        auto bucket_size1 = *bucket_size_ptr1;\n        auto bucket_size2 = *bsize_ptr2_cur;\n\n        // True Two-Choice: prefer the emptier bucket.\n        bool prefer_b1 = (bucket_size1 <= bucket_size2);\n\n        // First bucket (emptier one).\n        K* first_keys_ptr = prefer_b1 ? keys_ptr_cur : keys_ptr2_cur;\n        int* first_bsize_ptr = prefer_b1 ? bucket_size_ptr1 : bsize_ptr2_cur;\n        int first_size = prefer_b1 ? bucket_size1 : bucket_size2;\n        VecD_Comp first_probe_digests =\n            prefer_b1 ? probe_digests : b2_probe_digests;\n        uint32_t first_probe_offset =\n            prefer_b1 ? probe_offset : b2_probe_offset;\n        int first_bucket_id = prefer_b1 ? 1 : 2;\n\n        // Second bucket (fuller one).\n        K* second_keys_ptr = prefer_b1 ? keys_ptr2_cur : keys_ptr_cur;\n        int* second_bsize_ptr = prefer_b1 ? bsize_ptr2_cur : bucket_size_ptr1;\n        int second_size = prefer_b1 ? bucket_size2 : bucket_size1;\n        VecD_Comp second_probe_digests =\n            prefer_b1 ? b2_probe_digests : probe_digests;\n        uint32_t second_probe_offset =\n            prefer_b1 ? 
b2_probe_offset : probe_offset;\n        int second_bucket_id = prefer_b1 ? 2 : 1;\n\n        // --- Try EMPTY in first (emptier) bucket ---\n        if (first_size < BUCKET_SIZE) {\n          VecD_Comp empty_digests_ = dual_bucket_empty_digests<K>();\n          uint32_t empty_result =\n              __vcmpeq4(first_probe_digests, empty_digests_);\n          empty_result &= 0x01010101;\n          result = false;\n          possible_pos = 0;\n          for (int32_t offset = 0; offset < GROUP_SIZE; offset += 1) {\n            if (rank == offset) {\n              do {\n                if (empty_result == 0) break;\n                int32_t index = (__ffs(empty_result) - 1) >> 3;\n                empty_result &= (empty_result - 1);\n                possible_pos = first_probe_offset + index;\n                auto current_key = BUCKET::keys(first_keys_ptr, possible_pos);\n                K expected_key = static_cast<K>(EMPTY_KEY);\n                result = current_key->compare_exchange_strong(\n                    expected_key, static_cast<K>(LOCKED_KEY),\n                    cuda::std::memory_order_acquire,\n                    cuda::std::memory_order_relaxed);\n              } while (!result);\n            }\n            found_vote = g.ballot(result);\n            if (found_vote) {\n              int32_t src_lane = __ffs(found_vote) - 1;\n              possible_pos = g.shfl(possible_pos, src_lane);\n              if (rank == i) {\n                occupy_result = OccupyResult::OCCUPIED_EMPTY;\n                key_pos = possible_pos;\n                target_bucket = first_bucket_id;\n                S* sm_param_scores = SMM::param_scores(smem);\n                S score = ScoreFunctor_::desired_when_missed(sm_param_scores,\n                                                             tx, global_epoch);\n                D digest = get_dual_bucket_digest<K>(key);\n                K* target_keys = (first_bucket_id == 1) ? 
bucket_keys_ptr1\n                                                        : bucket_keys_ptr2;\n                ScoreFunctor_::update_with_digest(target_keys, key_pos,\n                                                  sm_param_scores, tx, score,\n                                                  BUCKET_SIZE, digest, true);\n                atomicAdd(first_bsize_ptr, 1);\n              }\n              break;\n            }\n          }\n        }\n\n        // --- Try EMPTY in second (fuller) bucket (fallback) ---\n        occupy_result_cur = g.shfl(occupy_result, i);\n        if (occupy_result_cur == OccupyResult::INITIAL &&\n            second_size < BUCKET_SIZE) {\n          VecD_Comp empty_digests_ = dual_bucket_empty_digests<K>();\n          uint32_t empty_result =\n              __vcmpeq4(second_probe_digests, empty_digests_);\n          empty_result &= 0x01010101;\n          result = false;\n          possible_pos = 0;\n          for (int32_t offset = 0; offset < GROUP_SIZE; offset += 1) {\n            if (rank == offset) {\n              do {\n                if (empty_result == 0) break;\n                int32_t index = (__ffs(empty_result) - 1) >> 3;\n                empty_result &= (empty_result - 1);\n                possible_pos = second_probe_offset + index;\n                auto current_key = BUCKET::keys(second_keys_ptr, possible_pos);\n                K expected_key = static_cast<K>(EMPTY_KEY);\n                result = current_key->compare_exchange_strong(\n                    expected_key, static_cast<K>(LOCKED_KEY),\n                    cuda::std::memory_order_acquire,\n                    cuda::std::memory_order_relaxed);\n              } while (!result);\n            }\n            found_vote = g.ballot(result);\n            if (found_vote) {\n              int32_t src_lane = __ffs(found_vote) - 1;\n              possible_pos = g.shfl(possible_pos, src_lane);\n              if (rank == i) {\n                occupy_result = 
OccupyResult::OCCUPIED_EMPTY;\n                key_pos = possible_pos;\n                target_bucket = second_bucket_id;\n                S* sm_param_scores = SMM::param_scores(smem);\n                S score = ScoreFunctor_::desired_when_missed(sm_param_scores,\n                                                             tx, global_epoch);\n                D digest = get_dual_bucket_digest<K>(key);\n                K* target_keys = (second_bucket_id == 1) ? bucket_keys_ptr1\n                                                         : bucket_keys_ptr2;\n                ScoreFunctor_::update_with_digest(target_keys, key_pos,\n                                                  sm_param_scores, tx, score,\n                                                  BUCKET_SIZE, digest, true);\n                atomicAdd(second_bsize_ptr, 1);\n              }\n              break;\n            }\n          }\n        }\n      }\n\n      // ============================================================\n      // Phase 2: D2 Score Eviction (both buckets full)\n      // ============================================================\n      occupy_result_cur = g.shfl(occupy_result, i);\n      if (occupy_result_cur == OccupyResult::INITIAL) {\n        S* sm_param_scores = SMM::param_scores(smem);\n        S score_cur = ScoreFunctor_::desired_when_missed(sm_param_scores,\n                                                         tx_cur, global_epoch);\n\n        S* b1_scores = BUCKET::scores(keys_ptr_cur, BUCKET_SIZE, 0);\n        S* b2_scores = BUCKET::scores(keys_ptr2_cur, BUCKET_SIZE, 0);\n\n        // Cache scores in per-thread registers for eviction retry.\n        constexpr int SCORES_PER_THREAD =\n            BUCKET_SIZE / (GROUP_SIZE * Load_LEN_S) * Load_LEN_S;\n        S b1_cached[SCORES_PER_THREAD];\n        int b1_pos_cached[SCORES_PER_THREAD];\n        S b2_cached[SCORES_PER_THREAD];\n        int b2_pos_cached[SCORES_PER_THREAD];\n        {\n          int idx = 0;\n          for 
(int j = 0; j < BUCKET_SIZE; j += GROUP_SIZE * Load_LEN_S) {\n            S tmp[Load_LEN_S];\n            *reinterpret_cast<byte16*>(tmp) =\n                *reinterpret_cast<byte16*>(b1_scores + rank * Load_LEN_S + j);\n            for (int k = 0; k < Load_LEN_S; k++) {\n              b1_cached[idx] = tmp[k];\n              b1_pos_cached[idx] = rank * Load_LEN_S + j + k;\n              idx++;\n            }\n          }\n        }\n        {\n          int idx = 0;\n          for (int j = 0; j < BUCKET_SIZE; j += GROUP_SIZE * Load_LEN_S) {\n            S tmp[Load_LEN_S];\n            *reinterpret_cast<byte16*>(tmp) =\n                *reinterpret_cast<byte16*>(b2_scores + rank * Load_LEN_S + j);\n            for (int k = 0; k < Load_LEN_S; k++) {\n              b2_cached[idx] = tmp[k];\n              b2_pos_cached[idx] = rank * Load_LEN_S + j + k;\n              idx++;\n            }\n          }\n        }\n\n        // Eviction retry loop.\n        while (true) {\n          occupy_result_cur = g.shfl(occupy_result, i);\n          if (occupy_result_cur != OccupyResult::INITIAL) break;\n\n          // Find per-thread min for b1 and b2 from cached scores.\n          S min_b1_local = static_cast<S>(MAX_SCORE);\n          int min_b1_idx = -1;\n          for (int s = 0; s < SCORES_PER_THREAD; s++) {\n            if (b1_cached[s] < min_b1_local) {\n              min_b1_local = b1_cached[s];\n              min_b1_idx = s;\n            }\n          }\n          S min_b2_local = static_cast<S>(MAX_SCORE);\n          int min_b2_idx = -1;\n          for (int s = 0; s < SCORES_PER_THREAD; s++) {\n            if (b2_cached[s] < min_b2_local) {\n              min_b2_local = b2_cached[s];\n              min_b2_idx = s;\n            }\n          }\n\n          S min_b1_global = cg::reduce(g, min_b1_local, cg::less<S>());\n          S min_b2_global = cg::reduce(g, min_b2_local, cg::less<S>());\n          S overall_min =\n              (min_b1_global <= min_b2_global) ? 
min_b1_global : min_b2_global;\n\n          // REFUSED: new score too low to evict anything.\n          if (score_cur < overall_min) {\n            if (rank == i) {\n              occupy_result = OccupyResult::REFUSED;\n            }\n            break;\n          }\n\n          // Pick the bucket with lower min_score (Two-Choice eviction).\n          bool use_b1 = (min_b1_global <= min_b2_global);\n          S min_score_local = use_b1 ? min_b1_local : min_b2_local;\n          int min_local_idx = use_b1 ? min_b1_idx : min_b2_idx;\n          int min_pos_local = (min_local_idx >= 0)\n                                  ? (use_b1 ? b1_pos_cached[min_local_idx]\n                                            : b2_pos_cached[min_local_idx])\n                                  : -1;\n          S min_score_global = use_b1 ? min_b1_global : min_b2_global;\n          K* evict_keys_ptr = use_b1 ? keys_ptr_cur : keys_ptr2_cur;\n          int* evict_bsize_ptr = use_b1 ? bucket_size_ptr1 : bsize_ptr2_cur;\n\n          uint32_t vote = g.ballot(min_score_local <= min_score_global);\n          if (vote) {\n            int src_lane = __ffs(vote) - 1;\n            int min_pos_evict = g.shfl(min_pos_local, src_lane);\n\n            // Mark this position as visited for the winning thread.\n            if (use_b1) {\n              int visited_idx = g.shfl(min_local_idx, src_lane);\n              if (rank == src_lane && visited_idx >= 0)\n                b1_cached[visited_idx] = static_cast<S>(MAX_SCORE);\n            } else {\n              int visited_idx = g.shfl(min_local_idx, src_lane);\n              if (rank == src_lane && visited_idx >= 0)\n                b2_cached[visited_idx] = static_cast<S>(MAX_SCORE);\n            }\n\n            if (rank == i) {\n              auto min_score_key = BUCKET::keys(evict_keys_ptr, min_pos_evict);\n              auto expected_key =\n                  min_score_key->load(cuda::std::memory_order_relaxed);\n              if (expected_key != 
static_cast<K>(LOCKED_KEY) &&\n                  expected_key != static_cast<K>(EMPTY_KEY)) {\n                bool cas_ok = min_score_key->compare_exchange_strong(\n                    expected_key, static_cast<K>(LOCKED_KEY),\n                    cuda::std::memory_order_acquire,\n                    cuda::std::memory_order_relaxed);\n                if (cas_ok) {\n                  S* score_ptr = BUCKET::scores(evict_keys_ptr, BUCKET_SIZE,\n                                                min_pos_evict);\n                  auto verify_score_ptr =\n                      reinterpret_cast<AtomicScore<S>*>(score_ptr);\n                  auto verify_score =\n                      verify_score_ptr->load(cuda::std::memory_order_relaxed);\n                  if (verify_score <= min_score_global) {\n                    if (expected_key == static_cast<K>(RECLAIM_KEY)) {\n                      occupy_result = OccupyResult::OCCUPIED_RECLAIMED;\n                      atomicAdd(evict_bsize_ptr, 1);\n                    } else {\n                      occupy_result = OccupyResult::EVICT;\n                    }\n                    key_pos = min_pos_evict;\n                    target_bucket = use_b1 ? 1 : 2;\n                    K* target_keys_ptr =\n                        use_b1 ? 
bucket_keys_ptr1 : bucket_keys_ptr2;\n                    D digest = get_dual_bucket_digest<K>(key);\n                    ScoreFunctor_::update_with_digest(\n                        target_keys_ptr, key_pos, sm_param_scores, tx,\n                        score_cur, BUCKET_SIZE, digest, true);\n                  } else {\n                    min_score_key->store(expected_key,\n                                         cuda::std::memory_order_release);\n                  }\n                }\n              }\n            }\n          } else {\n            // No thread holds the minimum — all positions exhausted.\n            if (rank == i) {\n              occupy_result = OccupyResult::REFUSED;\n            }\n            break;\n          }\n        }  // while eviction retry\n      }\n    }  // end of INITIAL check\n\n    // === Step 3: Prefetch values to shared memory for previous key ===\n    if (i > 0) {\n      auto occupy_result_prev = g.shfl(occupy_result, i - 1);\n      if (occupy_result_prev != OccupyResult::ILLEGAL &&\n          occupy_result_prev != OccupyResult::REFUSED) {\n        VecV* dst = SMM::values_buffer(smem, groupID, diff_buf(i), dim);\n        auto kv_idx_cur = g.shfl(kv_idx, i - 1);\n        const VecV* src = values + kv_idx_cur * dim;\n        CopyValue::ldg_sts(rank, dst, src, dim);\n      }\n    }\n    __pipeline_commit();\n\n    // === Step 4: Write values for key (i-2) ===\n    if (i > 1) {\n      auto occupy_result_wb = g.shfl(occupy_result, i - 2);\n      if (occupy_result_wb != OccupyResult::ILLEGAL &&\n          occupy_result_wb != OccupyResult::REFUSED) {\n        VecV* src = SMM::values_buffer(smem, groupID, same_buf(i), dim);\n        auto key_pos_wb = g.shfl(key_pos, i - 2);\n        auto target_bucket_wb = g.shfl(target_bucket, i - 2);\n\n        // Get the correct values pointer for the target bucket.\n        VecV* dst;\n        if (target_bucket_wb == 1) {\n          VecV** sm_bucket_values_ptr = SMM::bucket_values_ptr(smem);\n  
        dst = sm_bucket_values_ptr[groupID * GROUP_SIZE + i - 2] +\n                key_pos_wb * dim;\n        } else {\n          auto bv2 = g.shfl(bucket_values_ptr2, i - 2);\n          dst = bv2 + key_pos_wb * dim;\n        }\n        __pipeline_wait_prior(3);\n        CopyValue::lds_stg(rank, dst, src, dim);\n\n        // Unlock key.\n        if (rank == i - 2) {\n          K* target_keys_ptr =\n              (target_bucket == 1) ? bucket_keys_ptr1 : bucket_keys_ptr2;\n          auto key_address = BUCKET::keys(target_keys_ptr, key_pos);\n          key_address->store(key, cuda::std::memory_order_release);\n        }\n      }\n    }\n  }  // end main loop\n\n  // =========== Pipeline draining ===========\n\n  // Step 3 for last key (i = GROUP_SIZE - 1).\n  {\n    auto occupy_result_prev = g.shfl(occupy_result, GROUP_SIZE - 1);\n    if (occupy_result_prev != OccupyResult::ILLEGAL &&\n        occupy_result_prev != OccupyResult::REFUSED) {\n      VecV* dst = SMM::values_buffer(smem, groupID, diff_buf(GROUP_SIZE), dim);\n      auto kv_idx_cur = g.shfl(kv_idx, GROUP_SIZE - 1);\n      const VecV* src = values + kv_idx_cur * dim;\n      CopyValue::ldg_sts(rank, dst, src, dim);\n    }\n  }\n  __pipeline_commit();\n\n  // Step 4 for key (GROUP_SIZE - 2).\n  {\n    auto occupy_result_wb = g.shfl(occupy_result, GROUP_SIZE - 2);\n    if (occupy_result_wb != OccupyResult::ILLEGAL &&\n        occupy_result_wb != OccupyResult::REFUSED) {\n      VecV* src = SMM::values_buffer(smem, groupID, same_buf(GROUP_SIZE), dim);\n      auto key_pos_wb = g.shfl(key_pos, GROUP_SIZE - 2);\n      auto target_bucket_wb = g.shfl(target_bucket, GROUP_SIZE - 2);\n      VecV* dst;\n      if (target_bucket_wb == 1) {\n        VecV** sm_bucket_values_ptr = SMM::bucket_values_ptr(smem);\n        dst = sm_bucket_values_ptr[groupID * GROUP_SIZE + GROUP_SIZE - 2] +\n              key_pos_wb * dim;\n      } else {\n        auto bv2 = g.shfl(bucket_values_ptr2, GROUP_SIZE - 2);\n        dst = bv2 + 
key_pos_wb * dim;\n      }\n      __pipeline_wait_prior(1);\n      CopyValue::lds_stg(rank, dst, src, dim);\n      if (rank == GROUP_SIZE - 2) {\n        K* target_keys_ptr =\n            (target_bucket == 1) ? bucket_keys_ptr1 : bucket_keys_ptr2;\n        auto key_address = BUCKET::keys(target_keys_ptr, key_pos);\n        key_address->store(key, cuda::std::memory_order_release);\n      }\n    }\n  }\n\n  // Step 4 for last key (GROUP_SIZE - 1).\n  {\n    auto occupy_result_wb = g.shfl(occupy_result, GROUP_SIZE - 1);\n    if (occupy_result_wb != OccupyResult::ILLEGAL &&\n        occupy_result_wb != OccupyResult::REFUSED) {\n      VecV* src =\n          SMM::values_buffer(smem, groupID, same_buf(GROUP_SIZE + 1), dim);\n      auto key_pos_wb = g.shfl(key_pos, GROUP_SIZE - 1);\n      auto target_bucket_wb = g.shfl(target_bucket, GROUP_SIZE - 1);\n      VecV* dst;\n      if (target_bucket_wb == 1) {\n        VecV** sm_bucket_values_ptr = SMM::bucket_values_ptr(smem);\n        dst = sm_bucket_values_ptr[groupID * GROUP_SIZE + GROUP_SIZE - 1] +\n              key_pos_wb * dim;\n      } else {\n        auto bv2 = g.shfl(bucket_values_ptr2, GROUP_SIZE - 1);\n        dst = bv2 + key_pos_wb * dim;\n      }\n      __pipeline_wait_prior(0);\n      CopyValue::lds_stg(rank, dst, src, dim);\n      if (rank == GROUP_SIZE - 1) {\n        K* target_keys_ptr =\n            (target_bucket == 1) ? 
bucket_keys_ptr1 : bucket_keys_ptr2;\n        auto key_address = BUCKET::keys(target_keys_ptr, key_pos);\n        key_address->store(key, cuda::std::memory_order_release);\n      }\n    }\n  }\n}\n\n// --- Kernel Launcher ---\n\ntemplate <typename K, typename V, typename S, typename VecV, int Strategy>\nstruct Launch_DualBucket_Pipeline_Upsert {\n  using Params = Params_Upsert<K, V, S>;\n  inline static void launch_kernel(Params& params, cudaStream_t& stream) {\n    constexpr int BLOCK_SIZE = 128;\n    constexpr uint32_t GROUP_SIZE = 32;\n    constexpr uint32_t BUCKET_SIZE = 128;\n    using SMM = SharedMemoryManager_Pipeline_Upsert<K, V, S, VecV, BLOCK_SIZE,\n                                                    GROUP_SIZE, BUCKET_SIZE>;\n\n    params.dim = params.dim * sizeof(V) / sizeof(VecV);\n    uint32_t shared_mem = SMM::total_size(params.dim);\n    shared_mem =\n        (shared_mem + sizeof(byte16) - 1) / sizeof(byte16) * sizeof(byte16);\n    dual_bucket_pipeline_upsert_kernel_with_io<K, V, S, VecV, BLOCK_SIZE,\n                                               Strategy>\n        <<<(params.n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, shared_mem,\n           stream>>>(params.buckets, params.buckets_size, params.buckets_num,\n                     params.dim, params.keys,\n                     reinterpret_cast<const VecV*>(params.values),\n                     params.scores, params.n, params.global_epoch);\n  }\n};\n\n// --- Kernel Selector ---\n\ntemplate <typename K, typename V, typename S, int Strategy, typename ArchTag>\nstruct KernelSelector_DualBucketUpsert {\n  using Params = Params_Upsert<K, V, S>;\n\n  static void select_kernel(Params& params, cudaStream_t& stream) {\n    const uint32_t total_value_size =\n        static_cast<uint32_t>(params.dim * sizeof(V));\n\n    // Dual-bucket always uses pipeline kernel (optimized for bucket_size=128).\n    if (total_value_size % sizeof(byte16) == 0) {\n      using VecV = byte16;\n      
Launch_DualBucket_Pipeline_Upsert<K, V, S, VecV, Strategy>::launch_kernel(\n          params, stream);\n    } else if (total_value_size % sizeof(byte8) == 0) {\n      using VecV = byte8;\n      Launch_DualBucket_Pipeline_Upsert<K, V, S, VecV, Strategy>::launch_kernel(\n          params, stream);\n    } else if (total_value_size % sizeof(byte4) == 0) {\n      using VecV = byte4;\n      Launch_DualBucket_Pipeline_Upsert<K, V, S, VecV, Strategy>::launch_kernel(\n          params, stream);\n    } else if (total_value_size % sizeof(byte2) == 0) {\n      using VecV = byte2;\n      Launch_DualBucket_Pipeline_Upsert<K, V, S, VecV, Strategy>::launch_kernel(\n          params, stream);\n    } else {\n      using VecV = byte;\n      Launch_DualBucket_Pipeline_Upsert<K, V, S, VecV, Strategy>::launch_kernel(\n          params, stream);\n    }\n  }\n};\n\n}  // namespace merlin\n}  // namespace nv\n"
  },
  {
    "path": "include/merlin/core_kernels/dual_bucket_utils.cuh",
    "content": "/*\n * Copyright (c) 2024, NVIDIA CORPORATION.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n#pragma once\n\n#include \"kernel_utils.cuh\"\n\nnamespace nv {\nnamespace merlin {\n\n/**\n * Core dual-bucket index computation from a pre-computed hash.\n * b1 = low 32 bits mod buckets_num, b2 = high 32 bits mod buckets_num.\n * Guarantees b2 != b1 by advancing b2 on collision.\n *\n * This is the single source of truth for dual-bucket addressing.\n * All kernels (upsert, lookup, etc.) must use this function.\n */\ntemplate <class K>\n__device__ __forceinline__ void get_dual_bucket_indices(\n    const K hashed_key, const size_t buckets_num, size_t& bkt_idx1,\n    size_t& bkt_idx2) {\n  const uint32_t lo = static_cast<uint32_t>(hashed_key);\n  const uint32_t hi =\n      static_cast<uint32_t>(static_cast<uint64_t>(hashed_key) >> 32);\n\n  bkt_idx1 = lo % buckets_num;\n  bkt_idx2 = hi % buckets_num;\n  if (bkt_idx2 == bkt_idx1) {\n    bkt_idx2 = (bkt_idx2 + 1) % buckets_num;\n  }\n}\n\n/**\n * Digest functions for dual-bucket mode.\n *\n * Dual-bucket digests use bits [56:63] (highest 8 bits) of the Murmur3 hash,\n * whereas single-bucket digests use bits [32:39].  The different bit range\n * avoids collision with the b2 bucket address, which is derived from the high\n * 32 bits (bits [32:63]).  
Using [56:63] ensures that two keys mapping to the\n * same b2 bucket can still have distinct digests.\n *\n * INVARIANT: `dual_bucket_empty_digest()` must ALWAYS return the true\n * hash-derived value for EMPTY_KEY.  Kernels rely on this sentinel to\n * distinguish empty slots from occupied ones during the SIMD scan pass.\n * Returning a constant would cause every occupied slot to match the empty\n * digest, breaking the probing logic.\n */\n\n// Target digest for a given key (bits [56:63] of Murmur3 hash).\ntemplate <class K>\n__device__ __forceinline__ D get_dual_bucket_digest(const K& key) {\n  const K hashed_key = Murmur3HashDevice(key);\n  return static_cast<D>(static_cast<uint64_t>(hashed_key) >> 56);\n}\n\n// Target digest from a pre-computed hash.\ntemplate <class K>\n__device__ __forceinline__ D\nget_dual_bucket_digest_from_hash(const K& hashed_key) {\n  return static_cast<D>(static_cast<uint64_t>(hashed_key) >> 56);\n}\n\n// Pack dual-bucket digest into all 4 bytes for SIMD `__vcmpeq4` comparison.\ntemplate <class K>\n__device__ __forceinline__ VecD_Comp\ndual_bucket_digests_from_hashed(const K& hashed_key) {\n  D digest = static_cast<D>(static_cast<uint64_t>(hashed_key) >> 56);\n  return static_cast<VecD_Comp>(__byte_perm(digest, digest, 0x0000));\n}\n\n// Sentinel digest for EMPTY_KEY in dual-bucket mode — must always use real\n// hash value (bits [56:63]).\ntemplate <class K>\n__device__ __forceinline__ D dual_bucket_empty_digest() {\n  const K hashed_key = Murmur3HashDevice(static_cast<K>(EMPTY_KEY));\n  return static_cast<D>(static_cast<uint64_t>(hashed_key) >> 56);\n}\n\n// Pack empty-key digest into all 4 bytes for SIMD comparison.\ntemplate <class K>\n__device__ __forceinline__ VecD_Comp dual_bucket_empty_digests() {\n  D digest = dual_bucket_empty_digest<K>();\n  return static_cast<VecD_Comp>(__byte_perm(digest, digest, 0x0000));\n}\n\n}  // namespace merlin\n}  // namespace nv\n"
  },
  {
    "path": "include/merlin/core_kernels/find_or_insert.cuh",
    "content": "/*\n * Copyright (c) 2023, NVIDIA CORPORATION.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http:///www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n#pragma once\n\n#include \"kernel_utils.cuh\"\n\nnamespace nv {\nnamespace merlin {\n\n// Use 1 thread to deal with a KV-pair, including copying value.\ntemplate <typename K = uint64_t, typename V = byte4, typename S = uint64_t,\n          typename VecV = byte16, uint32_t BLOCK_SIZE = 128, int Strategy = -1>\n__global__ void tlp_v1_find_or_insert_kernel_with_io(\n    Bucket<K, V, S>* __restrict__ buckets, int32_t* __restrict__ buckets_size,\n    const uint64_t buckets_num, uint32_t bucket_capacity, const uint32_t dim,\n    const K* __restrict__ keys, VecV* __restrict__ values,\n    S* __restrict__ scores, uint64_t n, const S global_epoch) {\n  using BUCKET = Bucket<K, V, S>;\n  using CopyValue = CopyValueMultipleGroup<VecV, 1>;\n  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;\n\n  // bucket_capacity is a multiple of 4.\n  constexpr uint32_t STRIDE_S = 4;\n  constexpr uint32_t Load_LEN_S = sizeof(byte16) / sizeof(S);\n  __shared__ S sm_bucket_scores[BLOCK_SIZE][2 * STRIDE_S];\n\n  uint32_t tx = threadIdx.x;\n  uint32_t kv_idx = blockIdx.x * blockDim.x + tx;\n  K key{static_cast<K>(EMPTY_KEY)};\n  S score{static_cast<S>(EMPTY_SCORE)};\n  OccupyResult occupy_result{OccupyResult::INITIAL};\n  VecD_Comp target_digests{0};\n  VecV* bucket_values_ptr{nullptr};\n  K* bucket_keys_ptr{nullptr};\n  int32_t* bucket_size_ptr{nullptr};\n 
 uint32_t key_pos = {0};\n  uint32_t bucket_size{0};\n  if (kv_idx < n) {\n    key = keys[kv_idx];\n    score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);\n\n    if (!IS_RESERVED_KEY<K>(key)) {\n      const K hashed_key = Murmur3HashDevice(key);\n      target_digests = digests_from_hashed<K>(hashed_key);\n      uint64_t global_idx =\n          static_cast<uint64_t>(hashed_key % (buckets_num * bucket_capacity));\n      key_pos = get_start_position(global_idx, bucket_capacity);\n      uint64_t bkt_idx = global_idx / bucket_capacity;\n      bucket_size_ptr = buckets_size + bkt_idx;\n      BUCKET* bucket = buckets + bkt_idx;\n      bucket_size = *bucket_size_ptr;\n      bucket_keys_ptr = reinterpret_cast<K*>(bucket->keys(0));\n      bucket_values_ptr = reinterpret_cast<VecV*>(bucket->vectors);\n    } else {\n      return;\n    }\n  } else {\n    return;\n  }\n\n  // Load `STRIDE` digests every time.\n  constexpr uint32_t STRIDE = sizeof(VecD_Load) / sizeof(D);\n  // One more loop to handle empty keys.\n  for (int offset = 0; offset < bucket_capacity + STRIDE; offset += STRIDE) {\n    if (occupy_result != OccupyResult::INITIAL) break;\n\n    uint32_t pos_cur = align_to<STRIDE>(key_pos);\n    pos_cur = (pos_cur + offset) & (bucket_capacity - 1);\n\n    D* digests_ptr = BUCKET::digests(bucket_keys_ptr, bucket_capacity, pos_cur);\n    VecD_Load digests_vec = *(reinterpret_cast<VecD_Load*>(digests_ptr));\n    VecD_Comp digests_arr[4] = {digests_vec.x, digests_vec.y, digests_vec.z,\n                                digests_vec.w};\n\n    for (int i = 0; i < 4; i++) {\n      VecD_Comp probe_digests = digests_arr[i];\n      uint32_t possible_pos = 0;\n      bool result = false;\n      // Perform a vectorized comparison by byte,\n      // and if they are equal, set the corresponding byte in the result to\n      // 0xff.\n      int cmp_result = __vcmpeq4(probe_digests, target_digests);\n      cmp_result &= 0x01010101;\n      do {\n        if (cmp_result == 
0) break;\n        // CUDA uses little endian,\n        // and the lowest byte in a register is stored at the lowest address.\n        uint32_t index = (__ffs(cmp_result) - 1) >> 3;\n        cmp_result &= (cmp_result - 1);\n        possible_pos = pos_cur + i * 4 + index;\n        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);\n        K expected_key = key;\n        // Bucket modifications cannot be reordered before this instruction.\n        result = current_key->compare_exchange_strong(\n            expected_key, static_cast<K>(LOCKED_KEY),\n            cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);\n      } while (!result);\n      if (result) {\n        occupy_result = OccupyResult::DUPLICATE;\n        key_pos = possible_pos;\n        ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores,\n                                         kv_idx, score, bucket_capacity,\n                                         get_digest<K>(key), false);\n        break;\n      } else if (bucket_size == bucket_capacity) {\n        continue;\n      }\n      VecD_Comp empty_digests_ = empty_digests<K>();\n      cmp_result = __vcmpeq4(probe_digests, empty_digests_);\n      cmp_result &= 0x01010101;\n      do {\n        if (cmp_result == 0) break;\n        uint32_t index = (__ffs(cmp_result) - 1) >> 3;\n        cmp_result &= (cmp_result - 1);\n        possible_pos = pos_cur + i * 4 + index;\n        if (offset == 0 && possible_pos < key_pos) continue;\n        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);\n        K expected_key = static_cast<K>(EMPTY_KEY);\n        result = current_key->compare_exchange_strong(\n            expected_key, static_cast<K>(LOCKED_KEY),\n            cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);\n      } while (!result);\n      if (result) {\n        occupy_result = OccupyResult::OCCUPIED_EMPTY;\n        key_pos = possible_pos;\n        ScoreFunctor::update_with_digest(bucket_keys_ptr, 
key_pos, scores,\n                                         kv_idx, score, bucket_capacity,\n                                         get_digest<K>(key), true);\n\n        atomicAdd(bucket_size_ptr, 1);\n        break;\n      }\n    }\n  }\n  while (occupy_result == OccupyResult::INITIAL) {\n    S* bucket_scores_ptr = BUCKET::scores(bucket_keys_ptr, bucket_capacity, 0);\n    S min_score = static_cast<S>(MAX_SCORE);\n    int min_pos = -1;\n#pragma unroll\n    for (int j = 0; j < STRIDE_S; j += Load_LEN_S) {\n      __pipeline_memcpy_async(sm_bucket_scores[tx] + j, bucket_scores_ptr + j,\n                              sizeof(S) * Load_LEN_S);\n    }\n    __pipeline_commit();\n    for (int i = 0; i < bucket_capacity; i += STRIDE_S) {\n      if (i < bucket_capacity - STRIDE_S) {\n#pragma unroll\n        for (int j = 0; j < STRIDE_S; j += Load_LEN_S) {\n          __pipeline_memcpy_async(\n              sm_bucket_scores[tx] + diff_buf(i / STRIDE_S) * STRIDE_S + j,\n              bucket_scores_ptr + i + STRIDE_S + j, sizeof(S) * Load_LEN_S);\n        }\n      }\n      __pipeline_commit();\n      __pipeline_wait_prior(1);\n      S temp_scores[Load_LEN_S];\n      S* src = sm_bucket_scores[tx] + same_buf(i / STRIDE_S) * STRIDE_S;\n#pragma unroll\n      for (int k = 0; k < STRIDE_S; k += Load_LEN_S) {\n        *reinterpret_cast<byte16*>(temp_scores) =\n            *reinterpret_cast<byte16*>(src + k);\n#pragma unroll\n        for (int j = 0; j < Load_LEN_S; j += 1) {\n          S temp_score = temp_scores[j];\n          if (temp_score < min_score) {\n            min_score = temp_score;\n            min_pos = i + k + j;\n          }\n        }\n      }\n    }\n    score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);\n    if (score < min_score) {\n      occupy_result = OccupyResult::REFUSED;\n      break;\n    }\n    auto min_score_key = BUCKET::keys(bucket_keys_ptr, min_pos);\n    auto expected_key = min_score_key->load(cuda::std::memory_order_relaxed);\n    if 
(expected_key != static_cast<K>(LOCKED_KEY) &&\n        expected_key != static_cast<K>(EMPTY_KEY)) {\n      bool result = min_score_key->compare_exchange_strong(\n          expected_key, static_cast<K>(LOCKED_KEY),\n          cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);\n      if (result) {\n        S* min_score_ptr =\n            BUCKET::scores(bucket_keys_ptr, bucket_capacity, min_pos);\n        auto verify_score_ptr =\n            reinterpret_cast<AtomicScore<S>*>(min_score_ptr);\n        auto verify_score =\n            verify_score_ptr->load(cuda::std::memory_order_relaxed);\n        if (verify_score <= min_score) {\n          key_pos = min_pos;\n          ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores,\n                                           kv_idx, score, bucket_capacity,\n                                           get_digest<K>(key), true);\n\n          if (expected_key == static_cast<K>(RECLAIM_KEY)) {\n            occupy_result = OccupyResult::OCCUPIED_RECLAIMED;\n            atomicAdd(bucket_size_ptr, 1);\n          } else {\n            occupy_result = OccupyResult::EVICT;\n          }\n\n        } else {\n          min_score_key->store(expected_key, cuda::std::memory_order_release);\n        }\n      }\n    }\n  }\n  VecV* bucket_value_ptr = bucket_values_ptr + key_pos * dim;\n  VecV* param_value_ptr = values + kv_idx * dim;\n\n  if (occupy_result != OccupyResult::REFUSED) {\n    if (occupy_result == OccupyResult::DUPLICATE) {\n      CopyValue::ldg_stg(0, param_value_ptr, bucket_value_ptr, dim);\n    } else {\n      CopyValue::ldg_stg(0, bucket_value_ptr, param_value_ptr, dim);\n    }\n    auto key_address = BUCKET::keys(bucket_keys_ptr, key_pos);\n    // memory_order_release:\n    // Bucket modifications cannot be reordered after this instruction.\n    key_address->store(key, cuda::std::memory_order_release);\n  }\n}\n\n// Use 1 thread to deal with a KV-pair, including copying value.\ntemplate <typename K = 
uint64_t, typename V = byte4, typename S = uint64_t,\n          typename VecV = byte16, uint32_t BLOCK_SIZE = 128,\n          uint32_t GROUP_SIZE = 16, int Strategy = -1>\n__global__ void tlp_v2_find_or_insert_kernel_with_io(\n    Bucket<K, V, S>* __restrict__ buckets, int32_t* __restrict__ buckets_size,\n    const uint64_t buckets_num, uint32_t bucket_capacity, const uint32_t dim,\n    const K* __restrict__ keys, VecV* __restrict__ values,\n    S* __restrict__ scores, uint64_t n, const S global_epoch) {\n  using BUCKET = Bucket<K, V, S>;\n  using CopyValue = CopyValueMultipleGroup<VecV, GROUP_SIZE>;\n  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;\n\n  auto g = cg::tiled_partition<GROUP_SIZE>(cg::this_thread_block());\n\n  // bucket_capacity is a multiple of 4.\n  constexpr uint32_t STRIDE_S = 4;\n  constexpr uint32_t Load_LEN_S = sizeof(byte16) / sizeof(S);\n  __shared__ S sm_bucket_scores[BLOCK_SIZE][2 * STRIDE_S];\n\n  uint32_t tx = threadIdx.x;\n  uint32_t kv_idx = blockIdx.x * blockDim.x + tx;\n  K key{static_cast<K>(EMPTY_KEY)};\n  S score{static_cast<S>(EMPTY_SCORE)};\n  OccupyResult occupy_result{OccupyResult::INITIAL};\n  VecD_Comp target_digests{0};\n  VecV* bucket_values_ptr{nullptr};\n  K* bucket_keys_ptr{nullptr};\n  int32_t* bucket_size_ptr{nullptr};\n  uint32_t key_pos = {0};\n  uint32_t bucket_size{0};\n  if (kv_idx < n) {\n    key = keys[kv_idx];\n    score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);\n\n    if (!IS_RESERVED_KEY<K>(key)) {\n      const K hashed_key = Murmur3HashDevice(key);\n      target_digests = digests_from_hashed<K>(hashed_key);\n      uint64_t global_idx =\n          static_cast<uint64_t>(hashed_key % (buckets_num * bucket_capacity));\n      key_pos = get_start_position(global_idx, bucket_capacity);\n      uint64_t bkt_idx = global_idx / bucket_capacity;\n      bucket_size_ptr = buckets_size + bkt_idx;\n      BUCKET* bucket = buckets + bkt_idx;\n      bucket_size = *bucket_size_ptr;\n      
bucket_keys_ptr = reinterpret_cast<K*>(bucket->keys(0));\n      bucket_values_ptr = reinterpret_cast<VecV*>(bucket->vectors);\n    } else {\n      occupy_result = OccupyResult::ILLEGAL;\n    }\n  } else {\n    occupy_result = OccupyResult::ILLEGAL;\n  }\n\n  // Load `STRIDE` digests every time.\n  constexpr uint32_t STRIDE = sizeof(VecD_Load) / sizeof(D);\n  // One more loop to handle empty keys.\n  for (int offset = 0; offset < bucket_capacity + STRIDE; offset += STRIDE) {\n    if (occupy_result != OccupyResult::INITIAL) break;\n\n    uint32_t pos_cur = align_to<STRIDE>(key_pos);\n    pos_cur = (pos_cur + offset) & (bucket_capacity - 1);\n\n    D* digests_ptr = BUCKET::digests(bucket_keys_ptr, bucket_capacity, pos_cur);\n    VecD_Load digests_vec = *(reinterpret_cast<VecD_Load*>(digests_ptr));\n    VecD_Comp digests_arr[4] = {digests_vec.x, digests_vec.y, digests_vec.z,\n                                digests_vec.w};\n\n    for (int i = 0; i < 4; i++) {\n      VecD_Comp probe_digests = digests_arr[i];\n      uint32_t possible_pos = 0;\n      bool result = false;\n      // Perform a vectorized comparison by byte,\n      // and if they are equal, set the corresponding byte in the result to\n      // 0xff.\n      int cmp_result = __vcmpeq4(probe_digests, target_digests);\n      cmp_result &= 0x01010101;\n      do {\n        if (cmp_result == 0) break;\n        // CUDA uses little endian,\n        // and the lowest byte in a register is stored at the lowest address.\n        uint32_t index = (__ffs(cmp_result) - 1) >> 3;\n        cmp_result &= (cmp_result - 1);\n        possible_pos = pos_cur + i * 4 + index;\n        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);\n        K expected_key = key;\n        // Bucket modifications cannot be reordered before this instruction.\n        result = current_key->compare_exchange_strong(\n            expected_key, static_cast<K>(LOCKED_KEY),\n            cuda::std::memory_order_acquire, 
cuda::std::memory_order_relaxed);\n      } while (!result);\n      if (result) {\n        occupy_result = OccupyResult::DUPLICATE;\n        key_pos = possible_pos;\n        ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores,\n                                         kv_idx, score, bucket_capacity,\n                                         get_digest<K>(key), false);\n        break;\n      } else if (bucket_size == bucket_capacity) {\n        continue;\n      }\n      VecD_Comp empty_digests_ = empty_digests<K>();\n      cmp_result = __vcmpeq4(probe_digests, empty_digests_);\n      cmp_result &= 0x01010101;\n      do {\n        if (cmp_result == 0) break;\n        uint32_t index = (__ffs(cmp_result) - 1) >> 3;\n        cmp_result &= (cmp_result - 1);\n        possible_pos = pos_cur + i * 4 + index;\n        if (offset == 0 && possible_pos < key_pos) continue;\n        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);\n        K expected_key = static_cast<K>(EMPTY_KEY);\n        result = current_key->compare_exchange_strong(\n            expected_key, static_cast<K>(LOCKED_KEY),\n            cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);\n      } while (!result);\n      if (result) {\n        occupy_result = OccupyResult::OCCUPIED_EMPTY;\n        key_pos = possible_pos;\n        ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores,\n                                         kv_idx, score, bucket_capacity,\n                                         get_digest<K>(key), true);\n        atomicAdd(bucket_size_ptr, 1);\n        break;\n      }\n    }\n  }\n  while (occupy_result == OccupyResult::INITIAL) {\n    S* bucket_scores_ptr = BUCKET::scores(bucket_keys_ptr, bucket_capacity, 0);\n    S min_score = static_cast<S>(MAX_SCORE);\n    int min_pos = -1;\n#pragma unroll\n    for (int j = 0; j < STRIDE_S; j += Load_LEN_S) {\n      __pipeline_memcpy_async(sm_bucket_scores[tx] + j, bucket_scores_ptr + j,\n             
                 sizeof(S) * Load_LEN_S);\n    }\n    __pipeline_commit();\n    for (int i = 0; i < bucket_capacity; i += STRIDE_S) {\n      if (i < bucket_capacity - STRIDE_S) {\n#pragma unroll\n        for (int j = 0; j < STRIDE_S; j += Load_LEN_S) {\n          __pipeline_memcpy_async(\n              sm_bucket_scores[tx] + diff_buf(i / STRIDE_S) * STRIDE_S + j,\n              bucket_scores_ptr + i + STRIDE_S + j, sizeof(S) * Load_LEN_S);\n        }\n      }\n      __pipeline_commit();\n      __pipeline_wait_prior(1);\n      S temp_scores[Load_LEN_S];\n      S* src = sm_bucket_scores[tx] + same_buf(i / STRIDE_S) * STRIDE_S;\n#pragma unroll\n      for (int k = 0; k < STRIDE_S; k += Load_LEN_S) {\n        *reinterpret_cast<byte16*>(temp_scores) =\n            *reinterpret_cast<byte16*>(src + k);\n#pragma unroll\n        for (int j = 0; j < Load_LEN_S; j += 1) {\n          S temp_score = temp_scores[j];\n          if (temp_score < min_score) {\n            auto verify_key_ptr = BUCKET::keys(bucket_keys_ptr, i + k + j);\n            auto verify_key =\n                verify_key_ptr->load(cuda::std::memory_order_relaxed);\n            if (verify_key != static_cast<K>(LOCKED_KEY) &&\n                verify_key != static_cast<K>(EMPTY_KEY)) {\n              min_score = temp_score;\n              min_pos = i + k + j;\n            }\n          }\n        }\n      }\n    }\n\n    score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);\n    if (score < min_score) {\n      occupy_result = OccupyResult::REFUSED;\n      break;\n    }\n    auto min_score_key = BUCKET::keys(bucket_keys_ptr, min_pos);\n    auto expected_key = min_score_key->load(cuda::std::memory_order_relaxed);\n    if (expected_key != static_cast<K>(LOCKED_KEY) &&\n        expected_key != static_cast<K>(EMPTY_KEY)) {\n      bool result = min_score_key->compare_exchange_strong(\n          expected_key, static_cast<K>(LOCKED_KEY),\n          cuda::std::memory_order_acquire, 
cuda::std::memory_order_relaxed);\n      if (result) {\n        S* min_score_ptr =\n            BUCKET::scores(bucket_keys_ptr, bucket_capacity, min_pos);\n        auto verify_score_ptr =\n            reinterpret_cast<AtomicScore<S>*>(min_score_ptr);\n        auto verify_score =\n            verify_score_ptr->load(cuda::std::memory_order_relaxed);\n        if (verify_score <= min_score) {\n          key_pos = min_pos;\n          ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores,\n                                           kv_idx, score, bucket_capacity,\n                                           get_digest<K>(key), true);\n\n          if (expected_key == static_cast<K>(RECLAIM_KEY)) {\n            occupy_result = OccupyResult::OCCUPIED_RECLAIMED;\n            atomicAdd(bucket_size_ptr, 1);\n          } else {\n            occupy_result = OccupyResult::EVICT;\n          }\n        } else {\n          min_score_key->store(expected_key, cuda::std::memory_order_release);\n        }\n      }\n    }\n  }\n\n  VecV* bucket_value_ptr{nullptr};\n  if ((occupy_result != OccupyResult::ILLEGAL) &&\n      (occupy_result != OccupyResult::REFUSED)) {\n    bucket_value_ptr = bucket_values_ptr + key_pos * dim;\n  }\n  __syncthreads();\n  uint32_t rank = g.thread_rank();\n  uint32_t groupID = threadIdx.x / GROUP_SIZE;\n\n  // Shared memory reuse:\n  // __shared__ S sm_bucket_scores[BLOCK_SIZE][2 * STRIDE_S];\n  // __shared__ VecV sm_values_buffer[GROUP_NUM][2][GROUP_BUF];\n  // assert(GROUP_BUF >= 2 * dim);\n  constexpr uint32_t GROUP_BUFs =\n      GROUP_SIZE * 2 * STRIDE_S * sizeof(S) / sizeof(VecV);\n  constexpr uint32_t GROUP_BUF = GROUP_BUFs / 2;\n  auto sm_values_buffer =\n      reinterpret_cast<VecV*>(&(sm_bucket_scores[0][0])) + groupID * GROUP_BUFs;\n\n  auto occupy_result_next = g.shfl(occupy_result, 0);\n  if ((occupy_result_next != OccupyResult::ILLEGAL) &&\n      (occupy_result_next != OccupyResult::REFUSED)) {\n    VecV* dst = sm_values_buffer;\n    if 
(occupy_result_next == OccupyResult::DUPLICATE) {\n      const VecV* src = g.shfl(bucket_value_ptr, 0);\n      CopyValue::ldg_sts(rank, dst, src, dim);\n    } else {\n      auto kv_idx_next = g.shfl(kv_idx, 0);\n      const VecV* src = values + kv_idx_next * dim;\n      CopyValue::ldg_sts(rank, dst, src, dim);\n    }\n  }\n  __pipeline_commit();\n\n  for (int i = 0; i < GROUP_SIZE; i++) {\n    if (i + 1 < GROUP_SIZE) {\n      auto occupy_result_next = g.shfl(occupy_result, i + 1);\n      if ((occupy_result_next != OccupyResult::ILLEGAL) &&\n          (occupy_result_next != OccupyResult::REFUSED)) {\n        VecV* dst = sm_values_buffer + diff_buf(i) * GROUP_BUF;\n        if (occupy_result_next == OccupyResult::DUPLICATE) {\n          const VecV* src = g.shfl(bucket_value_ptr, i + 1);\n          CopyValue::ldg_sts(rank, dst, src, dim);\n        } else {\n          auto kv_idx_next = g.shfl(kv_idx, i + 1);\n          const VecV* src = values + kv_idx_next * dim;\n          CopyValue::ldg_sts(rank, dst, src, dim);\n        }\n      }\n    }\n    __pipeline_commit();\n    auto occupy_result_cur = g.shfl(occupy_result, i);\n    if ((occupy_result_cur != OccupyResult::ILLEGAL) &&\n        (occupy_result_cur != OccupyResult::REFUSED)) {\n      VecV* src = sm_values_buffer + same_buf(i) * GROUP_BUF;\n      __pipeline_wait_prior(0);\n      if (occupy_result_cur == OccupyResult::DUPLICATE) {\n        auto kv_idx_cur = g.shfl(kv_idx, i);\n        VecV* dst = values + kv_idx_cur * dim;\n        __pipeline_wait_prior(1);\n        CopyValue::lds_stg(rank, dst, src, dim);\n      } else {\n        VecV* dst = g.shfl(bucket_value_ptr, i);\n        __pipeline_wait_prior(1);\n        CopyValue::lds_stg(rank, dst, src, dim);\n      }\n    }\n  }\n\n  if ((occupy_result != OccupyResult::ILLEGAL) &&\n      (occupy_result != OccupyResult::REFUSED)) {\n    auto key_address = BUCKET::keys(bucket_keys_ptr, key_pos);\n    // memory_order_release:\n    // Modifications to the bucket will not 
after this instruction.\n    key_address->store(key, cuda::std::memory_order_release);\n  }\n}\n\ntemplate <\n    typename K, typename V, typename S, typename VecV, uint32_t BLOCK_SIZE,\n    uint32_t GROUP_SIZE, uint32_t BUCKET_SIZE,\n    uint32_t GROUP_NUM = BLOCK_SIZE / GROUP_SIZE, uint32_t OFST_ParamScores = 0,\n    uint32_t OFST_BucketValuesPtr = OFST_ParamScores + sizeof(S) * BLOCK_SIZE,\n    uint32_t OFST_BucketsSizePtr =\n        OFST_BucketValuesPtr + sizeof(VecV*) * BLOCK_SIZE,\n    uint32_t OFST_BucketDigests =\n        OFST_BucketsSizePtr + sizeof(int*) * BLOCK_SIZE,\n    uint32_t OFST_BucketScores =\n        OFST_BucketDigests + sizeof(D) * GROUP_NUM * 2 * BUCKET_SIZE,\n    uint32_t OFST_BucketValues =\n        OFST_BucketScores + sizeof(S) * GROUP_NUM * 2 * BUCKET_SIZE>\nstruct SharedMemoryManager_Pipeline_FindOrInsert {\n  /*\n    __shared__ S sm_param_scores[BLOCK_SIZE];\n    __shared__ VecV* sm_bucket_values_ptr[BLOCK_SIZE];\n    __shared__ int* sm_buckets_size_ptr[BLOCK_SIZE];\n    __shared__ D sm_bucket_digests[GROUP_NUM][2][BUCKET_SIZE];\n    __shared__ S sm_bucket_scores[GROUP_NUM][2][BUCKET_SIZE];\n    __shared__ VecV sm_values_buffer[GROUP_NUM][2][dim];\n  */\n  static inline uint32_t total_size(uint32_t dim) {\n    return BLOCK_SIZE * (sizeof(S) + sizeof(VecV*) + sizeof(int*)) +\n           GROUP_NUM * 2 *\n               (BUCKET_SIZE * (sizeof(D) + sizeof(S)) + dim * sizeof(VecV));\n  }\n  static __forceinline__ __device__ S* param_scores(byte* smem) {\n    return reinterpret_cast<S*>(smem + OFST_ParamScores);\n  }\n  static __forceinline__ __device__ VecV** bucket_values_ptr(byte* smem) {\n    return reinterpret_cast<VecV**>(smem + OFST_BucketValuesPtr);\n  }\n  static __forceinline__ __device__ int** buckets_size_ptr(byte* smem) {\n    return reinterpret_cast<int**>(smem + OFST_BucketsSizePtr);\n  }\n  static __forceinline__ __device__ D* bucket_digests(byte* smem,\n                                                      uint32_t groupID,\n  
                                                    uint32_t buf) {\n    return reinterpret_cast<D*>(smem + OFST_BucketDigests) +\n           BUCKET_SIZE * (groupID * 2 + buf);\n  }\n  static __forceinline__ __device__ S* bucket_scores(byte* smem,\n                                                     uint32_t groupID,\n                                                     uint32_t buf) {\n    return reinterpret_cast<S*>(smem + OFST_BucketScores) +\n           BUCKET_SIZE * (groupID * 2 + buf);\n  }\n  static __forceinline__ __device__ VecV* values_buffer(byte* smem,\n                                                        uint32_t groupID,\n                                                        uint32_t buf,\n                                                        uint32_t dim) {\n    return reinterpret_cast<VecV*>(smem + OFST_BucketValues) +\n           dim * (groupID * 2 + buf);\n  }\n};\n\ntemplate <typename K = uint64_t, typename V = byte4, typename S = uint64_t,\n          typename VecV = byte16, uint32_t BLOCK_SIZE = 128, int Strategy = -1>\n__global__ void pipeline_find_or_insert_kernel_with_io(\n    Bucket<K, V, S>* __restrict__ buckets, int32_t* __restrict__ buckets_size,\n    const uint64_t buckets_num, const uint32_t dim, const K* __restrict__ keys,\n    VecV* __restrict__ values, S* __restrict__ scores, uint64_t n,\n    const S global_epoch) {\n  // Here, GROUP_SIZE * Comp_LEN = BUCKET_SIZE.\n  constexpr uint32_t BUCKET_SIZE = 128;\n  constexpr uint32_t GROUP_SIZE = 32;\n  constexpr uint32_t Comp_LEN = sizeof(VecD_Comp) / sizeof(D);\n  constexpr uint32_t Load_LEN = sizeof(VecD_Load) / sizeof(D);\n  constexpr uint32_t Load_LEN_S = sizeof(byte16) / sizeof(S);\n\n  using BUCKET = Bucket<K, V, S>;\n  using CopyValue = CopyValueMultipleGroup<VecV, GROUP_SIZE>;\n  using SMM =\n      SharedMemoryManager_Pipeline_FindOrInsert<K, V, S, VecV, BLOCK_SIZE,\n                                                GROUP_SIZE, BUCKET_SIZE>;\n  using ScoreFunctor = 
ScoreFunctor<K, V, S, Strategy>;\n\n  __shared__ extern __align__(alignof(byte16)) byte smem[];\n\n  // Initialization.\n  auto g = cg::tiled_partition<GROUP_SIZE>(cg::this_thread_block());\n  uint32_t tx = threadIdx.x;\n  uint32_t kv_idx = blockIdx.x * blockDim.x + tx;\n  K key{static_cast<K>(EMPTY_KEY)};\n  VecD_Comp target_digests;\n  K* bucket_keys_ptr{nullptr};\n  OccupyResult occupy_result{OccupyResult::INITIAL};\n  uint32_t key_pos = 0;\n  if (kv_idx < n) {\n    key = keys[kv_idx];\n    if (scores != nullptr) {\n      S* sm_param_scores = SMM::param_scores(smem);\n      __pipeline_memcpy_async(sm_param_scores + tx, scores + kv_idx, sizeof(S));\n    }\n    if (!IS_RESERVED_KEY<K>(key)) {\n      const K hashed_key = Murmur3HashDevice(key);\n      target_digests = digests_from_hashed<K>(hashed_key);\n      uint64_t global_idx =\n          static_cast<uint64_t>(hashed_key % (buckets_num * BUCKET_SIZE));\n      uint64_t bkt_idx = global_idx / BUCKET_SIZE;\n      key_pos = get_start_position(global_idx, BUCKET_SIZE);\n      int** sm_buckets_size_ptr = SMM::buckets_size_ptr(smem);\n      sm_buckets_size_ptr[tx] = buckets_size + bkt_idx;\n      BUCKET* bucket = buckets + bkt_idx;\n      bucket_keys_ptr = reinterpret_cast<K*>(bucket->keys(0));\n      VecV** sm_bucket_values_ptr = SMM::bucket_values_ptr(smem);\n      __pipeline_memcpy_async(sm_bucket_values_ptr + tx, &(bucket->vectors),\n                              sizeof(VecV*));\n    } else {\n      occupy_result = OccupyResult::ILLEGAL;\n    }\n  } else {\n    occupy_result = OccupyResult::ILLEGAL;\n  }\n\n  uint32_t rank = g.thread_rank();\n  uint32_t groupID = threadIdx.x / GROUP_SIZE;\n\n  // Pipeline loading.\n  auto occupy_result_next = g.shfl(occupy_result, 0);\n  auto keys_ptr_next = g.shfl(bucket_keys_ptr, 0);\n  if (occupy_result_next == OccupyResult::INITIAL) {\n    D* sm_bucket_digests = SMM::bucket_digests(smem, groupID, 0);\n    D* dst = sm_bucket_digests + rank * Load_LEN;\n    D* src = 
BUCKET::digests(keys_ptr_next, BUCKET_SIZE, rank * Load_LEN);\n    if (rank * Load_LEN < BUCKET_SIZE) {\n      __pipeline_memcpy_async(dst, src, sizeof(VecD_Load));\n    }\n  }\n  __pipeline_commit();\n  // Padding, meet the param of the first `__pipeline_wait_prior`\n  // in the first loop.\n  __pipeline_commit();\n  __pipeline_commit();\n  for (int32_t i = 0; i < GROUP_SIZE; i++) {\n    // Step1: load digests from global memory to shared memory.\n    if (i + 1 < GROUP_SIZE) {\n      auto occupy_result_next = g.shfl(occupy_result, i + 1);\n      auto keys_ptr_next = g.shfl(bucket_keys_ptr, i + 1);\n      if (occupy_result_next == OccupyResult::INITIAL) {\n        D* sm_bucket_digests = SMM::bucket_digests(smem, groupID, diff_buf(i));\n        D* dst = sm_bucket_digests + rank * Load_LEN;\n        D* src = BUCKET::digests(keys_ptr_next, BUCKET_SIZE, rank * Load_LEN);\n        if (rank * Load_LEN < BUCKET_SIZE) {\n          __pipeline_memcpy_async(dst, src, sizeof(VecD_Load));\n        }\n      }\n    }\n    __pipeline_commit();\n    // Step2: to lock the target_key or empty_key by querying digests.\n    auto occupy_result_cur = g.shfl(occupy_result, i);\n    if (occupy_result_cur == OccupyResult::INITIAL) {\n      uint32_t tx_cur = groupID * GROUP_SIZE + i;\n      int** sm_buckets_size_ptr = SMM::buckets_size_ptr(smem);\n      auto bucket_size_ptr = sm_buckets_size_ptr[tx_cur];\n      K key_cur = g.shfl(key, i);\n      auto target_digests_cur = g.shfl(target_digests, i);\n      auto start_pos_cur = g.shfl(key_pos, i);\n      auto keys_ptr_cur = g.shfl(bucket_keys_ptr, i);\n      auto bucket_size_cur = *bucket_size_ptr;\n      __pipeline_wait_prior(3);\n      D* src = SMM::bucket_digests(smem, groupID, same_buf(i));\n      uint32_t start_offset = start_pos_cur / Comp_LEN;\n      uint32_t probe_offset =\n          Comp_LEN * ((start_offset + rank) & (GROUP_SIZE - 1));\n      VecD_Comp probe_digests =\n          *reinterpret_cast<VecD_Comp*>(src + probe_offset);\n     
 uint32_t cmp_result = __vcmpeq4(probe_digests, target_digests_cur);\n      cmp_result &= 0x01010101;\n      uint32_t possible_pos = 0;\n      bool result = false;\n      do {\n        if (cmp_result == 0) break;\n        int32_t index = (__ffs(cmp_result) - 1) >> 3;\n        cmp_result &= (cmp_result - 1);\n        possible_pos = probe_offset + index;\n        auto current_key = BUCKET::keys(keys_ptr_cur, possible_pos);\n        K expected_key = key_cur;\n        result = current_key->compare_exchange_strong(\n            expected_key, static_cast<K>(LOCKED_KEY),\n            cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);\n      } while (!result);\n      uint32_t found_vote = g.ballot(result);\n      if (found_vote) {\n        int32_t src_lane = __ffs(found_vote) - 1;\n        possible_pos = g.shfl(possible_pos, src_lane);\n        if (rank == i) {\n          occupy_result = OccupyResult::DUPLICATE;\n          key_pos = possible_pos;\n          S* sm_param_scores = SMM::param_scores(smem);\n          S score = ScoreFunctor::desired_when_missed(sm_param_scores, tx,\n                                                      global_epoch);\n          ScoreFunctor::update_with_digest(\n              bucket_keys_ptr, key_pos, sm_param_scores, tx, score, BUCKET_SIZE,\n              get_digest<K>(key), false);\n        }\n      } else if (bucket_size_cur < BUCKET_SIZE) {\n        VecD_Comp empty_digests_ = empty_digests<K>();\n        cmp_result = __vcmpeq4(probe_digests, empty_digests_);\n        cmp_result &= 0x01010101;\n        for (int32_t offset = 0; offset < GROUP_SIZE; offset += 1) {\n          if (rank == offset) {\n            do {\n              if (cmp_result == 0) break;\n              int32_t index = (__ffs(cmp_result) - 1) >> 3;\n              cmp_result &= (cmp_result - 1);\n              possible_pos = probe_offset + index;\n              auto current_key = BUCKET::keys(keys_ptr_cur, possible_pos);\n              K expected_key = 
static_cast<K>(EMPTY_KEY);\n              result = current_key->compare_exchange_strong(\n                  expected_key, static_cast<K>(LOCKED_KEY),\n                  cuda::std::memory_order_acquire,\n                  cuda::std::memory_order_relaxed);\n            } while (!result);\n          }\n          uint32_t found_vote = g.ballot(result);\n          if (found_vote) {\n            int32_t src_lane = __ffs(found_vote) - 1;\n            possible_pos = g.shfl(possible_pos, src_lane);\n            if (rank == i) {\n              occupy_result = OccupyResult::OCCUPIED_EMPTY;\n              S* sm_param_scores = SMM::param_scores(smem);\n              S score = ScoreFunctor::desired_when_missed(sm_param_scores, tx,\n                                                          global_epoch);\n              key_pos = possible_pos;\n              ScoreFunctor::update_with_digest(\n                  bucket_keys_ptr, key_pos, sm_param_scores, tx, score,\n                  BUCKET_SIZE, get_digest<K>(key), true);\n              atomicAdd(bucket_size_ptr, 1);\n            }\n            break;\n          }\n        }\n      }\n      occupy_result_cur = g.shfl(occupy_result, i);\n      if (occupy_result_cur == OccupyResult::INITIAL) {\n        S* sm_bucket_scores = SMM::bucket_scores(smem, groupID, same_buf(i));\n        S* dst = sm_bucket_scores + rank * Load_LEN_S;\n        S* src = BUCKET::scores(keys_ptr_cur, BUCKET_SIZE, rank * Load_LEN_S);\n#pragma unroll\n        for (int32_t k = 0; k < BUCKET_SIZE; k += GROUP_SIZE * Load_LEN_S) {\n          __pipeline_memcpy_async(dst + k, src + k, sizeof(S) * Load_LEN_S);\n        }\n      }\n    }\n    __pipeline_commit();\n    // Step 3: reduce to get the key with the minimum score.\n    if (i > 0) {\n      occupy_result_cur = g.shfl(occupy_result, i - 1);\n      uint32_t tx_cur = groupID * GROUP_SIZE + i - 1;\n      S* sm_param_scores = SMM::param_scores(smem);\n      S score_cur = 
ScoreFunctor::desired_when_missed(sm_param_scores, tx_cur,\n                                                      global_epoch);\n\n      int** sm_buckets_size_ptr = SMM::buckets_size_ptr(smem);\n      auto bucket_size_ptr = sm_buckets_size_ptr[tx_cur];\n      __pipeline_wait_prior(3);\n      S* src = SMM::bucket_scores(smem, groupID, diff_buf(i));\n      while (occupy_result_cur == OccupyResult::INITIAL) {\n        int min_pos_local = -1;\n        S min_score_local = static_cast<S>(MAX_SCORE);\n#pragma unroll\n        for (int j = 0; j < BUCKET_SIZE; j += GROUP_SIZE * Load_LEN_S) {\n          S temp_scores[Load_LEN_S];\n          *reinterpret_cast<byte16*>(temp_scores) =\n              *reinterpret_cast<byte16*>(src + rank * Load_LEN_S + j);\n#pragma unroll\n          for (int k = 0; k < Load_LEN_S; k++) {\n            S temp_score = temp_scores[k];\n            if (temp_score < min_score_local) {\n              min_score_local = temp_score;\n              min_pos_local = rank * Load_LEN_S + j + k;\n            }\n          }\n        }\n        const S min_score_global =\n            cg::reduce(g, min_score_local, cg::less<S>());\n        if (score_cur < min_score_global) {\n          if (rank == i - 1) {\n            occupy_result = OccupyResult::REFUSED;\n          }\n          occupy_result_cur = g.shfl(occupy_result, i - 1);\n          break;\n        }\n        uint32_t vote = g.ballot(min_score_local <= min_score_global);\n        if (vote) {\n          int src_lane = __ffs(vote) - 1;\n          int min_pos_global = g.shfl(min_pos_local, src_lane);\n          if (rank == i - 1) {\n            src[min_pos_global] = static_cast<S>(MAX_SCORE);  // Mark visited.\n            auto min_score_key = BUCKET::keys(bucket_keys_ptr, min_pos_global);\n            auto expected_key =\n                min_score_key->load(cuda::std::memory_order_relaxed);\n            if (expected_key != static_cast<K>(LOCKED_KEY) &&\n                expected_key != 
static_cast<K>(EMPTY_KEY)) {\n              bool result = min_score_key->compare_exchange_strong(\n                  expected_key, static_cast<K>(LOCKED_KEY),\n                  cuda::std::memory_order_acquire,\n                  cuda::std::memory_order_relaxed);\n              if (result) {\n                S* score_ptr = BUCKET::scores(bucket_keys_ptr, BUCKET_SIZE,\n                                              min_pos_global);\n                auto verify_score_ptr =\n                    reinterpret_cast<AtomicScore<S>*>(score_ptr);\n                auto verify_score =\n                    verify_score_ptr->load(cuda::std::memory_order_relaxed);\n                if (verify_score <= min_score_global) {\n                  if (expected_key == static_cast<K>(RECLAIM_KEY)) {\n                    occupy_result = OccupyResult::OCCUPIED_RECLAIMED;\n                    atomicAdd(bucket_size_ptr, 1);\n                  } else {\n                    occupy_result = OccupyResult::EVICT;\n                  }\n                  key_pos = min_pos_global;\n                  ScoreFunctor::update_with_digest(\n                      bucket_keys_ptr, key_pos, sm_param_scores, tx_cur,\n                      score_cur, BUCKET_SIZE, get_digest<K>(key), true);\n                } else {\n                  min_score_key->store(expected_key,\n                                       cuda::std::memory_order_release);\n                }\n              }\n            }\n          }\n          occupy_result_cur = g.shfl(occupy_result, i - 1);\n        }\n      }\n      // Prefetch values to shared memory.\n      if (occupy_result_cur != OccupyResult::ILLEGAL &&\n          occupy_result_cur != OccupyResult::REFUSED) {\n        VecV* dst = SMM::values_buffer(smem, groupID, diff_buf(i), dim);\n        if (occupy_result_cur == OccupyResult::DUPLICATE) {\n          VecV** sm_bucket_values_ptr = SMM::bucket_values_ptr(smem);\n          auto bucket_values_ptr =\n              
sm_bucket_values_ptr[groupID * GROUP_SIZE + i - 1];\n          auto key_pos_cur = g.shfl(key_pos, i - 1);\n          const VecV* src = bucket_values_ptr + key_pos_cur * dim;\n          CopyValue::ldg_sts(rank, dst, src, dim);\n        } else {\n          auto kv_idx_cur = g.shfl(kv_idx, i - 1);\n          const VecV* src = values + kv_idx_cur * dim;\n          CopyValue::ldg_sts(rank, dst, src, dim);\n        }\n      }\n    }\n    __pipeline_commit();\n\n    // Step 4: write values to bucket or param buffer.\n    if (i > 1) {\n      occupy_result_cur = g.shfl(occupy_result, i - 2);\n      if (occupy_result_cur != OccupyResult::ILLEGAL &&\n          occupy_result_cur != OccupyResult::REFUSED) {\n        VecV* src = SMM::values_buffer(smem, groupID, same_buf(i), dim);\n        if (occupy_result_cur == OccupyResult::DUPLICATE) {\n          uint32_t kv_idx_cur = g.shfl(kv_idx, i - 2);\n          VecV* dst = values + kv_idx_cur * dim;\n          __pipeline_wait_prior(3);\n          CopyValue::lds_stg(rank, dst, src, dim);\n        } else {\n          VecV** sm_bucket_values_ptr = SMM::bucket_values_ptr(smem);\n          auto bucket_values_ptr =\n              sm_bucket_values_ptr[groupID * GROUP_SIZE + i - 2];\n          auto key_pos_cur = g.shfl(key_pos, i - 2);\n          VecV* dst = bucket_values_ptr + key_pos_cur * dim;\n          __pipeline_wait_prior(3);\n          CopyValue::lds_stg(rank, dst, src, dim);\n        }\n        if (rank == i - 2) {\n          auto key_address = BUCKET::keys(bucket_keys_ptr, key_pos);\n          key_address->store(key, cuda::std::memory_order_release);\n        }\n      }\n    }\n  }\n  auto occupy_result_cur = g.shfl(occupy_result, GROUP_SIZE - 1);\n  uint32_t tx_cur = groupID * GROUP_SIZE + GROUP_SIZE - 1;\n  S* sm_param_scores = SMM::param_scores(smem);\n  S score_cur =\n      ScoreFunctor::desired_when_missed(sm_param_scores, tx_cur, global_epoch);\n\n  int** sm_buckets_size_ptr = SMM::buckets_size_ptr(smem);\n  auto 
bucket_size_ptr = sm_buckets_size_ptr[tx_cur];\n  __pipeline_wait_prior(1);\n  S* src = SMM::bucket_scores(smem, groupID, diff_buf(GROUP_SIZE));\n  while (occupy_result_cur == OccupyResult::INITIAL) {\n    int min_pos_local = -1;\n    S min_score_local = MAX_SCORE;\n#pragma unroll\n    for (int j = 0; j < BUCKET_SIZE; j += GROUP_SIZE * Load_LEN_S) {\n      S temp_scores[Load_LEN_S];\n      *reinterpret_cast<byte16*>(temp_scores) =\n          *reinterpret_cast<byte16*>(src + rank * Load_LEN_S + j);\n#pragma unroll\n      for (int k = 0; k < Load_LEN_S; k++) {\n        S temp_score = temp_scores[k];\n        if (temp_score < min_score_local) {\n          min_score_local = temp_score;\n          min_pos_local = rank * Load_LEN_S + j + k;\n        }\n      }\n    }\n    const S min_score_global = cg::reduce(g, min_score_local, cg::less<S>());\n    if (score_cur < min_score_global) {\n      if (rank == GROUP_SIZE - 1) {\n        occupy_result = OccupyResult::REFUSED;\n      }\n      occupy_result_cur = g.shfl(occupy_result, GROUP_SIZE - 1);\n      break;\n    }\n    uint32_t vote = g.ballot(min_score_local <= min_score_global);\n    if (vote) {\n      int src_lane = __ffs(vote) - 1;\n      int min_pos_global = g.shfl(min_pos_local, src_lane);\n      if (rank == GROUP_SIZE - 1) {\n        src[min_pos_global] = static_cast<S>(MAX_SCORE);  // Mark visited.\n        auto min_score_key = BUCKET::keys(bucket_keys_ptr, min_pos_global);\n        auto expected_key =\n            min_score_key->load(cuda::std::memory_order_relaxed);\n        if (expected_key != static_cast<K>(LOCKED_KEY) &&\n            expected_key != static_cast<K>(EMPTY_KEY)) {\n          auto min_score_ptr =\n              BUCKET::scores(bucket_keys_ptr, BUCKET_SIZE, min_pos_global);\n          bool result = min_score_key->compare_exchange_strong(\n              expected_key, static_cast<K>(LOCKED_KEY),\n              cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);\n          if (result) 
{\n            S* score_ptr =\n                BUCKET::scores(bucket_keys_ptr, BUCKET_SIZE, min_pos_global);\n            auto verify_score_ptr =\n                reinterpret_cast<AtomicScore<S>*>(score_ptr);\n            auto verify_score =\n                verify_score_ptr->load(cuda::std::memory_order_relaxed);\n            if (verify_score <= min_score_global) {\n              if (expected_key == static_cast<K>(RECLAIM_KEY)) {\n                atomicAdd(bucket_size_ptr, 1);\n                occupy_result = OccupyResult::OCCUPIED_RECLAIMED;\n              } else {\n                occupy_result = OccupyResult::EVICT;\n              }\n              key_pos = min_pos_global;\n              ScoreFunctor::update_with_digest(\n                  bucket_keys_ptr, key_pos, sm_param_scores, tx_cur, score_cur,\n                  BUCKET_SIZE, get_digest<K>(key), true);\n            } else {\n              min_score_key->store(expected_key,\n                                   cuda::std::memory_order_release);\n            }\n          }\n        }\n      }\n      occupy_result_cur = g.shfl(occupy_result, GROUP_SIZE - 1);\n    }\n  }\n  // Prefetch values to shared memory.\n  if (occupy_result_cur != OccupyResult::ILLEGAL &&\n      occupy_result_cur != OccupyResult::REFUSED) {\n    VecV* dst = SMM::values_buffer(smem, groupID, diff_buf(GROUP_SIZE), dim);\n    if (occupy_result_cur == OccupyResult::DUPLICATE) {\n      VecV** sm_bucket_values_ptr = SMM::bucket_values_ptr(smem);\n      auto bucket_values_ptr =\n          sm_bucket_values_ptr[groupID * GROUP_SIZE + GROUP_SIZE - 1];\n      auto key_pos_cur = g.shfl(key_pos, GROUP_SIZE - 1);\n      const VecV* src = bucket_values_ptr + key_pos_cur * dim;\n      CopyValue::ldg_sts(rank, dst, src, dim);\n    } else {\n      auto kv_idx_cur = g.shfl(kv_idx, GROUP_SIZE - 1);\n      const VecV* src = values + kv_idx_cur * dim;\n      CopyValue::ldg_sts(rank, dst, src, dim);\n    }\n  }\n  __pipeline_commit();\n\n  // Step 4: write 
values to bucket or param buffer.\n  occupy_result_cur = g.shfl(occupy_result, GROUP_SIZE - 2);\n  if (occupy_result_cur != OccupyResult::ILLEGAL &&\n      occupy_result_cur != OccupyResult::REFUSED) {\n    VecV* src = SMM::values_buffer(smem, groupID, same_buf(GROUP_SIZE), dim);\n    if (occupy_result_cur == OccupyResult::DUPLICATE) {\n      uint32_t kv_idx_cur = g.shfl(kv_idx, GROUP_SIZE - 2);\n      VecV* dst = values + kv_idx_cur * dim;\n      __pipeline_wait_prior(1);\n      CopyValue::lds_stg(rank, dst, src, dim);\n    } else {\n      VecV** sm_bucket_values_ptr = SMM::bucket_values_ptr(smem);\n      auto bucket_values_ptr =\n          sm_bucket_values_ptr[groupID * GROUP_SIZE + GROUP_SIZE - 2];\n      auto key_pos_cur = g.shfl(key_pos, GROUP_SIZE - 2);\n      VecV* dst = bucket_values_ptr + key_pos_cur * dim;\n      __pipeline_wait_prior(1);\n      CopyValue::lds_stg(rank, dst, src, dim);\n    }\n    if (rank == GROUP_SIZE - 2) {\n      auto key_address = BUCKET::keys(bucket_keys_ptr, key_pos);\n      key_address->store(key, cuda::std::memory_order_release);\n    }\n  }\n\n  // Step 4: write values to bucket or param buffer.\n  occupy_result_cur = g.shfl(occupy_result, GROUP_SIZE - 1);\n  if (occupy_result_cur != OccupyResult::ILLEGAL &&\n      occupy_result_cur != OccupyResult::REFUSED) {\n    VecV* src =\n        SMM::values_buffer(smem, groupID, same_buf(GROUP_SIZE + 1), dim);\n    if (occupy_result_cur == OccupyResult::DUPLICATE) {\n      uint32_t kv_idx_cur = g.shfl(kv_idx, GROUP_SIZE - 1);\n      VecV* dst = values + kv_idx_cur * dim;\n      __pipeline_wait_prior(0);\n      CopyValue::lds_stg(rank, dst, src, dim);\n    } else {\n      VecV** sm_bucket_values_ptr = SMM::bucket_values_ptr(smem);\n      auto bucket_values_ptr =\n          sm_bucket_values_ptr[groupID * GROUP_SIZE + GROUP_SIZE - 1];\n      auto key_pos_cur = g.shfl(key_pos, GROUP_SIZE - 1);\n      VecV* dst = bucket_values_ptr + key_pos_cur * dim;\n      __pipeline_wait_prior(0);\n      
CopyValue::lds_stg(rank, dst, src, dim);\n    }\n    if (rank == GROUP_SIZE - 1) {\n      auto key_address = BUCKET::keys(bucket_keys_ptr, key_pos);\n      key_address->store(key, cuda::std::memory_order_release);\n    }\n  }\n}\n\ntemplate <typename K = uint64_t, typename V = float, typename S = uint64_t>\nstruct Params_FindOrInsert {\n  Params_FindOrInsert(float load_factor_,\n                      Bucket<K, V, S>* __restrict__ buckets_,\n                      int* buckets_size_, size_t buckets_num_,\n                      uint32_t bucket_capacity_, uint32_t dim_,\n                      const K* __restrict__ keys_, V* __restrict__ values_,\n                      S* __restrict__ scores_, size_t n_, const S global_epoch_)\n      : load_factor(load_factor_),\n        buckets(buckets_),\n        buckets_size(buckets_size_),\n        buckets_num(buckets_num_),\n        bucket_capacity(bucket_capacity_),\n        dim(dim_),\n        keys(keys_),\n        values(values_),\n        scores(scores_),\n        n(n_),\n        global_epoch(global_epoch_) {}\n  float load_factor;\n  Bucket<K, V, S>* __restrict__ buckets;\n  int* buckets_size;\n  size_t buckets_num;\n  uint32_t bucket_capacity;\n  uint32_t dim;\n  const K* __restrict__ keys;\n  V* __restrict__ values;\n  S* __restrict__ scores;\n  uint64_t n;\n  const S global_epoch;\n};\n\ntemplate <typename K, typename V, typename S, typename VecV, int Strategy>\nstruct Launch_TLPv1_FindOrInsert {\n  using Params = Params_FindOrInsert<K, V, S>;\n  inline static void launch_kernel(Params& params, cudaStream_t& stream) {\n    constexpr int BLOCK_SIZE = 128;\n    params.dim = params.dim * sizeof(V) / sizeof(VecV);\n    tlp_v1_find_or_insert_kernel_with_io<K, V, S, VecV, BLOCK_SIZE, Strategy>\n        <<<(params.n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(\n            params.buckets, params.buckets_size, params.buckets_num,\n            params.bucket_capacity, params.dim, params.keys,\n            
reinterpret_cast<VecV*>(params.values), params.scores, params.n,\n            params.global_epoch);\n  }\n};\n\ntemplate <typename K, typename V, typename S, typename VecV, int Strategy>\nstruct Launch_TLPv2_FindOrInsert {\n  using Params = Params_FindOrInsert<K, V, S>;\n  inline static void launch_kernel(Params& params, cudaStream_t& stream) {\n    constexpr int BLOCK_SIZE = 128;\n    const uint32_t value_size = params.dim * sizeof(V);\n    params.dim = value_size / sizeof(VecV);\n\n    if (value_size <= 256) {\n      constexpr int GROUP_SIZE = 8;\n      tlp_v2_find_or_insert_kernel_with_io<K, V, S, VecV, BLOCK_SIZE,\n                                           GROUP_SIZE, Strategy>\n          <<<(params.n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(\n              params.buckets, params.buckets_size, params.buckets_num,\n              params.bucket_capacity, params.dim, params.keys,\n              reinterpret_cast<VecV*>(params.values), params.scores, params.n,\n              params.global_epoch);\n    } else {\n      constexpr int GROUP_SIZE = 16;\n      tlp_v2_find_or_insert_kernel_with_io<K, V, S, VecV, BLOCK_SIZE,\n                                           GROUP_SIZE, Strategy>\n          <<<(params.n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(\n              params.buckets, params.buckets_size, params.buckets_num,\n              params.bucket_capacity, params.dim, params.keys,\n              reinterpret_cast<VecV*>(params.values), params.scores, params.n,\n              params.global_epoch);\n    }\n  }\n};\n\ntemplate <typename K, typename V, typename S, typename VecV, int Strategy>\nstruct Launch_Pipeline_FindOrInsert {\n  using Params = Params_FindOrInsert<K, V, S>;\n  inline static void launch_kernel(Params& params, cudaStream_t& stream) {\n    constexpr int BLOCK_SIZE = 128;\n    constexpr uint32_t GROUP_SIZE = 32;\n    constexpr uint32_t BUCKET_SIZE = 128;\n    using SMM =\n        SharedMemoryManager_Pipeline_FindOrInsert<K, 
V, S, VecV, BLOCK_SIZE,\n                                                  GROUP_SIZE, BUCKET_SIZE>;\n\n    params.dim = params.dim * sizeof(V) / sizeof(VecV);\n    uint32_t shared_mem = SMM::total_size(params.dim);\n    shared_mem =\n        (shared_mem + sizeof(byte16) - 1) / sizeof(byte16) * sizeof(byte16);\n    pipeline_find_or_insert_kernel_with_io<K, V, S, VecV, BLOCK_SIZE, Strategy>\n        <<<(params.n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, shared_mem,\n           stream>>>(params.buckets, params.buckets_size, params.buckets_num,\n                     params.dim, params.keys,\n                     reinterpret_cast<VecV*>(params.values), params.scores,\n                     params.n, params.global_epoch);\n  }\n};\n\ntemplate <typename ArchTag>\nstruct ValueConfig_FindOrInsert;\n\ntemplate <>\nstruct ValueConfig_FindOrInsert<Sm80> {\n  // Value size greater than it will bring poor performance for TLPv1.\n  static constexpr uint32_t size_tlp_v1 = 16 * sizeof(byte4);\n  static constexpr uint32_t size_tlp_v2 = 128 * sizeof(byte4);\n};\n\ntemplate <>\nstruct ValueConfig_FindOrInsert<Sm70> {\n  // Value size greater than it will bring poor performance for TLPv1.\n  static constexpr uint32_t size_tlp_v1 = 16 * sizeof(byte4);\n  static constexpr uint32_t size_tlp_v2 = 128 * sizeof(byte4);\n};\n\ntemplate <typename K, typename V, typename S, int Strategy, typename ArchTag>\nstruct KernelSelector_FindOrInsert {\n  using ValueConfig = ValueConfig_FindOrInsert<ArchTag>;\n  using Params = Params_FindOrInsert<K, V, S>;\n\n  static bool callable(bool unique_key, uint32_t bucket_size, uint32_t dim) {\n    constexpr uint32_t MinBucketCap = sizeof(VecD_Load) / sizeof(D);\n    if (!unique_key || bucket_size < MinBucketCap) return false;\n    uint32_t value_size = dim * sizeof(V);\n#if defined(CUDART_VERSION) && (CUDART_VERSION >= 11030)\n    if (value_size <= ValueConfig::size_tlp_v2) return true;\n#else\n    if (value_size <= ValueConfig::size_tlp_v1) return 
true;\n#endif\n    return false;\n  }\n\n  static void select_kernel(Params& params, cudaStream_t& stream) {\n    const uint32_t total_value_size =\n        static_cast<uint32_t>(params.dim * sizeof(V));\n\n    auto launch_TLPv1 = [&]() {\n      if (total_value_size % sizeof(byte16) == 0) {\n        using VecV = byte16;\n        Launch_TLPv1_FindOrInsert<K, V, S, VecV, Strategy>::launch_kernel(\n            params, stream);\n      } else if (total_value_size % sizeof(byte8) == 0) {\n        using VecV = byte8;\n        Launch_TLPv1_FindOrInsert<K, V, S, VecV, Strategy>::launch_kernel(\n            params, stream);\n      } else if (total_value_size % sizeof(byte4) == 0) {\n        using VecV = byte4;\n        Launch_TLPv1_FindOrInsert<K, V, S, VecV, Strategy>::launch_kernel(\n            params, stream);\n      } else if (total_value_size % sizeof(byte2) == 0) {\n        using VecV = byte2;\n        Launch_TLPv1_FindOrInsert<K, V, S, VecV, Strategy>::launch_kernel(\n            params, stream);\n      } else {\n        using VecV = byte;\n        Launch_TLPv1_FindOrInsert<K, V, S, VecV, Strategy>::launch_kernel(\n            params, stream);\n      }\n    };\n\n#if defined(CUDART_VERSION) && (CUDART_VERSION >= 11030)\n    auto launch_TLPv2 = [&]() {\n      if (total_value_size % sizeof(byte16) == 0) {\n        using VecV = byte16;\n        Launch_TLPv2_FindOrInsert<K, V, S, VecV, Strategy>::launch_kernel(\n            params, stream);\n      } else if (total_value_size % sizeof(byte8) == 0) {\n        using VecV = byte8;\n        Launch_TLPv2_FindOrInsert<K, V, S, VecV, Strategy>::launch_kernel(\n            params, stream);\n      } else if (total_value_size % sizeof(byte4) == 0) {\n        using VecV = byte4;\n        Launch_TLPv2_FindOrInsert<K, V, S, VecV, Strategy>::launch_kernel(\n            params, stream);\n      } else if (total_value_size % sizeof(byte2) == 0) {\n        using VecV = byte2;\n        Launch_TLPv2_FindOrInsert<K, V, S, VecV, 
Strategy>::launch_kernel(\n            params, stream);\n      } else {\n        using VecV = byte;\n        Launch_TLPv2_FindOrInsert<K, V, S, VecV, Strategy>::launch_kernel(\n            params, stream);\n      }\n    };\n#endif\n\n    auto launch_Pipeline = [&]() {\n      if (total_value_size % sizeof(byte16) == 0) {\n        using VecV = byte16;\n        Launch_Pipeline_FindOrInsert<K, V, S, VecV, Strategy>::launch_kernel(\n            params, stream);\n      } else if (total_value_size % sizeof(byte8) == 0) {\n        using VecV = byte8;\n        Launch_Pipeline_FindOrInsert<K, V, S, VecV, Strategy>::launch_kernel(\n            params, stream);\n      } else if (total_value_size % sizeof(byte4) == 0) {\n        using VecV = byte4;\n        Launch_Pipeline_FindOrInsert<K, V, S, VecV, Strategy>::launch_kernel(\n            params, stream);\n      } else if (total_value_size % sizeof(byte2) == 0) {\n        using VecV = byte2;\n        Launch_Pipeline_FindOrInsert<K, V, S, VecV, Strategy>::launch_kernel(\n            params, stream);\n      } else {\n        using VecV = byte;\n        Launch_Pipeline_FindOrInsert<K, V, S, VecV, Strategy>::launch_kernel(\n            params, stream);\n      }\n    };\n\n    // This part is according to the test on A100.\n    if (params.bucket_capacity != 128) {\n      if (total_value_size <= ValueConfig::size_tlp_v1) {\n        launch_TLPv1();\n      } else {\n#if defined(CUDART_VERSION) && (CUDART_VERSION >= 11030)\n        launch_TLPv2();\n#else\n        launch_TLPv1();\n#endif\n      }\n    } else {\n      if (total_value_size <= ValueConfig::size_tlp_v1) {\n        if (params.load_factor <= 0.98f) {\n          launch_TLPv1();\n        } else {\n          launch_Pipeline();\n        }\n      } else {\n        if (params.load_factor <= 0.95f) {\n#if defined(CUDART_VERSION) && (CUDART_VERSION >= 11030)\n          launch_TLPv2();\n#else\n          launch_Pipeline();\n#endif\n        } else {\n          launch_Pipeline();\n        
}\n      }\n    }\n  }  // End function\n};\n\n/*\n * find or insert with IO operation. This kernel is\n * usually used for the pure HBM mode for better performance.\n */\ntemplate <class K, class V, class S, int Strategy, uint32_t TILE_SIZE = 4>\n__global__ void find_or_insert_kernel_with_io(\n    const Table<K, V, S>* __restrict table, Bucket<K, V, S>* buckets,\n    const size_t bucket_max_size, const size_t buckets_num, const size_t dim,\n    const K* __restrict keys, V* __restrict values, S* __restrict scores,\n    const S global_epoch, const size_t N) {\n  auto g = cg::tiled_partition<TILE_SIZE>(cg::this_thread_block());\n  int* buckets_size = table->buckets_size;\n\n  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;\n\n  for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N;\n       t += blockDim.x * gridDim.x) {\n    int key_pos = -1;\n    const size_t key_idx = t / TILE_SIZE;\n\n    const K find_or_insert_key = keys[key_idx];\n\n    if (IS_RESERVED_KEY<K>(find_or_insert_key)) continue;\n\n    const S find_or_insert_score =\n        ScoreFunctor::desired_when_missed(scores, key_idx, global_epoch);\n    V* find_or_insert_value = values + key_idx * dim;\n\n    size_t bkt_idx = 0;\n    size_t start_idx = 0;\n    int src_lane = -1;\n    K evicted_key;\n\n    Bucket<K, V, S>* bucket =\n        get_key_position<K>(buckets, find_or_insert_key, bkt_idx, start_idx,\n                            buckets_num, bucket_max_size);\n\n    OccupyResult occupy_result{OccupyResult::INITIAL};\n    const int bucket_size = buckets_size[bkt_idx];\n    do {\n      if (bucket_size < bucket_max_size) {\n        occupy_result = find_and_lock_when_vacant<K, V, S, TILE_SIZE>(\n            g, bucket, find_or_insert_key, find_or_insert_score, evicted_key,\n            start_idx, key_pos, src_lane, bucket_max_size);\n      } else {\n        start_idx = (start_idx / TILE_SIZE) * TILE_SIZE;\n        occupy_result = find_and_lock_when_full<K, V, S, TILE_SIZE,\n                    
                            ScoreFunctor::LOCK_MEM_ORDER,\n                                                ScoreFunctor::UNLOCK_MEM_ORDER>(\n            g, bucket, find_or_insert_key, find_or_insert_score, evicted_key,\n            start_idx, key_pos, src_lane, bucket_max_size);\n      }\n\n      occupy_result = g.shfl(occupy_result, src_lane);\n    } while (occupy_result == OccupyResult::CONTINUE);\n\n    if (occupy_result == OccupyResult::REFUSED) continue;\n\n    if ((occupy_result == OccupyResult::OCCUPIED_EMPTY ||\n         occupy_result == OccupyResult::OCCUPIED_RECLAIMED) &&\n        g.thread_rank() == src_lane) {\n      atomicAdd(&(buckets_size[bkt_idx]), 1);\n    }\n\n    if (occupy_result == OccupyResult::DUPLICATE) {\n      copy_vector<V, TILE_SIZE>(g, bucket->vectors + key_pos * dim,\n                                find_or_insert_value, dim);\n    } else {\n      copy_vector<V, TILE_SIZE>(g, find_or_insert_value,\n                                bucket->vectors + key_pos * dim, dim);\n    }\n    if (g.thread_rank() == src_lane) {\n      ScoreFunctor::update(bucket, key_pos, scores, key_idx,\n                           find_or_insert_score,\n                           (occupy_result != OccupyResult::DUPLICATE));\n    }\n\n    if (g.thread_rank() == src_lane) {\n      bucket->digests(key_pos)[0] = get_digest<K>(find_or_insert_key);\n      (bucket->keys(key_pos))\n          ->store(find_or_insert_key, ScoreFunctor::UNLOCK_MEM_ORDER);\n    }\n  }\n}\n\ntemplate <typename K, typename V, typename S, int Strategy>\nstruct SelectFindOrInsertKernelWithIO {\n  static void execute_kernel(const float& load_factor, const int& block_size,\n                             const size_t bucket_max_size,\n                             const size_t buckets_num, const size_t dim,\n                             cudaStream_t& stream, const size_t& n,\n                             const Table<K, V, S>* __restrict table,\n                             Bucket<K, V, S>* buckets, 
const K* __restrict keys,\n                             V* __restrict values, S* __restrict scores,\n                             const S global_epoch) {\n    if (load_factor <= 0.75) {\n      const unsigned int tile_size = 4;\n      const size_t N = n * tile_size;\n      const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);\n      find_or_insert_kernel_with_io<K, V, S, Strategy, tile_size>\n          <<<grid_size, block_size, 0, stream>>>(\n              table, buckets, bucket_max_size, buckets_num, dim, keys, values,\n              scores, global_epoch, N);\n    } else {\n      const unsigned int tile_size = 32;\n      const size_t N = n * tile_size;\n      const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);\n      find_or_insert_kernel_with_io<K, V, S, Strategy, tile_size>\n          <<<grid_size, block_size, 0, stream>>>(\n              table, buckets, bucket_max_size, buckets_num, dim, keys, values,\n              scores, global_epoch, N);\n    }\n    return;\n  }\n};\n\n// Use 1 thread to deal with a KV-pair.\ntemplate <typename K = uint64_t, typename V = byte4, typename S = uint64_t,\n          uint32_t BLOCK_SIZE = 128, int Strategy = -1>\n__global__ void find_or_insert_kernel_lock_key_hybrid(\n    Bucket<K, V, S>* __restrict__ buckets, int32_t* __restrict__ buckets_size,\n    const uint64_t buckets_num, uint32_t bucket_capacity, const uint32_t dim,\n    const K* __restrict__ keys, V** __restrict__ value_ptrs,\n    S* __restrict__ scores, K** __restrict__ key_ptrs,\n    int* __restrict keys_index, bool* __restrict__ founds, uint64_t n,\n    const S global_epoch) {\n  using BUCKET = Bucket<K, V, S>;\n  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;\n\n  // bucket_capacity is a multiple of 4.\n  constexpr uint32_t STRIDE_S = 4;\n  constexpr uint32_t Load_LEN_S = sizeof(byte16) / sizeof(S);\n  __shared__ S sm_bucket_scores[BLOCK_SIZE][2 * STRIDE_S];\n\n  uint32_t tx = threadIdx.x;\n  uint32_t kv_idx = blockIdx.x * blockDim.x + tx;\n  K 
key{static_cast<K>(EMPTY_KEY)};\n  S score{static_cast<S>(EMPTY_SCORE)};\n  OccupyResult occupy_result{OccupyResult::INITIAL};\n  VecD_Comp target_digests{0};\n  V* bucket_values_ptr{nullptr};\n  K* bucket_keys_ptr{nullptr};\n  int32_t* bucket_size_ptr{nullptr};\n  uint32_t key_pos = {0};\n  uint32_t bucket_size{0};\n  if (kv_idx < n) {\n    key = keys[kv_idx];\n\n    // help to address the original key after sorting value pointers.\n    if (keys_index) {\n      keys_index[kv_idx] = kv_idx;\n    }\n\n    score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);\n\n    if (!IS_RESERVED_KEY<K>(key)) {\n      const K hashed_key = Murmur3HashDevice(key);\n      target_digests = digests_from_hashed<K>(hashed_key);\n      uint64_t global_idx =\n          static_cast<uint64_t>(hashed_key % (buckets_num * bucket_capacity));\n      key_pos = get_start_position(global_idx, bucket_capacity);\n      uint64_t bkt_idx = global_idx / bucket_capacity;\n      bucket_size_ptr = buckets_size + bkt_idx;\n      BUCKET* bucket = buckets + bkt_idx;\n      bucket_size = *bucket_size_ptr;\n      bucket_keys_ptr = reinterpret_cast<K*>(bucket->keys(0));\n      bucket_values_ptr = reinterpret_cast<V*>(bucket->vectors);\n    } else {\n      key_ptrs[kv_idx] = nullptr;\n      return;\n    }\n  } else {\n    return;\n  }\n\n  // Load `STRIDE` digests every time.\n  constexpr uint32_t STRIDE = sizeof(VecD_Load) / sizeof(D);\n  // One more loop to handle empty keys.\n  for (int offset = 0; offset < bucket_capacity + STRIDE; offset += STRIDE) {\n    if (occupy_result != OccupyResult::INITIAL) break;\n\n    uint32_t pos_cur = align_to<STRIDE>(key_pos);\n    pos_cur = (pos_cur + offset) & (bucket_capacity - 1);\n\n    D* digests_ptr = BUCKET::digests(bucket_keys_ptr, bucket_capacity, pos_cur);\n    VecD_Load digests_vec = *(reinterpret_cast<VecD_Load*>(digests_ptr));\n    VecD_Comp digests_arr[4] = {digests_vec.x, digests_vec.y, digests_vec.z,\n                                
digests_vec.w};\n\n    for (int i = 0; i < 4; i++) {\n      VecD_Comp probe_digests = digests_arr[i];\n      uint32_t possible_pos = 0;\n      bool result = false;\n      // Perform a vectorized comparison by byte,\n      // and if they are equal, set the corresponding byte in the result to\n      // 0xff.\n      int cmp_result = __vcmpeq4(probe_digests, target_digests);\n      cmp_result &= 0x01010101;\n      do {\n        if (cmp_result == 0) break;\n        // CUDA uses little endian,\n        // and the lowest byte in register stores in the lowest address.\n        uint32_t index = (__ffs(cmp_result) - 1) >> 3;\n        cmp_result &= (cmp_result - 1);\n        possible_pos = pos_cur + i * 4 + index;\n        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);\n        K expected_key = key;\n        // Modifications to the bucket will not before this instruction.\n        result = current_key->compare_exchange_strong(\n            expected_key, static_cast<K>(LOCKED_KEY),\n            cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);\n      } while (!result);\n      if (result) {\n        occupy_result = OccupyResult::DUPLICATE;\n        key_pos = possible_pos;\n        ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores,\n                                         kv_idx, score, bucket_capacity,\n                                         get_digest<K>(key), false);\n        break;\n      } else if (bucket_size == bucket_capacity) {\n        continue;\n      }\n      VecD_Comp empty_digests_ = empty_digests<K>();\n      cmp_result = __vcmpeq4(probe_digests, empty_digests_);\n      cmp_result &= 0x01010101;\n      do {\n        if (cmp_result == 0) break;\n        uint32_t index = (__ffs(cmp_result) - 1) >> 3;\n        cmp_result &= (cmp_result - 1);\n        possible_pos = pos_cur + i * 4 + index;\n        if (offset == 0 && possible_pos < key_pos) continue;\n        auto current_key = BUCKET::keys(bucket_keys_ptr, 
possible_pos);\n        K expected_key = static_cast<K>(EMPTY_KEY);\n        result = current_key->compare_exchange_strong(\n            expected_key, static_cast<K>(LOCKED_KEY),\n            cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);\n      } while (!result);\n      if (result) {\n        occupy_result = OccupyResult::OCCUPIED_EMPTY;\n        key_pos = possible_pos;\n        ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores,\n                                         kv_idx, score, bucket_capacity,\n                                         get_digest<K>(key), true);\n\n        atomicAdd(bucket_size_ptr, 1);\n        break;\n      }\n    }\n  }\n\n  while (occupy_result == OccupyResult::INITIAL) {\n    S* bucket_scores_ptr = BUCKET::scores(bucket_keys_ptr, bucket_capacity, 0);\n    S min_score = static_cast<S>(MAX_SCORE);\n    int min_pos = -1;\n#pragma unroll\n    for (int j = 0; j < STRIDE_S; j += Load_LEN_S) {\n      __pipeline_memcpy_async(sm_bucket_scores[tx] + j, bucket_scores_ptr + j,\n                              sizeof(S) * Load_LEN_S);\n    }\n    __pipeline_commit();\n    for (int i = 0; i < bucket_capacity; i += STRIDE_S) {\n      if (i < bucket_capacity - STRIDE_S) {\n#pragma unroll\n        for (int j = 0; j < STRIDE_S; j += Load_LEN_S) {\n          __pipeline_memcpy_async(\n              sm_bucket_scores[tx] + diff_buf(i / STRIDE_S) * STRIDE_S + j,\n              bucket_scores_ptr + i + STRIDE_S + j, sizeof(S) * Load_LEN_S);\n        }\n      }\n      __pipeline_commit();\n      __pipeline_wait_prior(1);\n      S temp_scores[Load_LEN_S];\n      S* src = sm_bucket_scores[tx] + same_buf(i / STRIDE_S) * STRIDE_S;\n#pragma unroll\n      for (int k = 0; k < STRIDE_S; k += Load_LEN_S) {\n        *reinterpret_cast<byte16*>(temp_scores) =\n            *reinterpret_cast<byte16*>(src + k);\n#pragma unroll\n        for (int j = 0; j < Load_LEN_S; j += 1) {\n          S temp_score = temp_scores[j];\n          if 
(temp_score < min_score) {\n            auto verify_key_ptr = BUCKET::keys(bucket_keys_ptr, i + k + j);\n            auto verify_key =\n                verify_key_ptr->load(cuda::std::memory_order_relaxed);\n            if (verify_key != static_cast<K>(LOCKED_KEY) &&\n                verify_key != static_cast<K>(EMPTY_KEY)) {\n              min_score = temp_score;\n              min_pos = i + k + j;\n            }\n          }\n        }\n      }\n    }\n\n    score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);\n    if (score <= min_score) {\n      occupy_result = OccupyResult::REFUSED;\n      break;\n    }\n    auto min_score_key = BUCKET::keys(bucket_keys_ptr, min_pos);\n    auto expected_key = min_score_key->load(cuda::std::memory_order_relaxed);\n    if (expected_key != static_cast<K>(LOCKED_KEY) &&\n        expected_key != static_cast<K>(EMPTY_KEY)) {\n      bool result = min_score_key->compare_exchange_strong(\n          expected_key, static_cast<K>(LOCKED_KEY),\n          cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);\n      if (result) {\n        S* min_score_ptr =\n            BUCKET::scores(bucket_keys_ptr, bucket_capacity, min_pos);\n        auto verify_score_ptr =\n            reinterpret_cast<AtomicScore<S>*>(min_score_ptr);\n        auto verify_score =\n            verify_score_ptr->load(cuda::std::memory_order_relaxed);\n        if (verify_score <= min_score) {\n          key_pos = min_pos;\n          ScoreFunctor::update_with_digest(\n              bucket_keys_ptr, key_pos, scores, kv_idx, score, bucket_capacity,\n              get_digest<K>(key), (occupy_result != OccupyResult::DUPLICATE));\n          if (expected_key == static_cast<K>(RECLAIM_KEY)) {\n            occupy_result = OccupyResult::OCCUPIED_RECLAIMED;\n            atomicAdd(bucket_size_ptr, 1);\n          } else {\n            occupy_result = OccupyResult::EVICT;\n          }\n        } else {\n          min_score_key->store(expected_key, 
cuda::std::memory_order_release);\n        }\n      }\n    }\n  }\n\n  if (kv_idx < n) {\n    if (occupy_result == OccupyResult::REFUSED) {\n      value_ptrs[kv_idx] = nullptr;\n      key_ptrs[kv_idx] = nullptr;\n    } else {\n      value_ptrs[kv_idx] = bucket_values_ptr + key_pos * dim;\n      founds[kv_idx] = occupy_result == OccupyResult::DUPLICATE;\n      auto key_address = BUCKET::keys(bucket_keys_ptr, key_pos);\n      key_ptrs[kv_idx] = reinterpret_cast<K*>(key_address);\n    }\n  }\n}\n\ntemplate <class K, class V, class S, class VecV = byte16>\n__global__ void read_or_write_kernel_unlock_key(\n    VecV** __restrict table_value_addrs, VecV* __restrict param_values,\n    const bool* mask, const int* __restrict param_key_index,\n    K** __restrict__ key_ptrs, const K* __restrict__ keys, const size_t dim,\n    const size_t N) {\n  size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x;\n\n  for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) {\n    int vec_index = int(t / dim);\n    int dim_index = t % dim;\n    int real_key_index =\n        param_key_index != nullptr ? 
param_key_index[vec_index] : vec_index;\n\n    K* key_ptr = key_ptrs[real_key_index];\n    K key = keys[real_key_index];\n\n    /// if found, read the value from the table, otherwise write it\n    if (table_value_addrs[vec_index] != nullptr) {\n      // unlock the key.\n      if (key_ptr && dim_index == 0) *key_ptr = key;\n\n      /// find\n      if (mask[real_key_index]) {\n        param_values[real_key_index * dim + dim_index] =\n            table_value_addrs[vec_index][dim_index];\n      }\n      /// insert\n      else {\n        table_value_addrs[vec_index][dim_index] =\n            param_values[real_key_index * dim + dim_index];\n      }\n    }\n  }\n}\n\n/* find or insert with the end-user specified score.\n */\ntemplate <class K, class V, class S, int Strategy, uint32_t TILE_SIZE = 4>\n__global__ void find_or_insert_kernel(\n    const Table<K, V, S>* __restrict table, Bucket<K, V, S>* buckets,\n    const size_t bucket_max_size, const size_t buckets_num, const size_t dim,\n    const K* __restrict keys, V** __restrict vectors, S* __restrict scores,\n    bool* __restrict found, int* __restrict keys_index, const S global_epoch,\n    const size_t N) {\n  auto g = cg::tiled_partition<TILE_SIZE>(cg::this_thread_block());\n  int* buckets_size = table->buckets_size;\n\n  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;\n\n  for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N;\n       t += blockDim.x * gridDim.x) {\n    int key_pos = -1;\n    size_t key_idx = t / TILE_SIZE;\n\n    const K find_or_insert_key = keys[key_idx];\n\n    if (IS_RESERVED_KEY<K>(find_or_insert_key)) continue;\n\n    const S find_or_insert_score =\n        ScoreFunctor::desired_when_missed(scores, key_idx, global_epoch);\n\n    size_t bkt_idx = 0;\n    size_t start_idx = 0;\n    int src_lane = -1;\n    K evicted_key;\n\n    Bucket<K, V, S>* bucket =\n        get_key_position<K>(buckets, find_or_insert_key, bkt_idx, start_idx,\n                            buckets_num, 
bucket_max_size);\n\n    if (g.thread_rank() == 0) {\n      *(keys_index + key_idx) = key_idx;\n    }\n\n    OccupyResult occupy_result{OccupyResult::INITIAL};\n    const int bucket_size = buckets_size[bkt_idx];\n    do {\n      if (bucket_size < bucket_max_size) {\n        occupy_result = find_and_lock_when_vacant<K, V, S, TILE_SIZE>(\n            g, bucket, find_or_insert_key, find_or_insert_score, evicted_key,\n            start_idx, key_pos, src_lane, bucket_max_size);\n      } else {\n        start_idx = (start_idx / TILE_SIZE) * TILE_SIZE;\n        occupy_result = find_and_lock_when_full<K, V, S, TILE_SIZE,\n                                                ScoreFunctor::LOCK_MEM_ORDER,\n                                                ScoreFunctor::UNLOCK_MEM_ORDER>(\n            g, bucket, find_or_insert_key, find_or_insert_score, evicted_key,\n            start_idx, key_pos, src_lane, bucket_max_size);\n      }\n\n      occupy_result = g.shfl(occupy_result, src_lane);\n    } while (occupy_result == OccupyResult::CONTINUE);\n\n    if (occupy_result == OccupyResult::REFUSED) continue;\n\n    if ((occupy_result == OccupyResult::OCCUPIED_EMPTY ||\n         occupy_result == OccupyResult::OCCUPIED_RECLAIMED) &&\n        g.thread_rank() == src_lane) {\n      atomicAdd(&(buckets_size[bkt_idx]), 1);\n    }\n\n    if (g.thread_rank() == src_lane) {\n      *(vectors + key_idx) = (bucket->vectors + key_pos * dim);\n      ScoreFunctor::update(bucket, key_pos, scores, key_idx,\n                           find_or_insert_score,\n                           occupy_result != OccupyResult::DUPLICATE);\n      if (occupy_result == OccupyResult::DUPLICATE) {\n        if (found != nullptr) {\n          *(found + key_idx) = true;\n        }\n      }\n      bucket->digests(key_pos)[0] = get_digest<K>(find_or_insert_key);\n      (bucket->keys(key_pos))\n          ->store(find_or_insert_key, ScoreFunctor::UNLOCK_MEM_ORDER);\n    }\n  }\n}\n\n/* Read the data from address of 
table_value_addrs to corresponding position\n  in param_value if mask[i] is true, otherwise write data to table_value_addrs\n  from param_value,\n  usually called by find_or_insert kernel.\n\n  `table_value_addrs`: A pointer of pointer of V which should be on HBM,\n        but each value (a pointer of V) could point to a\n        memory on HBM or HMEM.\n  `param_value`: A contiguous memory pointer with Vector\n        which should be HBM.\n  `mask`: One for each `param_value`. If true, reading from table_value_addrs,\n          or false writing table_value_addrs from param_value.\n  `param_key_index`: N values from address of table_value_addrs are mapped to\n        param_values according to param_key_index.\n  `dim`: the dim of value.\n  `N`: The number of vectors needed to be read.\n*/\ntemplate <class K, class V, class S>\n__global__ void read_or_write_kernel(V** __restrict table_value_addrs,\n                                     V* __restrict param_values,\n                                     const bool* mask,\n                                     const int* __restrict param_key_index,\n                                     const size_t dim, const size_t N) {\n  size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x;\n\n  for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) {\n    int vec_index = int(t / dim);\n    int dim_index = t % dim;\n    int real_key_index =\n        param_key_index != nullptr ? 
param_key_index[vec_index] : vec_index;\n\n    /// if found, read the value from the table, otherwise write it\n    if (table_value_addrs[vec_index] != nullptr) {\n      /// find\n      if (mask[real_key_index]) {\n        param_values[real_key_index * dim + dim_index] =\n            table_value_addrs[vec_index][dim_index];\n      }\n      /// insert\n      else {\n        table_value_addrs[vec_index][dim_index] =\n            param_values[real_key_index * dim + dim_index];\n      }\n    }\n  }\n}\n\n/* If founds[i] = true, read data from corresponding address of\n * table_value_addrs and write to param_values; if founds[i] = false, write data\n * from param_values to corresponding address of table_value_addrs. Usually\n * called by find_or_insert kernel.\n */\ntemplate <class V>\nvoid read_or_write_by_cpu(V** __restrict table_value_addrs,\n                          V* __restrict param_values,\n                          const int* __restrict offset, const bool* founds,\n                          size_t dim, int N, int n_worker = 16) {\n  std::vector<std::thread> thds;\n  if (n_worker < 1) n_worker = 1;\n\n  auto functor = [founds, dim](V** __restrict table_value_addrs,\n                               V* __restrict param_values,\n                               const int* __restrict offset, int handled_size,\n                               int trunk_size) -> void {\n    for (int i = handled_size; i < handled_size + trunk_size; i++) {\n      if (table_value_addrs[i] != nullptr) {\n        if (founds[offset[i]]) {\n          memcpy(param_values + offset[i] * dim, table_value_addrs[i],\n                 sizeof(V) * dim);\n        } else {\n          memcpy(table_value_addrs[i], param_values + offset[i] * dim,\n                 sizeof(V) * dim);\n        }\n      }\n    }\n  };\n\n  int32_t trunk_size_floor = N / n_worker;\n  int32_t trunk_size_remain = N % n_worker;\n  int32_t n_worker_used = trunk_size_floor == 0 ? 
trunk_size_remain : n_worker;\n\n  size_t handled_size = 0;\n  for (int i = 0; i < n_worker_used; i++) {\n    int32_t cur_trunk_size = trunk_size_floor;\n    if (trunk_size_remain != 0) {\n      cur_trunk_size += 1;\n      trunk_size_remain--;\n    }\n    thds.push_back(std::thread(functor, table_value_addrs, param_values, offset,\n                               handled_size, cur_trunk_size));\n    handled_size += cur_trunk_size;\n  }\n\n  for (int i = 0; i < n_worker_used; i++) {\n    thds[i].join();\n  }\n}\n\n}  // namespace merlin\n}  // namespace nv"
  },
  {
    "path": "include/merlin/core_kernels/find_ptr_or_insert.cuh",
    "content": "/*\n * Copyright (c) 2023, NVIDIA CORPORATION.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n#pragma once\n\n#include \"kernel_utils.cuh\"\n\nnamespace nv {\nnamespace merlin {\n\n// Use 1 thread to deal with a KV-pair.\ntemplate <typename K = uint64_t, typename V = byte4, typename S = uint64_t,\n          uint32_t BLOCK_SIZE = 128, int Strategy = -1>\n__global__ void find_or_insert_ptr_kernel_lock_key(\n    Bucket<K, V, S>* __restrict__ buckets, int32_t* __restrict__ buckets_size,\n    const uint64_t buckets_num, uint32_t bucket_capacity, const uint32_t dim,\n    const K* __restrict__ keys, V** __restrict__ value_ptrs,\n    S* __restrict__ scores, K** __restrict__ key_ptrs, uint64_t n,\n    bool* __restrict__ founds, const S global_epoch) {\n  using BUCKET = Bucket<K, V, S>;\n  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;\n\n  // bucket_capacity is a multiple of 4.\n  constexpr uint32_t STRIDE_S = 4;\n  constexpr uint32_t Load_LEN_S = sizeof(byte16) / sizeof(S);\n  __shared__ S sm_bucket_scores[BLOCK_SIZE][2 * STRIDE_S];\n\n  uint32_t tx = threadIdx.x;\n  uint32_t kv_idx = blockIdx.x * blockDim.x + tx;\n  K key{static_cast<K>(EMPTY_KEY)};\n  S score{static_cast<S>(EMPTY_SCORE)};\n  OccupyResult occupy_result{OccupyResult::INITIAL};\n  VecD_Comp target_digests{0};\n  V* bucket_values_ptr{nullptr};\n  K* bucket_keys_ptr{nullptr};\n  int32_t* bucket_size_ptr{nullptr};\n  uint32_t key_pos = {0};\n  uint32_t 
bucket_size{0};\n  if (kv_idx < n) {\n    key = keys[kv_idx];\n    score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);\n\n    if (!IS_RESERVED_KEY<K>(key)) {\n      const K hashed_key = Murmur3HashDevice(key);\n      target_digests = digests_from_hashed<K>(hashed_key);\n      uint64_t global_idx =\n          static_cast<uint64_t>(hashed_key % (buckets_num * bucket_capacity));\n      key_pos = get_start_position(global_idx, bucket_capacity);\n      uint64_t bkt_idx = global_idx / bucket_capacity;\n      bucket_size_ptr = buckets_size + bkt_idx;\n      BUCKET* bucket = buckets + bkt_idx;\n      bucket_size = *bucket_size_ptr;\n      bucket_keys_ptr = reinterpret_cast<K*>(bucket->keys(0));\n      bucket_values_ptr = reinterpret_cast<V*>(bucket->vectors);\n    } else {\n      key_ptrs[kv_idx] = nullptr;\n      return;\n    }\n  } else {\n    return;\n  }\n\n  // Load `STRIDE` digests every time.\n  constexpr uint32_t STRIDE = sizeof(VecD_Load) / sizeof(D);\n  // One more loop to handle empty keys.\n  for (int offset = 0; offset < bucket_capacity + STRIDE; offset += STRIDE) {\n    if (occupy_result != OccupyResult::INITIAL) break;\n\n    uint32_t pos_cur = align_to<STRIDE>(key_pos);\n    pos_cur = (pos_cur + offset) & (bucket_capacity - 1);\n\n    D* digests_ptr = BUCKET::digests(bucket_keys_ptr, bucket_capacity, pos_cur);\n    VecD_Load digests_vec = *(reinterpret_cast<VecD_Load*>(digests_ptr));\n    VecD_Comp digests_arr[4] = {digests_vec.x, digests_vec.y, digests_vec.z,\n                                digests_vec.w};\n\n    for (int i = 0; i < 4; i++) {\n      VecD_Comp probe_digests = digests_arr[i];\n      uint32_t possible_pos = 0;\n      bool result = false;\n      // Perform a vectorized comparison by byte,\n      // and if they are equal, set the corresponding byte in the result to\n      // 0xff.\n      int cmp_result = __vcmpeq4(probe_digests, target_digests);\n      cmp_result &= 0x01010101;\n      do {\n        if (cmp_result == 0) 
break;\n        // CUDA uses little endian,\n        // and the lowest byte in register stores in the lowest address.\n        uint32_t index = (__ffs(cmp_result) - 1) >> 3;\n        cmp_result &= (cmp_result - 1);\n        possible_pos = pos_cur + i * 4 + index;\n        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);\n        K expected_key = key;\n        // Modifications to the bucket will not before this instruction.\n        result = current_key->compare_exchange_strong(\n            expected_key, static_cast<K>(LOCKED_KEY),\n            cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);\n      } while (!result);\n      if (result) {\n        occupy_result = OccupyResult::DUPLICATE;\n        key_pos = possible_pos;\n        ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores,\n                                         kv_idx, score, bucket_capacity,\n                                         get_digest<K>(key), false);\n        break;\n      } else if (bucket_size == bucket_capacity) {\n        continue;\n      }\n      VecD_Comp empty_digests_ = empty_digests<K>();\n      cmp_result = __vcmpeq4(probe_digests, empty_digests_);\n      cmp_result &= 0x01010101;\n      do {\n        if (cmp_result == 0) break;\n        uint32_t index = (__ffs(cmp_result) - 1) >> 3;\n        cmp_result &= (cmp_result - 1);\n        possible_pos = pos_cur + i * 4 + index;\n        if (offset == 0 && possible_pos < key_pos) continue;\n        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);\n        K expected_key = static_cast<K>(EMPTY_KEY);\n        result = current_key->compare_exchange_strong(\n            expected_key, static_cast<K>(LOCKED_KEY),\n            cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);\n      } while (!result);\n      if (result) {\n        occupy_result = OccupyResult::OCCUPIED_EMPTY;\n        key_pos = possible_pos;\n        ScoreFunctor::update_with_digest(bucket_keys_ptr, 
key_pos, scores,\n                                         kv_idx, score, bucket_capacity,\n                                         get_digest<K>(key), true);\n\n        atomicAdd(bucket_size_ptr, 1);\n        break;\n      }\n    }\n  }\n\n  while (occupy_result == OccupyResult::INITIAL) {\n    S* bucket_scores_ptr = BUCKET::scores(bucket_keys_ptr, bucket_capacity, 0);\n    S min_score = static_cast<S>(MAX_SCORE);\n    int min_pos = -1;\n#pragma unroll\n    for (int j = 0; j < STRIDE_S; j += Load_LEN_S) {\n      __pipeline_memcpy_async(sm_bucket_scores[tx] + j, bucket_scores_ptr + j,\n                              sizeof(S) * Load_LEN_S);\n    }\n    __pipeline_commit();\n    for (int i = 0; i < bucket_capacity; i += STRIDE_S) {\n      if (i < bucket_capacity - STRIDE_S) {\n#pragma unroll\n        for (int j = 0; j < STRIDE_S; j += Load_LEN_S) {\n          __pipeline_memcpy_async(\n              sm_bucket_scores[tx] + diff_buf(i / STRIDE_S) * STRIDE_S + j,\n              bucket_scores_ptr + i + STRIDE_S + j, sizeof(S) * Load_LEN_S);\n        }\n      }\n      __pipeline_commit();\n      __pipeline_wait_prior(1);\n      S temp_scores[Load_LEN_S];\n      S* src = sm_bucket_scores[tx] + same_buf(i / STRIDE_S) * STRIDE_S;\n#pragma unroll\n      for (int k = 0; k < STRIDE_S; k += Load_LEN_S) {\n        *reinterpret_cast<byte16*>(temp_scores) =\n            *reinterpret_cast<byte16*>(src + k);\n#pragma unroll\n        for (int j = 0; j < Load_LEN_S; j += 1) {\n          S temp_score = temp_scores[j];\n          if (temp_score < min_score) {\n            auto verify_key_ptr = BUCKET::keys(bucket_keys_ptr, i + k + j);\n            auto verify_key =\n                verify_key_ptr->load(cuda::std::memory_order_relaxed);\n            if (verify_key != static_cast<K>(LOCKED_KEY) &&\n                verify_key != static_cast<K>(EMPTY_KEY)) {\n              min_score = temp_score;\n              min_pos = i + k + j;\n            }\n          }\n        }\n      }\n    }\n\n   
 score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);\n    if (score <= min_score) {\n      occupy_result = OccupyResult::REFUSED;\n      break;\n    }\n    auto min_score_key = BUCKET::keys(bucket_keys_ptr, min_pos);\n    auto expected_key = min_score_key->load(cuda::std::memory_order_relaxed);\n    if (expected_key != static_cast<K>(LOCKED_KEY) &&\n        expected_key != static_cast<K>(EMPTY_KEY)) {\n      bool result = min_score_key->compare_exchange_strong(\n          expected_key, static_cast<K>(LOCKED_KEY),\n          cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);\n      if (result) {\n        S* min_score_ptr =\n            BUCKET::scores(bucket_keys_ptr, bucket_capacity, min_pos);\n        auto verify_score_ptr =\n            reinterpret_cast<AtomicScore<S>*>(min_score_ptr);\n        auto verify_score =\n            verify_score_ptr->load(cuda::std::memory_order_relaxed);\n        if (verify_score <= min_score) {\n          key_pos = min_pos;\n          ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores,\n                                           kv_idx, score, bucket_capacity,\n                                           get_digest<K>(key), true);\n\n          if (expected_key == static_cast<K>(RECLAIM_KEY)) {\n            occupy_result = OccupyResult::OCCUPIED_RECLAIMED;\n            atomicAdd(bucket_size_ptr, 1);\n          } else {\n            occupy_result = OccupyResult::EVICT;\n          }\n        } else {\n          min_score_key->store(expected_key, cuda::std::memory_order_release);\n        }\n      }\n    }\n  }\n\n  if (kv_idx < n) {\n    if (occupy_result == OccupyResult::REFUSED) {\n      value_ptrs[kv_idx] = nullptr;\n      key_ptrs[kv_idx] = nullptr;\n    } else {\n      value_ptrs[kv_idx] = bucket_values_ptr + key_pos * dim;\n      auto key_address = BUCKET::keys(bucket_keys_ptr, key_pos);\n      key_ptrs[kv_idx] = reinterpret_cast<K*>(key_address);\n    }\n    founds[kv_idx] = 
occupy_result == OccupyResult::DUPLICATE;\n  }\n}\n\ntemplate <typename K>\n__global__ void find_or_insert_ptr_kernel_unlock_key(const K* __restrict__ keys,\n                                                     K** __restrict__ key_ptrs,\n                                                     uint64_t n) {\n  int kv_idx = blockIdx.x * blockDim.x + threadIdx.x;\n  K key;\n  K* key_ptr{nullptr};\n  if (kv_idx < n) {\n    key = keys[kv_idx];\n    key_ptr = key_ptrs[kv_idx];\n    if (key_ptr) {\n      *key_ptr = key;\n    }\n  }\n}\n\n/* find or insert with the end-user specified score.\n */\ntemplate <class K, class V, class S, int Strategy, uint32_t TILE_SIZE = 4>\n__global__ void find_ptr_or_insert_kernel(\n    const Table<K, V, S>* __restrict table, Bucket<K, V, S>* buckets,\n    const size_t bucket_max_size, const size_t buckets_num, const size_t dim,\n    const K* __restrict keys, V** __restrict vectors, S* __restrict scores,\n    bool* __restrict found, const S global_epoch, const size_t N) {\n  auto g = cg::tiled_partition<TILE_SIZE>(cg::this_thread_block());\n  int* buckets_size = table->buckets_size;\n\n  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;\n\n  for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N;\n       t += blockDim.x * gridDim.x) {\n    int key_pos = -1;\n    size_t key_idx = t / TILE_SIZE;\n\n    const K find_or_insert_key = keys[key_idx];\n\n    if (IS_RESERVED_KEY<K>(find_or_insert_key)) continue;\n\n    const S find_or_insert_score =\n        ScoreFunctor::desired_when_missed(scores, key_idx, global_epoch);\n\n    size_t bkt_idx = 0;\n    size_t start_idx = 0;\n    int src_lane = -1;\n    K evicted_key;\n\n    Bucket<K, V, S>* bucket =\n        get_key_position<K>(buckets, find_or_insert_key, bkt_idx, start_idx,\n                            buckets_num, bucket_max_size);\n\n    OccupyResult occupy_result{OccupyResult::INITIAL};\n    const int bucket_size = buckets_size[bkt_idx];\n    do {\n      if (bucket_size < 
bucket_max_size) {\n        occupy_result = find_and_lock_when_vacant<K, V, S, TILE_SIZE>(\n            g, bucket, find_or_insert_key, find_or_insert_score, evicted_key,\n            start_idx, key_pos, src_lane, bucket_max_size);\n      } else {\n        start_idx = (start_idx / TILE_SIZE) * TILE_SIZE;\n        occupy_result = find_and_lock_when_full<K, V, S, TILE_SIZE,\n                                                ScoreFunctor::LOCK_MEM_ORDER,\n                                                ScoreFunctor::UNLOCK_MEM_ORDER>(\n            g, bucket, find_or_insert_key, find_or_insert_score, evicted_key,\n            start_idx, key_pos, src_lane, bucket_max_size);\n      }\n\n      occupy_result = g.shfl(occupy_result, src_lane);\n    } while (occupy_result == OccupyResult::CONTINUE);\n\n    if ((occupy_result == OccupyResult::OCCUPIED_EMPTY ||\n         occupy_result == OccupyResult::OCCUPIED_RECLAIMED) &&\n        g.thread_rank() == src_lane) {\n      atomicAdd(&(buckets_size[bkt_idx]), 1);\n    }\n\n    if (g.thread_rank() == src_lane) {\n      if (occupy_result != OccupyResult::REFUSED) {\n        ScoreFunctor::update(bucket, key_pos, scores, key_idx,\n                             find_or_insert_score,\n                             occupy_result != OccupyResult::DUPLICATE);\n        bucket->digests(key_pos)[0] = get_digest<K>(find_or_insert_key);\n        (bucket->keys(key_pos))\n            ->store(find_or_insert_key, ScoreFunctor::UNLOCK_MEM_ORDER);\n        *(vectors + key_idx) = (bucket->vectors + key_pos * dim);\n      } else {\n        *(vectors + key_idx) = nullptr;\n      }\n      *(found + key_idx) = occupy_result == OccupyResult::DUPLICATE;\n    }\n  }\n}\n\ntemplate <typename K, typename V, typename S, int Strategy>\nstruct SelectFindOrInsertPtrKernel {\n  static void execute_kernel(const float& load_factor, const int& block_size,\n                             const size_t bucket_max_size,\n                             const size_t buckets_num, 
const size_t dim,\n                             cudaStream_t& stream, const size_t& n,\n                             const Table<K, V, S>* __restrict table,\n                             Bucket<K, V, S>* buckets, const K* __restrict keys,\n                             V** __restrict values, S* __restrict scores,\n                             bool* __restrict found, const S global_epoch) {\n    if (load_factor <= 0.5) {\n      const unsigned int tile_size = 4;\n      const size_t N = n * tile_size;\n      const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);\n      find_ptr_or_insert_kernel<K, V, S, Strategy, tile_size>\n          <<<grid_size, block_size, 0, stream>>>(\n              table, buckets, bucket_max_size, buckets_num, dim, keys, values,\n              scores, found, global_epoch, N);\n    } else if (load_factor <= 0.875) {\n      const unsigned int tile_size = 8;\n      const size_t N = n * tile_size;\n      const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);\n      find_ptr_or_insert_kernel<K, V, S, Strategy, tile_size>\n          <<<grid_size, block_size, 0, stream>>>(\n              table, buckets, bucket_max_size, buckets_num, dim, keys, values,\n              scores, found, global_epoch, N);\n    } else {\n      const unsigned int tile_size = 32;\n      const size_t N = n * tile_size;\n      const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);\n      find_ptr_or_insert_kernel<K, V, S, Strategy, tile_size>\n          <<<grid_size, block_size, 0, stream>>>(\n              table, buckets, bucket_max_size, buckets_num, dim, keys, values,\n              scores, found, global_epoch, N);\n    }\n    return;\n  }\n};\n\n}  // namespace merlin\n}  // namespace nv"
  },
  {
    "path": "include/merlin/core_kernels/group_lock_kernels.cuh",
    "content": "/*\n * Copyright (c) 2023, NVIDIA CORPORATION.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http:///www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n#pragma once\n#include <cuda/atomic>\n#include <cuda/std/semaphore>\n\nnamespace nv {\nnamespace merlin {\nnamespace group_lock {\n\ntemplate <typename T>\n__global__ void init_kernel(\n    cuda::atomic<T, cuda::thread_scope_device>* update_count,\n    cuda::atomic<T, cuda::thread_scope_device>* read_count,\n    cuda::atomic<bool, cuda::thread_scope_device>* unique_flag) {\n  if (blockIdx.x == 0 && threadIdx.x == 0) {\n    new (update_count) cuda::atomic<T, cuda::thread_scope_device>{0};\n    new (read_count) cuda::atomic<T, cuda::thread_scope_device>{0};\n    new (unique_flag) cuda::atomic<bool, cuda::thread_scope_device>{false};\n  }\n}\n\ntemplate <typename T>\n__global__ void lock_read_kernel(\n    cuda::atomic<T, cuda::thread_scope_device>* update_count,\n    cuda::atomic<T, cuda::thread_scope_device>* read_count) {\n  for (;;) {\n    while (update_count->load(cuda::std::memory_order_relaxed)) {\n    }\n    read_count->fetch_add(1, cuda::std::memory_order_relaxed);\n    if (update_count->load(cuda::std::memory_order_relaxed) == 0) {\n      break;\n    }\n    read_count->fetch_sub(1, cuda::std::memory_order_relaxed);\n  }\n}\n\ntemplate <typename T>\n__global__ void unlock_read_kernel(\n    cuda::atomic<T, cuda::thread_scope_device>* read_count) {\n  read_count->fetch_sub(1, cuda::std::memory_order_relaxed);\n}\n\ntemplate 
<typename T>\n__global__ void lock_update_kernel(\n    cuda::atomic<T, cuda::thread_scope_device>* update_count,\n    cuda::atomic<T, cuda::thread_scope_device>* read_count) {\n  for (;;) {\n    while (read_count->load(cuda::std::memory_order_relaxed)) {\n    }\n    update_count->fetch_add(1, cuda::std::memory_order_relaxed);\n    if (read_count->load(cuda::std::memory_order_relaxed) == 0) {\n      break;\n    }\n    update_count->fetch_sub(1, cuda::std::memory_order_relaxed);\n  }\n}\n\ntemplate <typename T>\n__global__ void unlock_update_kernel(\n    cuda::atomic<T, cuda::thread_scope_device>* update_count) {\n  update_count->fetch_sub(1, cuda::std::memory_order_relaxed);\n}\n\ntemplate <typename T>\n__global__ void lock_update_read_kernel(\n    cuda::atomic<T, cuda::thread_scope_device>* update_count,\n    cuda::atomic<T, cuda::thread_scope_device>* read_count,\n    cuda::atomic<bool, cuda::thread_scope_device>* unique_flag) {\n  /* Lock unique flag */\n  bool expected = false;\n  while (!unique_flag->compare_exchange_weak(expected, true,\n                                             cuda::std::memory_order_relaxed)) {\n    expected = false;\n  }\n\n  /* Ban update */\n  for (;;) {\n    while (update_count->load(cuda::std::memory_order_relaxed)) {\n    }\n    read_count->fetch_add(1, cuda::std::memory_order_relaxed);\n    if (update_count->load(cuda::std::memory_order_relaxed) == 0) {\n      break;\n    }\n    read_count->fetch_sub(1, cuda::std::memory_order_relaxed);\n  }\n\n  /* Ban read */\n  for (;;) {\n    while (read_count->load(cuda::std::memory_order_relaxed) > 1) {\n    }\n    update_count->fetch_add(1, cuda::std::memory_order_relaxed);\n    if (read_count->load(cuda::std::memory_order_relaxed) == 1) {\n      break;\n    }\n    update_count->fetch_sub(1, cuda::std::memory_order_relaxed);\n  }\n}\n\ntemplate <typename T>\n__global__ void unlock_update_read_kernel(\n    cuda::atomic<T, cuda::thread_scope_device>* update_count,\n    cuda::atomic<T, 
cuda::thread_scope_device>* read_count,\n    cuda::atomic<bool, cuda::thread_scope_device>* unique_flag) {\n  read_count->fetch_sub(1, cuda::std::memory_order_relaxed);\n  update_count->fetch_sub(1, cuda::std::memory_order_relaxed);\n  unique_flag->store(false, cuda::std::memory_order_relaxed);\n}\n\ntemplate <typename T>\n__global__ void update_count_kernel(\n    T* counter, cuda::atomic<T, cuda::thread_scope_device>* update_count) {\n  *counter = update_count->load(cuda::std::memory_order_relaxed);\n}\n\ntemplate <typename T>\n__global__ void read_count_kernel(\n    T* counter, cuda::atomic<T, cuda::thread_scope_device>* read_count) {\n  *counter = read_count->load(cuda::std::memory_order_relaxed);\n}\n\n}  // namespace group_lock\n}  // namespace merlin\n}  // namespace nv"
  },
  {
    "path": "include/merlin/core_kernels/kernel_utils.cuh",
    "content": "/*\n * Copyright (c) 2023, NVIDIA CORPORATION.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http:///www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n#pragma once\n\n#include <cooperative_groups.h>\n#include <cooperative_groups/reduce.h>\n#include <cuda_pipeline.h>\n#include <cuda/barrier>\n#include <mutex>\n#include <thread>\n#include <vector>\n#include \"../types.cuh\"\n#include \"../utils.cuh\"\n\nusing namespace cooperative_groups;\nnamespace cg = cooperative_groups;\n\nnamespace nv {\nnamespace merlin {\n\n// Vector Type of digests for memory access.\nusing VecD_Load = byte16;\n// Vector Type of digests for computation.\nusing VecD_Comp = byte4;\n\ntemplate <typename T>\n__forceinline__ __device__ T* __shfl_sync_ptr(uint32_t mask, T* var,\n                                              int srcLane,\n                                              int width = warpSize) {\n  uint64_t var64 = reinterpret_cast<uint64_t>(var);\n  var64 = __shfl_sync(mask, var64, srcLane, width);\n  return reinterpret_cast<T*>(var64);\n}\n\n// Select from double buffer.\n// If i % 2 == 0, select buffer 0, else buffer 1.\n__forceinline__ __device__ int same_buf(int i) { return (i & 0x01) ^ 0; }\n// If i % 2 == 0, select buffer 1, else buffer 0.\n__forceinline__ __device__ int diff_buf(int i) { return (i & 0x01) ^ 1; }\n\n/**\n * Digest functions for single-bucket mode.\n *\n * A digest is a 1-byte fingerprint (bits [32:39] of the Murmur3 hash) stored\n * alongside each key in the bucket.  
During lookup, the warp first performs a\n * SIMD comparison (`__vcmpeq4`) of the target digest against all 128 stored\n * digests.  Only slots whose digest matches proceed to full 8-byte key\n * comparison.  With a random 8-bit digest, the expected false-positive rate\n * is 1/256 per occupied slot, reducing full-key comparisons from O(bucket_size)\n * to ~0.5 per lookup miss.\n *\n * NOTE: Some pipeline kernels (lookup.cuh, contains.cuh) compute the target\n * digest inline as `hashed_key >> 32` for performance, bypassing\n * `get_digest()`. Any change to the digest derivation must be reflected in\n * those locations too.\n */\n\ntemplate <typename K>\n__forceinline__ __device__ D empty_digest() {\n  const K hashed_key = Murmur3HashDevice(static_cast<K>(EMPTY_KEY));\n  return static_cast<D>(hashed_key >> 32);\n}\n\ntemplate <typename K>\n__forceinline__ __device__ D reclaim_digest() {\n  const K hashed_key = Murmur3HashDevice(static_cast<K>(RECLAIM_KEY));\n  return static_cast<D>(hashed_key >> 32);\n}\n\n// Target digest for a given key (bits [32:39] of Murmur3 hash).\ntemplate <typename K>\n__forceinline__ __device__ D get_digest(const K& key) {\n  const K hashed_key = Murmur3HashDevice(key);\n  return static_cast<D>(hashed_key >> 32);\n}\n\n// Pack digest into all 4 bytes for SIMD `__vcmpeq4` comparison.\ntemplate <typename K>\n__forceinline__ __device__ VecD_Comp digests_from_hashed(const K& hashed_key) {\n  D digest = static_cast<D>(hashed_key >> 32);\n  return static_cast<VecD_Comp>(__byte_perm(digest, digest, 0x0000));\n}\n\n// Pack empty-key digest into all 4 bytes for SIMD comparison.\ntemplate <typename K>\n__forceinline__ __device__ VecD_Comp empty_digests() {\n  D digest = empty_digest<K>();\n  return static_cast<VecD_Comp>(__byte_perm(digest, digest, 0x0000));\n}\n\n// Position alignment.\ntemplate <uint32_t ALIGN_SIZE>\n__forceinline__ __device__ uint32_t align_to(uint32_t& pos) {\n  constexpr uint32_t MASK = 0xffffffffU - (ALIGN_SIZE - 1);\n  return 
pos & MASK;\n}\n\ntemplate <typename ElementType>\n__forceinline__ __device__ void LDGSTS(ElementType* dst,\n                                       const ElementType* src);\n\ntemplate <>\n__forceinline__ __device__ void LDGSTS<byte>(byte* dst, const byte* src) {\n  byte element = *src;\n  *dst = element;\n}\n\ntemplate <>\n__forceinline__ __device__ void LDGSTS<byte2>(byte2* dst, const byte2* src) {\n  byte2 element = *src;\n  *dst = element;\n}\n\n// Requires compute capability >= 8.0\ntemplate <typename ElementType>\n__forceinline__ __device__ void LDGSTS(ElementType* dst,\n                                       const ElementType* src) {\n  __pipeline_memcpy_async(dst, src, sizeof(ElementType));\n}\n\ntemplate <typename S, typename K, int BUCKET_SIZE = 128>\nstruct CopyScoreEmpty {\n  __forceinline__ __device__ static S* get_base_ptr(K** keys_ptr, int offset) {\n    return nullptr;\n  }\n  __forceinline__ __device__ static void ldg_sts(S* dst, const S* src) {}\n  __forceinline__ __device__ static S lgs(const S* src) { return 0; }\n  __forceinline__ __device__ static void stg(S* dst, const S score_) {}\n};\n\ntemplate <typename S, typename K, int BUCKET_SIZE = 128>\nstruct CopyScoreByPassCache {\n  __forceinline__ __device__ static S* get_base_ptr(K** keys_ptr, int offset) {\n    return reinterpret_cast<S*>(keys_ptr[offset] + BUCKET_SIZE);\n  }\n\n  __forceinline__ __device__ static void ldg_sts(S* dst, const S* src) {\n    LDGSTS<S>(dst, src);\n  }\n\n  __forceinline__ __device__ static S lgs(const S* src) { return src[0]; }\n\n  __forceinline__ __device__ static void stg(S* dst, const S score_) {\n    __stcs(dst, score_);\n  }\n};\n\ntemplate <typename VecV = byte16, int GROUP_SIZE = 16>\nstruct CopyValueOneGroup {\n  __forceinline__ __device__ static void ldg_sts(int rank, VecV* dst,\n                                                 const VecV* src, int dim) {\n    int offset = rank;\n    if (offset < dim) LDGSTS<VecV>(dst + offset, src + offset);\n  }\n\n  
__forceinline__ __device__ static void lds_stg(int rank, VecV* dst,\n                                                 const VecV* src, int dim) {\n    int offset = rank;\n    if (offset < dim) {\n      VecV vec_v = src[offset];\n      __stcs(dst + offset, vec_v);\n    }\n  }\n};\n\ntemplate <typename VecV = byte16, int GROUP_SIZE = 16>\nstruct CopyValueTwoGroup {\n  __forceinline__ __device__ static void ldg_sts(int rank, VecV* dst,\n                                                 const VecV* src,\n                                                 const int dim) {\n    int offset = rank;\n    LDGSTS<VecV>(dst + offset, src + offset);\n    offset += GROUP_SIZE;\n    if (offset < dim) LDGSTS<VecV>(dst + offset, src + offset);\n  }\n\n  __forceinline__ __device__ static void lds_stg(int rank, VecV* dst,\n                                                 const VecV* src,\n                                                 const int dim) {\n    int offset = rank;\n    const VecV vec_v = src[offset];\n    __stcs(dst + offset, vec_v);\n    offset += GROUP_SIZE;\n    if (offset < dim) {\n      const VecV vec_v = src[offset];\n      __stcs(dst + offset, vec_v);\n    }\n  }\n};\n\ntemplate <typename VecV = byte16, int GROUP_SIZE = 16>\nstruct CopyValueMultipleGroup {\n  __forceinline__ __device__ static void ldg_sts(int rank, VecV* dst,\n                                                 const VecV* src,\n                                                 const int dim) {\n    for (int offset = rank; offset < dim; offset += GROUP_SIZE) {\n      LDGSTS<VecV>(dst + offset, src + offset);\n    }\n  }\n\n  __forceinline__ __device__ static void lds_stg(int rank, VecV* dst,\n                                                 const VecV* src,\n                                                 const int dim) {\n    for (int offset = rank; offset < dim; offset += GROUP_SIZE) {\n      VecV vec_v = src[offset];\n      __stcs(dst + offset, vec_v);\n    }\n  }\n\n  __forceinline__ __device__ 
static void ldg_stg(int rank, VecV* dst,\n                                                 const VecV* src,\n                                                 const int dim) {\n    for (int offset = rank; offset < dim; offset += GROUP_SIZE) {\n      VecV vec_v = __ldcs(src + offset);\n      __stcs(dst + offset, vec_v);\n    }\n  }\n};\n\ntemplate <typename K, typename S>\n__forceinline__ __device__ void evict_key_score(K* evicted_keys,\n                                                S* evicted_scores,\n                                                const uint32_t evict_idx,\n                                                const K& key, const S& score) {\n  // Cache with evict_first strategy.\n  __stcs(evicted_keys + evict_idx, key);\n  if (evicted_scores != nullptr) {\n    __stcs(evicted_scores + evict_idx, score);\n  }\n};\n\ntemplate <class K, class V, class S, int Strategy>\nstruct ScoreFunctor;\n\nconstexpr int EPOCH_BITS = 32;\n\nconstexpr uint64_t EPOCH_BITS_MASK = UINT64_C(0xFFFFFFFF00000000);\nconstexpr uint64_t SCORE_BITS_MASK = UINT64_C(0xFFFFFFFF);\nconstexpr uint64_t SCORE_32BIT_MAX = UINT64_C(0xFFFFFFFF);\n/* The granularity of timestamp in the lower 32 bits is 1.048576ms. 
*/\nstatic constexpr int RSHIFT_ON_NANO = 20;\n\ntemplate <class S>\n__forceinline__ __device__ S make_epoch(const S& epoch) {\n  return epoch << EPOCH_BITS;\n}\n\ntemplate <class S>\n__forceinline__ __device__ S make_nano() {\n  return (SCORE_BITS_MASK & (device_nano<S>() >> RSHIFT_ON_NANO));\n}\n\ntemplate <class K, class V, class S>\nstruct ScoreFunctor<K, V, S, EvictStrategyInternal::kLru> {\n  static constexpr cuda::std::memory_order LOCK_MEM_ORDER =\n      cuda::std::memory_order_relaxed;\n  static constexpr cuda::std::memory_order UNLOCK_MEM_ORDER =\n      cuda::std::memory_order_relaxed;\n  using BUCKET = Bucket<K, V, S>;\n\n  __forceinline__ __device__ static S desired_when_missed(\n      const S* __restrict const input_scores, const int key_idx,\n      const S& epoch) {\n    return device_nano<S>();\n  }\n\n  __forceinline__ __device__ static void update(\n      BUCKET* __restrict bucket, const int key_pos,\n      const S* __restrict const input_scores, const int key_idx,\n      const S& desired_score_when_missed, const bool new_insert) {\n    bucket->scores(key_pos)->store(desired_score_when_missed,\n                                   cuda::std::memory_order_relaxed);\n    return;\n  }\n\n  __forceinline__ __device__ static void update_with_digest(\n      K* __restrict bucket_key_ptr, const uint32_t& key_pos,\n      const S* __restrict const input_scores, const uint32_t& key_idx,\n      const S& desired_score_when_missed, const uint32_t& bucket_capacity,\n      const D& digest, const bool new_insert) {\n    S* dst_score_ptr = BUCKET::scores(bucket_key_ptr, bucket_capacity, key_pos);\n    D* dst_digest_ptr =\n        BUCKET::digests(bucket_key_ptr, bucket_capacity, key_pos);\n    // Cache in L2 cache, bypass L1 Cache.\n    __stcg(dst_digest_ptr, digest);\n    __stcg(dst_score_ptr, device_nano<S>());\n    return;\n  }\n\n  __forceinline__ __device__ static void update_without_missed(\n      BUCKET* __restrict bucket, const int key_pos,\n      const S* 
__restrict const input_scores, const int key_idx,\n      const S& epoch) {\n    bucket->scores(key_pos)->store(device_nano<S>(),\n                                   cuda::std::memory_order_relaxed);\n    return;\n  }\n\n  __forceinline__ __device__ static void update_without_missed(\n      K* bucket_keys_ptr, const uint32_t bucket_capacity,\n      const uint32_t key_pos, const S* __restrict const input_scores,\n      const int key_idx, const S& epoch) {\n    S* dst_score_ptr =\n        BUCKET::scores(bucket_keys_ptr, bucket_capacity, key_pos);\n    // Cache in L2 cache, bypass L1 Cache.\n    __stcg(dst_score_ptr, device_nano<S>());\n  }\n};\n\ntemplate <class K, class V, class S>\nstruct ScoreFunctor<K, V, S, EvictStrategyInternal::kLfu> {\n  static constexpr cuda::std::memory_order LOCK_MEM_ORDER =\n      cuda::std::memory_order_acquire;\n  static constexpr cuda::std::memory_order UNLOCK_MEM_ORDER =\n      cuda::std::memory_order_release;\n  using BUCKET = Bucket<K, V, S>;\n\n  __forceinline__ __device__ static S desired_when_missed(\n      const S* __restrict const input_scores, const int key_idx,\n      const S& epoch) {\n    return static_cast<S>(MAX_SCORE);\n  }\n\n  __forceinline__ __device__ static void update(\n      BUCKET* __restrict bucket, const int key_pos,\n      const S* __restrict const input_scores, const int key_idx,\n      const S& desired_score_when_missed, const bool new_insert) {\n    if (input_scores == nullptr) return;\n    if (new_insert) {\n      bucket->scores(key_pos)->store(input_scores[key_idx],\n                                     cuda::std::memory_order_relaxed);\n    } else {\n      bucket->scores(key_pos)->fetch_add(input_scores[key_idx],\n                                         cuda::std::memory_order_relaxed);\n    }\n    return;\n  }\n\n  __forceinline__ __device__ static void update_with_digest(\n      K* __restrict bucket_key_ptr, const uint32_t& key_pos,\n      const S* __restrict const input_scores, const uint32_t& 
key_idx,\n      const S& desired_score_when_missed, const uint32_t& bucket_capacity,\n      const D& digest, const bool new_insert) {\n    if (input_scores == nullptr) return;\n\n    S* dst_score_ptr = BUCKET::scores(bucket_key_ptr, bucket_capacity, key_pos);\n    D* dst_digest_ptr =\n        BUCKET::digests(bucket_key_ptr, bucket_capacity, key_pos);\n    // Cache in L2 cache, bypass L1 Cache.\n    __stcg(dst_digest_ptr, digest);\n    if (new_insert) {\n      __stcg(dst_score_ptr, input_scores[key_idx]);\n    } else {\n      __stcg(dst_score_ptr, input_scores[key_idx] + *dst_score_ptr);\n    }\n    return;\n  }\n\n  __forceinline__ __device__ static void update_without_missed(\n      BUCKET* __restrict bucket, const int key_pos,\n      const S* __restrict const input_scores, const int key_idx,\n      const S& epoch) {\n    if (input_scores == nullptr) return;\n    bucket->scores(key_pos)->fetch_add(input_scores[key_idx],\n                                       cuda::std::memory_order_relaxed);\n    return;\n  }\n\n  __forceinline__ __device__ static void update_without_missed(\n      K* bucket_keys_ptr, const uint32_t bucket_capacity,\n      const uint32_t key_pos, const S* __restrict const input_scores,\n      const int key_idx, const S& epoch) {\n    if (input_scores == nullptr) return;\n    S* dst_score_ptr =\n        BUCKET::scores(bucket_keys_ptr, bucket_capacity, key_pos);\n    // Cache in L2 cache, bypass L1 Cache.\n    __stcg(dst_score_ptr, input_scores[key_idx] + *dst_score_ptr);\n  }\n};\n\ntemplate <class K, class V, class S>\nstruct ScoreFunctor<K, V, S, EvictStrategyInternal::kEpochLru> {\n  static constexpr cuda::std::memory_order LOCK_MEM_ORDER =\n      cuda::std::memory_order_relaxed;\n  static constexpr cuda::std::memory_order UNLOCK_MEM_ORDER =\n      cuda::std::memory_order_relaxed;\n  using BUCKET = Bucket<K, V, S>;\n\n  __forceinline__ __device__ static S desired_when_missed(\n      const S* __restrict const input_scores, const int key_idx,\n   
   const S& epoch) {\n    if (epoch == static_cast<S>(IGNORED_GLOBAL_EPOCH) &&\n        input_scores != nullptr) {\n      return input_scores[key_idx];\n    }\n    return make_epoch<S>(epoch) | make_nano<S>();\n  }\n\n  __forceinline__ __device__ static void update(\n      BUCKET* __restrict bucket, const int key_pos,\n      const S* __restrict const input_scores, const int key_idx,\n      const S& desired_score_when_missed, const bool new_insert) {\n    bucket->scores(key_pos)->store(desired_score_when_missed,\n                                   cuda::std::memory_order_relaxed);\n    return;\n  }\n\n  __forceinline__ __device__ static void update_with_digest(\n      K* __restrict bucket_key_ptr, const uint32_t& key_pos,\n      const S* __restrict const input_scores, const uint32_t& key_idx,\n      const S& desired_score_when_missed, const uint32_t& bucket_capacity,\n      const D& digest, const bool new_insert) {\n    S* dst_score_ptr = BUCKET::scores(bucket_key_ptr, bucket_capacity, key_pos);\n    D* dst_digest_ptr =\n        BUCKET::digests(bucket_key_ptr, bucket_capacity, key_pos);\n    // Cache in L2 cache, bypass L1 Cache.\n    __stcg(dst_digest_ptr, digest);\n    __stcg(dst_score_ptr, desired_score_when_missed);\n    return;\n  }\n\n  __forceinline__ __device__ static void update_without_missed(\n      BUCKET* __restrict bucket, const int key_pos,\n      const S* __restrict const input_scores, const int key_idx,\n      const S& epoch) {\n    bucket->scores(key_pos)->store(make_epoch<S>(epoch) | make_nano<S>(),\n                                   cuda::std::memory_order_relaxed);\n    return;\n  }\n\n  __forceinline__ __device__ static void update_without_missed(\n      K* bucket_keys_ptr, const uint32_t bucket_capacity,\n      const uint32_t key_pos, const S* __restrict const input_scores,\n      const int key_idx, const S& epoch) {\n    S* dst_score_ptr =\n        BUCKET::scores(bucket_keys_ptr, bucket_capacity, key_pos);\n    // Cache in L2 cache, bypass 
L1 Cache.\n    __stcg(dst_score_ptr, make_epoch<S>(epoch) | make_nano<S>());\n  }\n};\n\ntemplate <class K, class V, class S>\nstruct ScoreFunctor<K, V, S, EvictStrategyInternal::kEpochLfu> {\n  static constexpr cuda::std::memory_order LOCK_MEM_ORDER =\n      cuda::std::memory_order_relaxed;\n  static constexpr cuda::std::memory_order UNLOCK_MEM_ORDER =\n      cuda::std::memory_order_relaxed;\n  using BUCKET = Bucket<K, V, S>;\n\n  __forceinline__ __device__ static S desired_when_missed(\n      const S* __restrict const input_scores, const int key_idx,\n      const S epoch) {\n    if (epoch == static_cast<S>(IGNORED_GLOBAL_EPOCH)) {\n      return input_scores[key_idx];\n    }\n    return make_epoch<S>(epoch) | (input_scores[key_idx] & SCORE_BITS_MASK);\n  }\n\n  __forceinline__ __device__ static void update(\n      BUCKET* __restrict bucket, const int key_pos,\n      const S* __restrict const input_scores, const int key_idx,\n      const S& desired_score_when_missed, const bool new_insert) {\n    S new_score = desired_score_when_missed;\n    if (!new_insert) {\n      new_score =\n          (bucket->scores(key_pos)->load(cuda::std::memory_order_relaxed) &\n           SCORE_BITS_MASK);\n      if (SCORE_32BIT_MAX - new_score >\n          (desired_score_when_missed & SCORE_BITS_MASK)) {\n        new_score += desired_score_when_missed;\n      } else {\n        new_score =\n            (desired_score_when_missed & EPOCH_BITS_MASK) | SCORE_32BIT_MAX;\n      }\n    }\n    bucket->scores(key_pos)->store(new_score, cuda::std::memory_order_relaxed);\n    return;\n  }\n\n  __forceinline__ __device__ static void update_with_digest(\n      K* __restrict bucket_key_ptr, const uint32_t& key_pos,\n      const S* __restrict const input_scores, const uint32_t& key_idx,\n      const S& desired_score_when_missed, const uint32_t& bucket_capacity,\n      const D& digest, const bool new_insert) {\n    S new_score = desired_score_when_missed;\n    S* dst_score_ptr = 
BUCKET::scores(bucket_key_ptr, bucket_capacity, key_pos);\n    D* dst_digest_ptr =\n        BUCKET::digests(bucket_key_ptr, bucket_capacity, key_pos);\n    if (!new_insert) {\n      new_score = (*dst_score_ptr & SCORE_BITS_MASK);\n      if (SCORE_32BIT_MAX - new_score >\n          (desired_score_when_missed & SCORE_BITS_MASK)) {\n        new_score += desired_score_when_missed;\n      } else {\n        new_score =\n            (desired_score_when_missed & EPOCH_BITS_MASK) | SCORE_32BIT_MAX;\n      }\n    }\n    // Cache in L2 cache, bypass L1 Cache.\n    __stcg(dst_digest_ptr, digest);\n    __stcg(dst_score_ptr, new_score);\n    return;\n  }\n\n  __forceinline__ __device__ static void update_without_missed(\n      BUCKET* __restrict bucket, const int key_pos,\n      const S* __restrict const input_scores, const int key_idx,\n      const S& epoch) {\n    if (input_scores == nullptr) return;\n    S new_score =\n        (bucket->scores(key_pos)->load(cuda::std::memory_order_relaxed) &\n         SCORE_BITS_MASK);\n    if (SCORE_32BIT_MAX - new_score >\n        (input_scores[key_idx] & SCORE_BITS_MASK)) {\n      new_score +=\n          (make_epoch<S>(epoch) | (input_scores[key_idx] & SCORE_BITS_MASK));\n    } else {\n      new_score = make_epoch<S>(epoch) | SCORE_32BIT_MAX;\n    }\n\n    bucket->scores(key_pos)->store(new_score, cuda::std::memory_order_relaxed);\n    return;\n  }\n\n  __forceinline__ __device__ static void update_without_missed(\n      K* bucket_keys_ptr, const uint32_t bucket_capacity,\n      const uint32_t key_pos, const S* __restrict const input_scores,\n      const int key_idx, const S& epoch) {\n    if (input_scores == nullptr) return;\n    S* dst_score_ptr =\n        BUCKET::scores(bucket_keys_ptr, bucket_capacity, key_pos);\n    S new_score = *dst_score_ptr & SCORE_BITS_MASK;\n    if (SCORE_32BIT_MAX - new_score >\n        (input_scores[key_idx] & SCORE_BITS_MASK)) {\n      new_score +=\n          (make_epoch<S>(epoch) | (input_scores[key_idx] & 
SCORE_BITS_MASK));\n    } else {\n      new_score = make_epoch<S>(epoch) | SCORE_32BIT_MAX;\n    }\n    // Cache in L2 cache, bypass L1 Cache.\n    __stcg(dst_score_ptr, new_score);\n  }\n};\n\ntemplate <class K, class V, class S>\nstruct ScoreFunctor<K, V, S, EvictStrategyInternal::kCustomized> {\n  static constexpr cuda::std::memory_order LOCK_MEM_ORDER =\n      cuda::std::memory_order_acquire;\n  static constexpr cuda::std::memory_order UNLOCK_MEM_ORDER =\n      cuda::std::memory_order_release;\n  using BUCKET = Bucket<K, V, S>;\n\n  __forceinline__ __device__ static S desired_when_missed(\n      const S* __restrict const input_scores, const int key_idx,\n      const S& epoch) {\n    return input_scores[key_idx];\n  }\n\n  __forceinline__ __device__ static void update(\n      BUCKET* __restrict bucket, const int key_pos,\n      const S* __restrict const input_scores, const int key_idx,\n      const S& desired_score_when_missed, const bool new_insert) {\n    bucket->scores(key_pos)->store(desired_score_when_missed,\n                                   cuda::std::memory_order_relaxed);\n    return;\n  }\n\n  __forceinline__ __device__ static void update_with_digest(\n      K* __restrict bucket_key_ptr, const uint32_t& key_pos,\n      const S* __restrict const input_scores, const uint32_t& key_idx,\n      const S& desired_score_when_missed, const uint32_t& bucket_capacity,\n      const D& digest, const bool new_insert) {\n    S* dst_score_ptr = BUCKET::scores(bucket_key_ptr, bucket_capacity, key_pos);\n    D* dst_digest_ptr =\n        BUCKET::digests(bucket_key_ptr, bucket_capacity, key_pos);\n    // Cache in L2 cache, bypass L1 Cache.\n    __stcg(dst_digest_ptr, digest);\n    __stcg(dst_score_ptr, desired_score_when_missed);\n    return;\n  }\n  __forceinline__ __device__ static void update_without_missed(\n      BUCKET* __restrict bucket, const int key_pos,\n      const S* __restrict const input_scores, const int key_idx,\n      const S& epoch) {\n    if 
(input_scores == nullptr) return;\n    bucket->scores(key_pos)->store(input_scores[key_idx],\n                                   cuda::std::memory_order_relaxed);\n    return;\n  }\n\n  __forceinline__ __device__ static void update_without_missed(\n      K* bucket_keys_ptr, const uint32_t bucket_capacity,\n      const uint32_t key_pos, const S* __restrict const input_scores,\n      const int key_idx, const S& epoch) {\n    if (input_scores == nullptr) return;\n    S* dst_score_ptr =\n        BUCKET::scores(bucket_keys_ptr, bucket_capacity, key_pos);\n    // Cache in L2 cache, bypass L1 Cache.\n    __stcg(dst_score_ptr, input_scores[key_idx]);\n  }\n};\n\ntemplate <class V, uint32_t TILE_SIZE = 4>\n__device__ __forceinline__ void copy_vector(\n    cg::thread_block_tile<TILE_SIZE> const& g, const V* src, V* dst,\n    const size_t dim) {\n  for (auto i = g.thread_rank(); i < dim; i += g.size()) {\n    dst[i] = src[i];\n  }\n}\n\ntemplate <class K, class V, class S>\n__forceinline__ __device__ Bucket<K, V, S>* get_key_position(\n    Bucket<K, V, S>* __restrict buckets, const K key, size_t& bkt_idx,\n    size_t& start_idx, const size_t buckets_num, const size_t bucket_max_size) {\n  const K hashed_key = Murmur3HashDevice(key);\n  const size_t global_idx = hashed_key % (buckets_num * bucket_max_size);\n  bkt_idx = global_idx / bucket_max_size;\n  start_idx = global_idx % bucket_max_size;\n  start_idx -= start_idx % 4;\n  return buckets + bkt_idx;\n}\n\n__forceinline__ __device__ uint32_t get_start_position(\n    const uint64_t& global_idx, const uint64_t& bucket_capacity) {\n  uint32_t start_idx = global_idx & (bucket_capacity - 1);\n  start_idx -= start_idx % 4;\n  return start_idx;\n}\n\ntemplate <class K, class V, class S, uint32_t TILE_SIZE = 4>\n__device__ __forceinline__ OccupyResult find_without_lock(\n    cg::thread_block_tile<TILE_SIZE> g, Bucket<K, V, S>* __restrict__ bucket,\n    const K desired_key, const size_t start_idx, int& key_pos, int& src_lane,\n    
const size_t bucket_max_size) {\n  K expected_key = static_cast<K>(EMPTY_KEY);\n\n  AtomicKey<K>* current_key;\n\n  unsigned vote = 0;\n\n  for (uint32_t tile_offset = 0; tile_offset < bucket_max_size;\n       tile_offset += TILE_SIZE) {\n    key_pos = (start_idx + tile_offset + g.thread_rank()) % bucket_max_size;\n\n    current_key = bucket->keys(key_pos);\n\n    expected_key = current_key->load(cuda::std::memory_order_relaxed);\n    vote = g.ballot(desired_key == expected_key);\n    if (vote) {\n      src_lane = __ffs(vote) - 1;\n      key_pos = g.shfl(key_pos, src_lane);\n      return OccupyResult::DUPLICATE;\n    }\n    vote = g.ballot(expected_key == static_cast<K>(EMPTY_KEY));\n    if (vote) break;\n  }\n  return OccupyResult::CONTINUE;\n}\n\ntemplate <class K, class V, class S, uint32_t TILE_SIZE = 4>\n__device__ __inline__ OccupyResult find_and_lock_when_vacant(\n    cg::thread_block_tile<TILE_SIZE> g, Bucket<K, V, S>* __restrict__ bucket,\n    const K desired_key, const S desired_score, K& evicted_key,\n    const size_t start_idx, int& key_pos, int& src_lane,\n    const size_t bucket_max_size) {\n  K expected_key = static_cast<K>(EMPTY_KEY);\n\n  AtomicKey<K>* current_key;\n  AtomicScore<S>* current_score;\n\n  K local_min_score_key = static_cast<K>(EMPTY_KEY);\n\n  S local_min_score_val = static_cast<S>(MAX_SCORE);\n  S temp_min_score_val = static_cast<S>(MAX_SCORE);\n  int local_min_score_pos = -1;\n\n  unsigned vote = 0;\n  bool result = false;\n\n  for (uint32_t tile_offset = 0; tile_offset < bucket_max_size;\n       tile_offset += TILE_SIZE) {\n    key_pos = (start_idx + tile_offset + g.thread_rank()) % bucket_max_size;\n\n    current_key = bucket->keys(key_pos);\n\n    // Step 1: try find and lock the desired_key.\n    do {\n      expected_key = desired_key;\n      result = current_key->compare_exchange_strong(\n          expected_key, static_cast<K>(LOCKED_KEY),\n          cuda::std::memory_order_relaxed, cuda::std::memory_order_relaxed);\n      
vote = g.ballot(result);\n      if (vote) {\n        src_lane = __ffs(vote) - 1;\n        key_pos = g.shfl(key_pos, src_lane);\n        return OccupyResult::DUPLICATE;\n      }\n      vote = g.ballot(expected_key == static_cast<K>(LOCKED_KEY));\n      if (vote) continue;\n      vote = g.ballot(expected_key == static_cast<K>(EMPTY_KEY));\n      if (vote) break;\n    } while (vote != 0);\n\n    // Step 2: (TBD)try find empty location.\n    while (vote) {\n      src_lane = __ffs(vote) - 1;\n      if (src_lane == g.thread_rank()) {\n        expected_key = static_cast<K>(EMPTY_KEY);\n        result = current_key->compare_exchange_strong(\n            expected_key, static_cast<K>(LOCKED_KEY),\n            cuda::std::memory_order_relaxed, cuda::std::memory_order_relaxed);\n      }\n      result = g.shfl(result, src_lane);\n      if (result) {\n        key_pos = g.shfl(key_pos, src_lane);\n        return OccupyResult::OCCUPIED_EMPTY;\n      }\n      result = g.shfl((expected_key == desired_key ||\n                       expected_key == static_cast<K>(LOCKED_KEY)),\n                      src_lane);\n      if (result) {\n        return OccupyResult::CONTINUE;\n      }\n      vote -= ((unsigned(0x1)) << src_lane);\n    }\n  }\n\n  for (uint32_t tile_offset = 0; tile_offset < bucket_max_size;\n       tile_offset += TILE_SIZE) {\n    key_pos = (start_idx + tile_offset + g.thread_rank()) % bucket_max_size;\n\n    current_score = bucket->scores(key_pos);\n\n    // Step 4: record min score location.\n    temp_min_score_val = current_score->load(cuda::std::memory_order_relaxed);\n    if (temp_min_score_val < local_min_score_val) {\n      expected_key =\n          bucket->keys(key_pos)->load(cuda::std::memory_order_relaxed);\n      if (expected_key != static_cast<K>(LOCKED_KEY) &&\n          expected_key != static_cast<K>(EMPTY_KEY)) {\n        local_min_score_key = expected_key;\n        local_min_score_val = temp_min_score_val;\n        local_min_score_pos = key_pos;\n      }\n    
}\n  }\n  // Step 5: insert by evicting some one.\n  const S global_min_score_val =\n      cg::reduce(g, local_min_score_val, cg::less<S>());\n  if (desired_score < global_min_score_val) {\n    return OccupyResult::REFUSED;\n  }\n  vote = g.ballot(local_min_score_val <= global_min_score_val);\n  if (vote) {\n    src_lane = __ffs(vote) - 1;\n    result = false;\n    if (src_lane == g.thread_rank()) {\n      // TBD: Here can be compare_exchange_weak. Do benchmark.\n      current_key = bucket->keys(local_min_score_pos);\n      current_score = bucket->scores(local_min_score_pos);\n      evicted_key = local_min_score_key;\n      result = current_key->compare_exchange_strong(\n          local_min_score_key, static_cast<K>(LOCKED_KEY),\n          cuda::std::memory_order_relaxed, cuda::std::memory_order_relaxed);\n\n      // Need to recover when fail.\n      if (result && (current_score->load(cuda::std::memory_order_relaxed) >\n                     global_min_score_val)) {\n        current_key->store(local_min_score_key,\n                           cuda::std::memory_order_release);\n        result = false;\n      }\n    }\n    result = g.shfl(result, src_lane);\n    if (result) {\n      // Only the `src_lane` thread holds a correct `evicted_key`.\n      key_pos = g.shfl(local_min_score_pos, src_lane);\n      return (evicted_key == static_cast<K>(RECLAIM_KEY))\n                 ? 
OccupyResult::OCCUPIED_RECLAIMED\n                 : OccupyResult::EVICT;\n    }\n  }\n  return OccupyResult::CONTINUE;\n}\n\ntemplate <class K, class V, class S, uint32_t TILE_SIZE,\n          cuda::std::memory_order LOCK_MEM_ORDER,\n          cuda::std::memory_order UNLOCK_MEM_ORDER>\n__device__ __forceinline__ OccupyResult find_and_lock_when_full(\n    cg::thread_block_tile<TILE_SIZE> g, Bucket<K, V, S>* __restrict__ bucket,\n    const K desired_key, const S desired_score, K& evicted_key,\n    const size_t start_idx, int& key_pos, int& src_lane,\n    const size_t bucket_max_size) {\n  K expected_key = static_cast<K>(EMPTY_KEY);\n\n  AtomicKey<K>* current_key;\n  AtomicScore<S>* current_score;\n\n  K local_min_score_key = static_cast<K>(EMPTY_KEY);\n\n  S local_min_score_val = static_cast<S>(MAX_SCORE);\n  S temp_min_score_val = static_cast<S>(MAX_SCORE);\n  int local_min_score_pos = -1;\n\n  unsigned vote = 0;\n  bool result = false;\n\n  for (uint32_t tile_offset = 0; tile_offset < bucket_max_size;\n       tile_offset += TILE_SIZE) {\n    key_pos = (start_idx + tile_offset + g.thread_rank()) % bucket_max_size;\n\n    current_key = bucket->keys(key_pos);\n\n    // Step 1: try find and lock the desired_key.\n    do {\n      expected_key = desired_key;\n      result = current_key->compare_exchange_strong(\n          expected_key, static_cast<K>(LOCKED_KEY), LOCK_MEM_ORDER,\n          cuda::std::memory_order_relaxed);\n      vote = g.ballot(result);\n      if (vote) {\n        src_lane = __ffs(vote) - 1;\n        key_pos = g.shfl(key_pos, src_lane);\n        return OccupyResult::DUPLICATE;\n      }\n      vote = g.ballot(expected_key == static_cast<K>(LOCKED_KEY));\n    } while (vote != 0);\n  }\n\n  for (uint32_t tile_offset = 0; tile_offset < bucket_max_size;\n       tile_offset += TILE_SIZE) {\n    key_pos = (start_idx + tile_offset + g.thread_rank()) % bucket_max_size;\n\n    // Step 2: record min score location.\n    temp_min_score_val =\n        
bucket->scores(key_pos)->load(cuda::std::memory_order_relaxed);\n    if (temp_min_score_val < local_min_score_val) {\n      while ((expected_key = bucket->keys(key_pos)->load(LOCK_MEM_ORDER)) ==\n             static_cast<K>(LOCKED_KEY)) {\n      };\n      local_min_score_key = expected_key;\n      local_min_score_val = temp_min_score_val;\n      local_min_score_pos = key_pos;\n    }\n  }\n\n  // Step 3: insert by evicting some one.\n  const S global_min_score_val =\n      cg::reduce(g, local_min_score_val, cg::less<S>());\n  if (desired_score < global_min_score_val) {\n    return OccupyResult::REFUSED;\n  }\n  vote = g.ballot(local_min_score_val <= global_min_score_val);\n  if (vote) {\n    src_lane = __ffs(vote) - 1;\n    result = false;\n    if (src_lane == g.thread_rank()) {\n      // TBD: Here can be compare_exchange_weak. Do benchmark.\n      current_key = bucket->keys(local_min_score_pos);\n      current_score = bucket->scores(local_min_score_pos);\n      evicted_key = local_min_score_key;\n      result = current_key->compare_exchange_strong(\n          local_min_score_key, static_cast<K>(LOCKED_KEY), LOCK_MEM_ORDER,\n          cuda::std::memory_order_relaxed);\n\n      // Need to recover when fail.\n      if (result && (current_score->load(cuda::std::memory_order_relaxed) >\n                     global_min_score_val)) {\n        current_key->store(local_min_score_key, UNLOCK_MEM_ORDER);\n        result = false;\n      }\n    }\n    result = g.shfl(result, src_lane);\n    if (result) {\n      // Only the `src_lane` thread holds a correct `evicted_key`.\n      key_pos = g.shfl(local_min_score_pos, src_lane);\n      return (evicted_key == static_cast<K>(RECLAIM_KEY))\n                 ? 
OccupyResult::OCCUPIED_RECLAIMED\n                 : OccupyResult::EVICT;\n    }\n  }\n  return OccupyResult::CONTINUE;\n}\n\ntemplate <class K, class V, class S, uint32_t TILE_SIZE>\n__device__ __forceinline__ OccupyResult find_and_lock_for_update(\n    cg::thread_block_tile<TILE_SIZE> g, Bucket<K, V, S>* __restrict__ bucket,\n    const K desired_key, const size_t start_idx, int& key_pos, int& src_lane,\n    const size_t bucket_max_size) {\n  K expected_key = static_cast<K>(EMPTY_KEY);\n\n  AtomicKey<K>* current_key;\n\n  unsigned vote = 0;\n  bool result = false;\n\n  for (uint32_t tile_offset = 0; tile_offset < bucket_max_size;\n       tile_offset += TILE_SIZE) {\n    key_pos = (start_idx + tile_offset + g.thread_rank()) % bucket_max_size;\n\n    current_key = bucket->keys(key_pos);\n\n    // Step 1: try find and lock the desired_key.\n    do {\n      expected_key = desired_key;\n      result = current_key->compare_exchange_strong(\n          expected_key, static_cast<K>(LOCKED_KEY),\n          cuda::std::memory_order_relaxed, cuda::std::memory_order_relaxed);\n      vote = g.ballot(result);\n      if (vote) {\n        src_lane = __ffs(vote) - 1;\n        key_pos = g.shfl(key_pos, src_lane);\n        return OccupyResult::DUPLICATE;\n      }\n      vote = g.ballot(expected_key == static_cast<K>(EMPTY_KEY));\n      if (vote) return OccupyResult::REFUSED;\n      vote = g.ballot(expected_key == static_cast<K>(LOCKED_KEY));\n    } while (vote != 0);\n  }\n  return OccupyResult::REFUSED;\n}\n\n}  // namespace merlin\n}  // namespace nv\n"
  },
  {
    "path": "include/merlin/core_kernels/lookup.cuh",
    "content": "/*\n * Copyright (c) 2023, NVIDIA CORPORATION.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http:///www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n#pragma once\n\n#include \"kernel_utils.cuh\"\n\nnamespace nv {\nnamespace merlin {\n\ntemplate <typename K>\nstruct FoundFunctorV1 {\n  __host__ __device__ FoundFunctorV1(bool* __restrict founds_)\n      : founds(founds_) {}\n\n  __forceinline__ __device__ void operator()(const int idx, const K /*key*/,\n                                             const bool found) {\n    if (found) {\n      founds[idx] = true;\n    }\n  }\n\n  bool* __restrict founds;\n};\n\ntemplate <typename K>\nstruct FoundFunctorV2 {\n  __host__ __device__ FoundFunctorV2(K* __restrict missed_keys_,\n                                     int* __restrict missed_indices_,\n                                     int* __restrict missed_size_)\n      : missed_keys(missed_keys_),\n        missed_indices(missed_indices_),\n        missed_size(missed_size_) {}\n\n  __forceinline__ __device__ void operator()(const int idx, const K key,\n                                             const bool found) {\n    if (!found) {\n      int missed_idx = atomicAdd(missed_size, 1);\n      missed_keys[missed_idx] = key;\n      missed_indices[missed_idx] = idx;\n    }\n  }\n\n  K* __restrict missed_keys;\n  int* __restrict missed_indices;\n  int* __restrict missed_size;\n};\n\ntemplate <typename K = uint64_t, typename V = float, typename S = uint64_t>\nstruct LookupKernelParams {\n  
LookupKernelParams(Bucket<K, V, S>* __restrict buckets_, size_t buckets_num_,\n                     uint32_t dim_, const K* __restrict keys_,\n                     V* __restrict values_, S* __restrict scores_,\n                     bool* __restrict founds_, size_t n_)\n      : buckets(buckets_),\n        buckets_num(buckets_num_),\n        dim(dim_),\n        keys(keys_),\n        values(values_),\n        scores(scores_),\n        found_functor(founds_),\n        n(n_) {}\n  Bucket<K, V, S>* __restrict buckets;\n  size_t buckets_num;\n  uint32_t dim;\n  const K* __restrict keys;\n  V* __restrict values;\n  S* __restrict scores;\n  FoundFunctorV1<K> found_functor;\n  size_t n;\n};\n\ntemplate <typename K = uint64_t, typename V = float, typename S = uint64_t>\nstruct LookupKernelParamsV2 {\n  LookupKernelParamsV2(Bucket<K, V, S>* __restrict buckets_,\n                       size_t buckets_num_, uint32_t dim_,\n                       const K* __restrict keys_, V* __restrict values_,\n                       S* __restrict scores_, K* __restrict missed_keys_,\n                       int* __restrict missed_indices_,\n                       int* __restrict missed_size_, size_t n_)\n      : buckets(buckets_),\n        buckets_num(buckets_num_),\n        dim(dim_),\n        keys(keys_),\n        values(values_),\n        scores(scores_),\n        found_functor(missed_keys_, missed_indices_, missed_size_),\n        n(n_) {}\n\n  Bucket<K, V, S>* __restrict buckets;\n  size_t buckets_num;\n  uint32_t dim;\n  const K* __restrict keys;\n  V* __restrict values;\n  S* __restrict scores;\n  FoundFunctorV2<K> found_functor;\n  size_t n;\n};\n\n// Using 32 threads to deal with one key\ntemplate <typename K = uint64_t, typename V = float, typename S = uint64_t,\n          typename VecV = float4,\n          typename CopyScore = CopyScoreEmpty<S, K, 128>,\n          typename CopyValue = CopyValueTwoGroup<VecV, 32>,\n          typename FoundFunctor = FoundFunctorV1<K>, int VALUE_BUF = 
56>\n__global__ void lookup_kernel_with_io_pipeline_v1(\n    Bucket<K, V, S>* buckets, const size_t buckets_num, const int dim,\n    const K* __restrict keys, VecV* __restrict values, S* __restrict scores,\n    FoundFunctor found_functor, size_t n) {\n  constexpr int GROUP_SIZE = 32;\n  constexpr int RESERVE = 16;\n  constexpr int BLOCK_SIZE = 128;\n  constexpr int BUCKET_SIZE = 128;\n  constexpr int GROUP_NUM = BLOCK_SIZE / GROUP_SIZE;\n  constexpr int DIGEST_SPAN = BUCKET_SIZE / 4;\n\n  __shared__ int sm_target_digests[BLOCK_SIZE];\n  __shared__ K sm_target_keys[BLOCK_SIZE];\n  __shared__ K* sm_keys_ptr[BLOCK_SIZE];\n  __shared__ VecV* sm_values_ptr[BLOCK_SIZE];\n  __shared__ S sm_target_scores[BLOCK_SIZE];\n  // Reuse\n  int* sm_counts = sm_target_digests;\n  int* sm_founds = sm_counts;\n  // Double buffer\n  __shared__ uint32_t sm_probing_digests[2][GROUP_NUM * DIGEST_SPAN];\n  __shared__ K sm_possible_keys[2][GROUP_NUM * RESERVE];\n  __shared__ int sm_possible_pos[2][GROUP_NUM * RESERVE];\n  __shared__ VecV sm_vector[2][GROUP_NUM][VALUE_BUF];\n\n  // Initialization\n  auto g = cg::tiled_partition<GROUP_SIZE>(cg::this_thread_block());\n  int groupID = threadIdx.x / GROUP_SIZE;\n  int rank = g.thread_rank();\n  int key_idx_base = (blockIdx.x * blockDim.x) + groupID * GROUP_SIZE;\n  if (key_idx_base >= n) return;\n  int loop_num =\n      (n - key_idx_base) < GROUP_SIZE ? 
(n - key_idx_base) : GROUP_SIZE;\n  if (rank < loop_num) {\n    int idx_block = groupID * GROUP_SIZE + rank;\n    K target_key = keys[key_idx_base + rank];\n    sm_target_keys[idx_block] = target_key;\n    const K hashed_key = Murmur3HashDevice(target_key);\n    const uint8_t target_digest = static_cast<uint8_t>(hashed_key >> 32);\n    sm_target_digests[idx_block] = static_cast<uint32_t>(target_digest);\n    int global_idx = hashed_key % (buckets_num * BUCKET_SIZE);\n    int bkt_idx = global_idx / BUCKET_SIZE;\n    Bucket<K, V, S>* bucket = buckets + bkt_idx;\n    __pipeline_memcpy_async(sm_keys_ptr + idx_block, bucket->keys_addr(),\n                            sizeof(K*));\n    __pipeline_commit();\n    __pipeline_memcpy_async(sm_values_ptr + idx_block, &(bucket->vectors),\n                            sizeof(VecV*));\n  }\n  __pipeline_wait_prior(0);\n\n  // Pipeline loading\n  uint8_t* digests_ptr =\n      reinterpret_cast<uint8_t*>(sm_keys_ptr[groupID * GROUP_SIZE]) -\n      BUCKET_SIZE;\n  __pipeline_memcpy_async(sm_probing_digests[0] + groupID * DIGEST_SPAN + rank,\n                          digests_ptr + rank * 4, sizeof(uint32_t));\n  __pipeline_commit();\n  // Padding, meet the param of the first `__pipeline_wait_prior`\n  // in the first loop.\n  __pipeline_commit();\n  __pipeline_commit();\n\n  for (int i = 0; i < loop_num; i++) {\n    int key_idx_block = groupID * GROUP_SIZE + i;\n\n    /* Step1: prefetch all digests in one bucket */\n    if ((i + 1) < loop_num) {\n      uint8_t* digests_ptr =\n          reinterpret_cast<uint8_t*>(sm_keys_ptr[key_idx_block + 1]) -\n          BUCKET_SIZE;\n      __pipeline_memcpy_async(\n          sm_probing_digests[diff_buf(i)] + groupID * DIGEST_SPAN + rank,\n          digests_ptr + rank * 4, sizeof(uint32_t));\n    }\n    __pipeline_commit();\n\n    /* Step2: check digests and load possible keys */\n    uint32_t target_digest = sm_target_digests[key_idx_block];\n    uint32_t target_digests = __byte_perm(target_digest, 
target_digest, 0x0000);\n    sm_counts[key_idx_block] = 0;\n    __pipeline_wait_prior(3);\n    uint32_t probing_digests =\n        sm_probing_digests[same_buf(i)][groupID * DIGEST_SPAN + rank];\n    uint32_t find_result_ = __vcmpeq4(probing_digests, target_digests);\n    uint32_t find_result = 0;\n    if ((find_result_ & 0x01) != 0) find_result |= 0x01;\n    if ((find_result_ & 0x0100) != 0) find_result |= 0x02;\n    if ((find_result_ & 0x010000) != 0) find_result |= 0x04;\n    if ((find_result_ & 0x01000000) != 0) find_result |= 0x08;\n    int find_number = __popc(find_result);\n    int group_base = 0;\n    if (find_number > 0) {\n      group_base = atomicAdd(sm_counts + key_idx_block, find_number);\n    }\n    bool gt_reserve = (group_base + find_number) > RESERVE;\n    int gt_vote = g.ballot(gt_reserve);\n    K* key_ptr = sm_keys_ptr[key_idx_block];\n    if (gt_vote == 0) {\n      do {\n        int digest_idx = __ffs(find_result) - 1;\n        if (digest_idx >= 0) {\n          find_result &= (find_result - 1);\n          int key_pos = rank * 4 + digest_idx;\n          sm_possible_pos[same_buf(i)][groupID * RESERVE + group_base] =\n              key_pos;\n          __pipeline_memcpy_async(\n              sm_possible_keys[same_buf(i)] + (groupID * RESERVE + group_base),\n              key_ptr + key_pos, sizeof(K));\n          group_base += 1;\n        } else {\n          break;\n        }\n      } while (true);\n    } else {\n      K target_key = sm_target_keys[key_idx_block];\n      sm_counts[key_idx_block] = 0;\n      int found_vote = 0;\n      bool found = false;\n      do {\n        int digest_idx = __ffs(find_result) - 1;\n        if (digest_idx >= 0) {\n          find_result &= (find_result - 1);\n          int key_pos = rank * 4 + digest_idx;\n          K possible_key = key_ptr[key_pos];\n          if (possible_key == target_key) {\n            found = true;\n            sm_counts[key_idx_block] = 1;\n            sm_possible_pos[same_buf(i)][groupID * 
RESERVE] = key_pos;\n            sm_possible_keys[same_buf(i)][groupID * RESERVE] = possible_key;\n          }\n        }\n        found_vote = g.ballot(found);\n        if (found_vote) {\n          break;\n        }\n        found_vote = digest_idx >= 0;\n      } while (g.any(found_vote));\n    }\n    __pipeline_commit();\n\n    /* Step3: check possible keys, and prefetch the value and score */\n    if (i > 0) {\n      key_idx_block -= 1;\n      K target_key = sm_target_keys[key_idx_block];\n      int possible_num = sm_counts[key_idx_block];\n      sm_founds[key_idx_block] = 0;\n      S* score_ptr = CopyScore::get_base_ptr(sm_keys_ptr, key_idx_block);\n      VecV* value_ptr = sm_values_ptr[key_idx_block];\n      __pipeline_wait_prior(3);\n      int key_pos;\n      bool found_flag = false;\n      if (rank < possible_num) {\n        K possible_key =\n            sm_possible_keys[diff_buf(i)][groupID * RESERVE + rank];\n        key_pos = sm_possible_pos[diff_buf(i)][groupID * RESERVE + rank];\n        if (possible_key == target_key) {\n          found_flag = true;\n          CopyScore::ldg_sts(sm_target_scores + key_idx_block,\n                             score_ptr + key_pos);\n        }\n      }\n      int found_vote = g.ballot(found_flag);\n      if (found_vote) {\n        VecV* v_dst = sm_vector[diff_buf(i)][groupID];\n        sm_founds[key_idx_block] = 1;\n        int src_lane = __ffs(found_vote) - 1;\n        int target_pos = g.shfl(key_pos, src_lane);\n        VecV* v_src = value_ptr + target_pos * dim;\n        CopyValue::ldg_sts(rank, v_dst, v_src, dim);\n      }\n    }\n    __pipeline_commit();\n\n    /* Step4: write back value and score */\n    if (i > 1) {\n      key_idx_block -= 1;\n      int key_idx_grid = blockIdx.x * blockDim.x + key_idx_block;\n      VecV* v_src = sm_vector[same_buf(i)][groupID];\n      VecV* v_dst = values + key_idx_grid * dim;\n      int found_flag = sm_founds[key_idx_block];\n      __pipeline_wait_prior(3);\n      if (found_flag > 
0) {\n        S score_ = CopyScore::lgs(sm_target_scores + key_idx_block);\n        CopyValue::lds_stg(rank, v_dst, v_src, dim);\n        CopyScore::stg(scores + key_idx_grid, score_);\n      }\n    }\n  }  // End loop\n\n  /* Pipeline emptying: step3, i = loop_num */\n  {\n    int key_idx_block = groupID * GROUP_SIZE + (loop_num - 1);\n    K target_key = sm_target_keys[key_idx_block];\n    int possible_num = sm_counts[key_idx_block];\n    sm_founds[key_idx_block] = 0;\n    S* score_ptr = CopyScore::get_base_ptr(sm_keys_ptr, key_idx_block);\n    VecV* value_ptr = sm_values_ptr[key_idx_block];\n    __pipeline_wait_prior(1);\n    int key_pos;\n    bool found_flag = false;\n    if (rank < possible_num) {\n      key_pos = sm_possible_pos[diff_buf(loop_num)][groupID * RESERVE + rank];\n      K possible_key =\n          sm_possible_keys[diff_buf(loop_num)][groupID * RESERVE + rank];\n      if (target_key == possible_key) {\n        found_flag = true;\n        CopyScore::ldg_sts(sm_target_scores + key_idx_block,\n                           score_ptr + key_pos);\n      }\n    }\n    int found_vote = g.ballot(found_flag);\n    if (found_vote) {\n      sm_founds[key_idx_block] = 1;\n      int src_lane = __ffs(found_vote) - 1;\n      int target_pos = g.shfl(key_pos, src_lane);\n      VecV* v_src = value_ptr + target_pos * dim;\n      VecV* v_dst = sm_vector[diff_buf(loop_num)][groupID];\n      CopyValue::ldg_sts(rank, v_dst, v_src, dim);\n    }\n  }\n  __pipeline_commit();\n\n  /* Pipeline emptying: step4, i = loop_num */\n  if (loop_num > 1) {\n    int key_idx_block = groupID * GROUP_SIZE + loop_num - 2;\n    int key_idx_grid = blockIdx.x * blockDim.x + key_idx_block;\n    VecV* v_src = sm_vector[same_buf(loop_num)][groupID];\n    VecV* v_dst = values + key_idx_grid * dim;\n    int found_flag = sm_founds[key_idx_block];\n    __pipeline_wait_prior(1);\n    if (found_flag > 0) {\n      S score_ = CopyScore::lgs(sm_target_scores + key_idx_block);\n      CopyValue::lds_stg(rank, 
v_dst, v_src, dim);\n      CopyScore::stg(scores + key_idx_grid, score_);\n    }\n  }\n\n  /* Pipeline emptying: step4, i = loop_num + 1 */\n  {\n    int key_idx_block = groupID * GROUP_SIZE + loop_num - 1;\n    int key_idx_grid = blockIdx.x * blockDim.x + key_idx_block;\n    VecV* v_src = sm_vector[same_buf(loop_num + 1)][groupID];\n    VecV* v_dst = values + key_idx_grid * dim;\n    int found_flag = sm_founds[key_idx_block];\n    __pipeline_wait_prior(0);\n    if (found_flag > 0) {\n      S score_ = CopyScore::lgs(sm_target_scores + key_idx_block);\n      CopyValue::lds_stg(rank, v_dst, v_src, dim);\n      CopyScore::stg(scores + key_idx_grid, score_);\n    }\n  }\n\n  if (rank < loop_num) {\n    int key_idx_block = groupID * GROUP_SIZE + rank;\n    int key_idx_grid = blockIdx.x * blockDim.x + key_idx_block;\n    found_functor(key_idx_grid, sm_target_keys[key_idx_block],\n                  sm_founds[key_idx_block]);\n  }\n}  // End function\n\n// Using 16 threads to deal with one key\ntemplate <typename K = uint64_t, typename V = float, typename S = uint64_t,\n          typename VecV = float4,\n          typename CopyScore = CopyScoreEmpty<S, K, 128>,\n          typename CopyValue = CopyValueTwoGroup<VecV, 16>,\n          typename FoundFunctor = FoundFunctorV1<K>, int VALUE_BUF = 32>\n__global__ void lookup_kernel_with_io_pipeline_v2(\n    Bucket<K, V, S>* buckets, const size_t buckets_num, const int dim,\n    const K* __restrict keys, VecV* __restrict values, S* __restrict scores,\n    FoundFunctor found_functor, size_t n) {\n  constexpr int GROUP_SIZE = 16;\n  constexpr int RESERVE = 8;\n  constexpr int BLOCK_SIZE = 128;\n  constexpr int BUCKET_SIZE = 128;\n  constexpr int GROUP_NUM = BLOCK_SIZE / GROUP_SIZE;\n  constexpr int DIGEST_SPAN = BUCKET_SIZE / 4;\n\n  __shared__ int sm_target_digests[BLOCK_SIZE];\n  __shared__ K sm_target_keys[BLOCK_SIZE];\n  __shared__ K* sm_keys_ptr[BLOCK_SIZE];\n  __shared__ VecV* sm_values_ptr[BLOCK_SIZE];\n  __shared__ S 
sm_target_scores[BLOCK_SIZE];\n  // Reuse\n  int* sm_counts = sm_target_digests;\n  int* sm_founds = sm_counts;\n  // Double buffer\n  __shared__ uint32_t sm_probing_digests[2][GROUP_NUM * DIGEST_SPAN];\n  __shared__ K sm_possible_keys[2][GROUP_NUM * RESERVE];\n  __shared__ int sm_possible_pos[2][GROUP_NUM * RESERVE];\n  __shared__ VecV sm_vector[2][GROUP_NUM][VALUE_BUF];\n\n  // Initialization\n  auto g = cg::tiled_partition<GROUP_SIZE>(cg::this_thread_block());\n  int groupID = threadIdx.x / GROUP_SIZE;\n  int rank = g.thread_rank();\n  int key_idx_base = (blockIdx.x * blockDim.x) + groupID * GROUP_SIZE;\n  if (key_idx_base >= n) return;\n  int loop_num =\n      (n - key_idx_base) < GROUP_SIZE ? (n - key_idx_base) : GROUP_SIZE;\n  if (rank < loop_num) {\n    int idx_block = groupID * GROUP_SIZE + rank;\n    K target_key = keys[key_idx_base + rank];\n    sm_target_keys[idx_block] = target_key;\n    const K hashed_key = Murmur3HashDevice(target_key);\n    const uint8_t target_digest = static_cast<uint8_t>(hashed_key >> 32);\n    sm_target_digests[idx_block] = static_cast<uint32_t>(target_digest);\n    int global_idx = hashed_key % (buckets_num * BUCKET_SIZE);\n    int bkt_idx = global_idx / BUCKET_SIZE;\n    Bucket<K, V, S>* bucket = buckets + bkt_idx;\n    __pipeline_memcpy_async(sm_keys_ptr + idx_block, bucket->keys_addr(),\n                            sizeof(K*));\n    __pipeline_commit();\n    __pipeline_memcpy_async(sm_values_ptr + idx_block, &(bucket->vectors),\n                            sizeof(VecV*));\n  }\n  __pipeline_wait_prior(0);\n\n  // Pipeline loading\n  uint8_t* digests_ptr =\n      reinterpret_cast<uint8_t*>(sm_keys_ptr[groupID * GROUP_SIZE]) -\n      BUCKET_SIZE;\n  __pipeline_memcpy_async(\n      sm_probing_digests[0] + groupID * DIGEST_SPAN + rank * 2,\n      digests_ptr + rank * 8, sizeof(uint2));\n  __pipeline_commit();\n  // Padding, meet the param of the first `__pipeline_wait_prior`\n  // in the first loop.\n  __pipeline_commit();\n  
__pipeline_commit();\n\n  for (int i = 0; i < loop_num; i++) {\n    int key_idx_block = groupID * GROUP_SIZE + i;\n\n    /* Step1: prefetch all digests in one bucket */\n    if ((i + 1) < loop_num) {\n      uint8_t* digests_ptr =\n          reinterpret_cast<uint8_t*>(sm_keys_ptr[key_idx_block + 1]) -\n          BUCKET_SIZE;\n      __pipeline_memcpy_async(\n          sm_probing_digests[diff_buf(i)] + groupID * DIGEST_SPAN + rank * 2,\n          digests_ptr + rank * 8, sizeof(uint2));\n    }\n    __pipeline_commit();\n\n    /* Step2: check digests and load possible keys */\n    uint32_t target_digest = sm_target_digests[key_idx_block];\n    uint32_t target_digests = __byte_perm(target_digest, target_digest, 0x0000);\n    sm_counts[key_idx_block] = 0;\n    __pipeline_wait_prior(3);\n    uint32_t probing_digests =\n        sm_probing_digests[same_buf(i)][groupID * DIGEST_SPAN + rank];\n    uint32_t find_result_ = __vcmpeq4(probing_digests, target_digests);\n    uint32_t find_result = 0;\n    if ((find_result_ & 0x01) != 0) find_result |= 0x01;\n    if ((find_result_ & 0x0100) != 0) find_result |= 0x02;\n    if ((find_result_ & 0x010000) != 0) find_result |= 0x04;\n    if ((find_result_ & 0x01000000) != 0) find_result |= 0x08;\n    probing_digests = sm_probing_digests[same_buf(i)][groupID * DIGEST_SPAN +\n                                                      rank + GROUP_SIZE];\n    find_result_ = __vcmpeq4(probing_digests, target_digests);\n    if ((find_result_ & 0x01) != 0) find_result |= 0x10;\n    if ((find_result_ & 0x0100) != 0) find_result |= 0x20;\n    if ((find_result_ & 0x010000) != 0) find_result |= 0x40;\n    if ((find_result_ & 0x01000000) != 0) find_result |= 0x80;\n    int find_number = __popc(find_result);\n    int group_base = 0;\n    if (find_number > 0) {\n      group_base = atomicAdd(sm_counts + key_idx_block, find_number);\n    }\n    bool gt_reserve = (group_base + find_number) > RESERVE;\n    int gt_vote = g.ballot(gt_reserve);\n    K* key_ptr = 
sm_keys_ptr[key_idx_block];\n    if (gt_vote == 0) {\n      do {\n        int digest_idx = __ffs(find_result) - 1;\n        if (digest_idx >= 0) {\n          find_result &= (find_result - 1);\n          int key_pos = digest_idx < 4\n                            ? (rank * 4 + digest_idx)\n                            : ((GROUP_SIZE + rank - 1) * 4 + digest_idx);\n          sm_possible_pos[same_buf(i)][groupID * RESERVE + group_base] =\n              key_pos;\n          __pipeline_memcpy_async(\n              sm_possible_keys[same_buf(i)] + groupID * RESERVE + group_base,\n              key_ptr + key_pos, sizeof(K));\n          group_base += 1;\n        } else {\n          break;\n        }\n      } while (true);\n    } else {\n      K target_key = sm_target_keys[key_idx_block];\n      sm_counts[key_idx_block] = 0;\n      int found_vote = 0;\n      bool found = false;\n      do {\n        int digest_idx = __ffs(find_result) - 1;\n        if (digest_idx >= 0) {\n          find_result &= (find_result - 1);\n          int key_pos = digest_idx < 4\n                            ? 
(rank * 4 + digest_idx)\n                            : ((GROUP_SIZE + rank - 1) * 4 + digest_idx);\n          K possible_key = key_ptr[key_pos];\n          if (possible_key == target_key) {\n            found = true;\n            sm_counts[key_idx_block] = 1;\n            sm_possible_pos[same_buf(i)][groupID * RESERVE] = key_pos;\n            sm_possible_keys[same_buf(i)][groupID * RESERVE] = possible_key;\n          }\n        }\n        found_vote = g.ballot(found);\n        if (found_vote) {\n          break;\n        }\n        found_vote = digest_idx >= 0;\n      } while (g.any(found_vote));\n    }\n    __pipeline_commit();\n\n    /* Step3: check possible keys, and prefetch the value and score */\n    if (i > 0) {\n      key_idx_block -= 1;\n      K target_key = sm_target_keys[key_idx_block];\n      int possible_num = sm_counts[key_idx_block];\n      sm_founds[key_idx_block] = 0;\n      S* score_ptr = CopyScore::get_base_ptr(sm_keys_ptr, key_idx_block);\n      VecV* value_ptr = sm_values_ptr[key_idx_block];\n      __pipeline_wait_prior(3);\n      int key_pos;\n      bool found_flag = false;\n      if (rank < possible_num) {\n        K possible_key =\n            sm_possible_keys[diff_buf(i)][groupID * RESERVE + rank];\n        key_pos = sm_possible_pos[diff_buf(i)][groupID * RESERVE + rank];\n        if (possible_key == target_key) {\n          found_flag = true;\n          CopyScore::ldg_sts(sm_target_scores + key_idx_block,\n                             score_ptr + key_pos);\n        }\n      }\n      int found_vote = g.ballot(found_flag);\n      if (found_vote) {\n        sm_founds[key_idx_block] = 1;\n        int src_lane = __ffs(found_vote) - 1;\n        int target_pos = g.shfl(key_pos, src_lane);\n        VecV* v_src = value_ptr + target_pos * dim;\n        VecV* v_dst = sm_vector[diff_buf(i)][groupID];\n        CopyValue::ldg_sts(rank, v_dst, v_src, dim);\n      }\n    }\n    __pipeline_commit();\n\n    /* Step4: write back value and score */\n    if (i 
> 1) {\n      key_idx_block -= 1;\n      int key_idx_grid = blockIdx.x * blockDim.x + key_idx_block;\n      int found_flag = sm_founds[key_idx_block];\n      VecV* v_src = sm_vector[same_buf(i)][groupID];\n      VecV* v_dst = values + key_idx_grid * dim;\n      __pipeline_wait_prior(3);\n      if (found_flag > 0) {\n        S score_ = CopyScore::lgs(sm_target_scores + key_idx_block);\n        CopyValue::lds_stg(rank, v_dst, v_src, dim);\n        CopyScore::stg(scores + key_idx_grid, score_);\n      }\n    }\n  }  // End loop\n\n  /* Pipeline emptying: step3, i = loop_num */\n  {\n    int key_idx_block = groupID * GROUP_SIZE + (loop_num - 1);\n    K target_key = sm_target_keys[key_idx_block];\n    int possible_num = sm_counts[key_idx_block];\n    sm_founds[key_idx_block] = 0;\n    S* score_ptr = CopyScore::get_base_ptr(sm_keys_ptr, key_idx_block);\n    VecV* value_ptr = sm_values_ptr[key_idx_block];\n    __pipeline_wait_prior(1);\n    int key_pos;\n    bool found_flag = false;\n    if (rank < possible_num) {\n      key_pos = sm_possible_pos[diff_buf(loop_num)][groupID * RESERVE + rank];\n      K possible_key =\n          sm_possible_keys[diff_buf(loop_num)][groupID * RESERVE + rank];\n      if (possible_key == target_key) {\n        found_flag = true;\n        CopyScore::ldg_sts(sm_target_scores + key_idx_block,\n                           score_ptr + key_pos);\n      }\n    }\n    int found_vote = g.ballot(found_flag);\n    if (found_vote) {\n      sm_founds[key_idx_block] = 1;\n      int src_lane = __ffs(found_vote) - 1;\n      int target_pos = g.shfl(key_pos, src_lane);\n      VecV* v_src = value_ptr + target_pos * dim;\n      VecV* v_dst = sm_vector[diff_buf(loop_num)][groupID];\n      CopyValue::ldg_sts(rank, v_dst, v_src, dim);\n    }\n  }\n  __pipeline_commit();\n\n  /* Pipeline emptying: step4, i = loop_num */\n  if (loop_num > 1) {\n    int key_idx_block = groupID * GROUP_SIZE + loop_num - 2;\n    int key_idx_grid = blockIdx.x * blockDim.x + 
key_idx_block;\n    VecV* v_dst = values + key_idx_grid * dim;\n    VecV* v_src = sm_vector[same_buf(loop_num)][groupID];\n    int found_flag = sm_founds[key_idx_block];\n    __pipeline_wait_prior(1);\n    if (found_flag > 0) {\n      S score_ = CopyScore::lgs(sm_target_scores + key_idx_block);\n      CopyValue::lds_stg(rank, v_dst, v_src, dim);\n      CopyScore::stg(scores + key_idx_grid, score_);\n    }\n  }\n\n  /* Pipeline emptying: step4, i = loop_num + 1 */\n  {\n    int key_idx_block = groupID * GROUP_SIZE + loop_num - 1;\n    int key_idx_grid = blockIdx.x * blockDim.x + key_idx_block;\n    VecV* v_dst = values + key_idx_grid * dim;\n    VecV* v_src = sm_vector[same_buf(loop_num + 1)][groupID];\n    int found_flag = sm_founds[key_idx_block];\n    __pipeline_wait_prior(0);\n    if (found_flag > 0) {\n      S score_ = CopyScore::lgs(sm_target_scores + key_idx_block);\n      CopyValue::lds_stg(rank, v_dst, v_src, dim);\n      CopyScore::stg(scores + key_idx_grid, score_);\n    }\n  }\n\n  if (rank < loop_num) {\n    int key_idx_block = groupID * GROUP_SIZE + rank;\n    int key_idx_grid = blockIdx.x * blockDim.x + key_idx_block;\n    found_functor(key_idx_grid, sm_target_keys[key_idx_block],\n                  sm_founds[key_idx_block]);\n  }\n}  // End function\n\ntemplate <typename K, typename V, typename S, typename CopyScore, typename VecV,\n          uint32_t ValueBufSize>\nstruct LaunchPipelineLookupV1 {\n  template <template <typename, typename, typename> typename LookupKernelParams>\n  static void launch_kernel(LookupKernelParams<K, V, S>& params,\n                            cudaStream_t& stream) {\n    constexpr int BLOCK_SIZE = 128;\n    // Using 32 threads to deal with one key\n    constexpr int GROUP_SIZE = 32;\n    params.dim = params.dim * sizeof(V) / sizeof(VecV);\n    constexpr uint32_t VecSize = ValueBufSize / sizeof(VecV);\n    if (params.dim > (GROUP_SIZE * 2)) {\n      using CopyValue = CopyValueMultipleGroup<VecV, GROUP_SIZE>;\n      
lookup_kernel_with_io_pipeline_v1<K, V, S, VecV, CopyScore, CopyValue,\n                                        decltype(params.found_functor), VecSize>\n          <<<(params.n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(\n              params.buckets, params.buckets_num, params.dim, params.keys,\n              reinterpret_cast<VecV*>(params.values), params.scores,\n              params.found_functor, params.n);\n    } else if (params.dim > GROUP_SIZE) {\n      using CopyValue = CopyValueTwoGroup<VecV, GROUP_SIZE>;\n      lookup_kernel_with_io_pipeline_v1<K, V, S, VecV, CopyScore, CopyValue,\n                                        decltype(params.found_functor), VecSize>\n          <<<(params.n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(\n              params.buckets, params.buckets_num, params.dim, params.keys,\n              reinterpret_cast<VecV*>(params.values), params.scores,\n              params.found_functor, params.n);\n    } else {\n      using CopyValue = CopyValueOneGroup<VecV, GROUP_SIZE>;\n      lookup_kernel_with_io_pipeline_v1<K, V, S, VecV, CopyScore, CopyValue,\n                                        decltype(params.found_functor), VecSize>\n          <<<(params.n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(\n              params.buckets, params.buckets_num, params.dim, params.keys,\n              reinterpret_cast<VecV*>(params.values), params.scores,\n              params.found_functor, params.n);\n    }\n  }\n};\n\ntemplate <typename K, typename V, typename S, typename CopyScore, typename VecV,\n          uint32_t ValueBufSize>\nstruct LaunchPipelineLookupV2 {\n  template <template <typename, typename, typename> typename LookupKernelParams>\n  static void launch_kernel(LookupKernelParams<K, V, S>& params,\n                            cudaStream_t& stream) {\n    constexpr int BLOCK_SIZE = 128;\n    // Using 16 threads to deal with one key\n    constexpr int GROUP_SIZE = 16;\n    params.dim = params.dim * 
sizeof(V) / sizeof(VecV);\n    constexpr uint32_t VecSize = ValueBufSize / sizeof(VecV);\n    if (params.dim > (GROUP_SIZE * 2)) {\n      using CopyValue = CopyValueMultipleGroup<VecV, GROUP_SIZE>;\n      lookup_kernel_with_io_pipeline_v2<K, V, S, VecV, CopyScore, CopyValue,\n                                        decltype(params.found_functor), VecSize>\n          <<<(params.n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(\n              params.buckets, params.buckets_num, params.dim, params.keys,\n              reinterpret_cast<VecV*>(params.values), params.scores,\n              params.found_functor, params.n);\n    } else if (params.dim > GROUP_SIZE) {\n      using CopyValue = CopyValueTwoGroup<VecV, GROUP_SIZE>;\n      lookup_kernel_with_io_pipeline_v2<K, V, S, VecV, CopyScore, CopyValue,\n                                        decltype(params.found_functor), VecSize>\n          <<<(params.n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(\n              params.buckets, params.buckets_num, params.dim, params.keys,\n              reinterpret_cast<VecV*>(params.values), params.scores,\n              params.found_functor, params.n);\n    } else {\n      using CopyValue = CopyValueOneGroup<VecV, GROUP_SIZE>;\n      lookup_kernel_with_io_pipeline_v2<K, V, S, VecV, CopyScore, CopyValue,\n                                        decltype(params.found_functor), VecSize>\n          <<<(params.n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(\n              params.buckets, params.buckets_num, params.dim, params.keys,\n              reinterpret_cast<VecV*>(params.values), params.scores,\n              params.found_functor, params.n);\n    }\n  }\n};\n\ntemplate <typename ArchTag>\nstruct LookupValueBufConfig;\n\n/// TODO: support more arch\ntemplate <>\nstruct LookupValueBufConfig<Sm80> {\n  static constexpr uint32_t size_pipeline_v1 = 224 * sizeof(float);\n  static constexpr uint32_t size_pipeline_v2 = 128 * sizeof(float);\n};\n\ntemplate 
<>\nstruct LookupValueBufConfig<Sm70> {\n  static constexpr uint32_t size_pipeline_v1 = 112 * sizeof(float);\n  static constexpr uint32_t size_pipeline_v2 = 64 * sizeof(float);\n};\n\ntemplate <typename K, typename V, typename S = uint64_t,\n          typename ArchTag = Sm80>\nstruct SelectPipelineLookupKernelWithIO {\n  using ValueBufConfig = LookupValueBufConfig<ArchTag>;\n\n  static inline uint32_t max_value_size() {\n    return ValueBufConfig::size_pipeline_v1;\n  }\n\n  template <template <typename, typename, typename> typename LookupKernelParams>\n  static void select_kernel(LookupKernelParams<K, V, S>& params,\n                            cudaStream_t& stream) {\n    constexpr int BUCKET_SIZE = 128;\n    constexpr uint32_t buf_size_v1 = ValueBufConfig::size_pipeline_v1;\n    constexpr uint32_t buf_size_v2 = ValueBufConfig::size_pipeline_v2;\n\n    uint32_t total_value_size = static_cast<uint32_t>(params.dim * sizeof(V));\n\n    if (params.scores == nullptr) {\n      using CopyScore = CopyScoreEmpty<S, K, BUCKET_SIZE>;\n      if (total_value_size <= buf_size_v1) {\n        if (total_value_size % sizeof(float4) == 0) {\n          using VecV = float4;\n          LaunchPipelineLookupV1<K, V, S, CopyScore, VecV,\n                                 buf_size_v1>::launch_kernel(params, stream);\n        } else if (total_value_size % sizeof(float2) == 0) {\n          using VecV = float2;\n          LaunchPipelineLookupV1<K, V, S, CopyScore, VecV,\n                                 buf_size_v1>::launch_kernel(params, stream);\n        } else if (total_value_size % sizeof(float) == 0) {\n          using VecV = float;\n          LaunchPipelineLookupV1<K, V, S, CopyScore, VecV,\n                                 buf_size_v1>::launch_kernel(params, stream);\n        } else if (total_value_size % sizeof(uint16_t) == 0) {\n          using VecV = uint16_t;\n          LaunchPipelineLookupV1<K, V, S, CopyScore, VecV,\n                                 
buf_size_v1>::launch_kernel(params, stream);\n        } else {\n          using VecV = uint8_t;\n          LaunchPipelineLookupV1<K, V, S, CopyScore, VecV,\n                                 buf_size_v1>::launch_kernel(params, stream);\n        }\n      } else {\n        if (total_value_size % sizeof(float4) == 0) {\n          using VecV = float4;\n          LaunchPipelineLookupV2<K, V, S, CopyScore, VecV,\n                                 buf_size_v2>::launch_kernel(params, stream);\n        } else if (total_value_size % sizeof(float2) == 0) {\n          using VecV = float2;\n          LaunchPipelineLookupV2<K, V, S, CopyScore, VecV,\n                                 buf_size_v2>::launch_kernel(params, stream);\n        } else if (total_value_size % sizeof(float) == 0) {\n          using VecV = float;\n          LaunchPipelineLookupV2<K, V, S, CopyScore, VecV,\n                                 buf_size_v2>::launch_kernel(params, stream);\n        } else if (total_value_size % sizeof(uint16_t) == 0) {\n          using VecV = uint16_t;\n          LaunchPipelineLookupV2<K, V, S, CopyScore, VecV,\n                                 buf_size_v2>::launch_kernel(params, stream);\n        } else {\n          using VecV = uint8_t;\n          LaunchPipelineLookupV2<K, V, S, CopyScore, VecV,\n                                 buf_size_v2>::launch_kernel(params, stream);\n        }\n      }\n    } else {\n      using CopyScore = CopyScoreByPassCache<S, K, BUCKET_SIZE>;\n      if (total_value_size <= buf_size_v1) {\n        if (total_value_size % sizeof(float4) == 0) {\n          using VecV = float4;\n          LaunchPipelineLookupV1<K, V, S, CopyScore, VecV,\n                                 buf_size_v1>::launch_kernel(params, stream);\n        } else if (total_value_size % sizeof(float2) == 0) {\n          using VecV = float2;\n          LaunchPipelineLookupV1<K, V, S, CopyScore, VecV,\n                                 buf_size_v1>::launch_kernel(params, stream);\n        } else 
if (total_value_size % sizeof(float) == 0) {\n          using VecV = float;\n          LaunchPipelineLookupV1<K, V, S, CopyScore, VecV,\n                                 buf_size_v1>::launch_kernel(params, stream);\n        } else if (total_value_size % sizeof(uint16_t) == 0) {\n          using VecV = uint16_t;\n          LaunchPipelineLookupV1<K, V, S, CopyScore, VecV,\n                                 buf_size_v1>::launch_kernel(params, stream);\n        } else {\n          using VecV = uint8_t;\n          LaunchPipelineLookupV1<K, V, S, CopyScore, VecV,\n                                 buf_size_v1>::launch_kernel(params, stream);\n        }\n      } else {\n        if (total_value_size % sizeof(float4) == 0) {\n          using VecV = float4;\n          LaunchPipelineLookupV2<K, V, S, CopyScore, VecV,\n                                 buf_size_v2>::launch_kernel(params, stream);\n        } else if (total_value_size % sizeof(float2) == 0) {\n          using VecV = float2;\n          LaunchPipelineLookupV2<K, V, S, CopyScore, VecV,\n                                 buf_size_v2>::launch_kernel(params, stream);\n        } else if (total_value_size % sizeof(float) == 0) {\n          using VecV = float;\n          LaunchPipelineLookupV2<K, V, S, CopyScore, VecV,\n                                 buf_size_v2>::launch_kernel(params, stream);\n        } else if (total_value_size % sizeof(uint16_t) == 0) {\n          using VecV = uint16_t;\n          LaunchPipelineLookupV2<K, V, S, CopyScore, VecV,\n                                 buf_size_v2>::launch_kernel(params, stream);\n        } else {\n          using VecV = uint8_t;\n          LaunchPipelineLookupV2<K, V, S, CopyScore, VecV,\n                                 buf_size_v2>::launch_kernel(params, stream);\n        }\n      }\n    }\n  }  // End function\n};\n\n/* lookup with IO operation. 
This kernel is\n * usually used for the pure HBM mode for better performance.\n */\ntemplate <class K, class V, class S, class FoundFunctor, uint32_t TILE_SIZE = 4>\n__global__ void lookup_kernel_with_io(\n    const Table<K, V, S>* __restrict table, Bucket<K, V, S>* buckets,\n    const size_t bucket_max_size, const size_t buckets_num, const size_t dim,\n    const K* __restrict keys, V* __restrict values, S* __restrict scores,\n    FoundFunctor found_functor, size_t N) {\n  int* buckets_size = table->buckets_size;\n\n  auto g = cg::tiled_partition<TILE_SIZE>(cg::this_thread_block());\n  int rank = g.thread_rank();\n\n  for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N;\n       t += blockDim.x * gridDim.x) {\n    int key_idx = t / TILE_SIZE;\n\n    const K find_key = keys[key_idx];\n    if (IS_RESERVED_KEY<K>(find_key)) continue;\n\n    V* find_value = values + key_idx * dim;\n\n    int key_pos = -1;\n    int src_lane = -1;\n    size_t bkt_idx = 0;\n    size_t start_idx = 0;\n\n    Bucket<K, V, S>* bucket = get_key_position<K>(\n        buckets, find_key, bkt_idx, start_idx, buckets_num, bucket_max_size);\n\n    const int bucket_size = buckets_size[bkt_idx];\n    if (bucket_size >= bucket_max_size) {\n      start_idx = (start_idx / TILE_SIZE) * TILE_SIZE;\n    }\n\n    OccupyResult occupy_result{OccupyResult::INITIAL};\n    occupy_result = find_without_lock<K, V, S, TILE_SIZE>(\n        g, bucket, find_key, start_idx, key_pos, src_lane, bucket_max_size);\n\n    bool found = occupy_result == OccupyResult::DUPLICATE;\n    if (found) {\n      copy_vector<V, TILE_SIZE>(g, bucket->vectors + key_pos * dim, find_value,\n                                dim);\n      bool found = (rank == src_lane);\n      if (found) {\n        if (scores != nullptr) {\n          *(scores + key_idx) =\n              bucket->scores(key_pos)->load(cuda::std::memory_order_relaxed);\n        }\n      }\n    }\n    if (rank == 0) {\n      found_functor(key_idx, find_key, found);\n    
}\n  }\n}\n\ntemplate <typename K, typename V, typename S, typename FoundFunctor>\nstruct SelectLookupKernelWithIOImpl {\n  static void execute_kernel(const float& load_factor, const int& block_size,\n                             const size_t bucket_max_size,\n                             const size_t buckets_num, const size_t dim,\n                             cudaStream_t& stream, const size_t& n,\n                             const Table<K, V, S>* __restrict table,\n                             Bucket<K, V, S>* buckets, const K* __restrict keys,\n                             V* __restrict values, S* __restrict scores,\n                             const FoundFunctor& found_functor) {\n    if (load_factor <= 0.75) {\n      const unsigned int tile_size = 4;\n      const size_t N = n * tile_size;\n      const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);\n      lookup_kernel_with_io<K, V, S, FoundFunctor, tile_size>\n          <<<grid_size, block_size, 0, stream>>>(\n              table, buckets, bucket_max_size, buckets_num, dim, keys, values,\n              scores, found_functor, N);\n    } else {\n      const unsigned int tile_size = 16;\n      const size_t N = n * tile_size;\n      const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);\n      lookup_kernel_with_io<K, V, S, FoundFunctor, tile_size>\n          <<<grid_size, block_size, 0, stream>>>(\n              table, buckets, bucket_max_size, buckets_num, dim, keys, values,\n              scores, found_functor, N);\n    }\n    return;\n  }\n};\n\ntemplate <typename K, typename V, typename S>\nstruct SelectLookupKernelWithIO {\n  static void execute_kernel(const float& load_factor, const int& block_size,\n                             const size_t bucket_max_size,\n                             const size_t buckets_num, const size_t dim,\n                             cudaStream_t& stream, const size_t& n,\n                             const Table<K, V, S>* __restrict table,\n                         
    Bucket<K, V, S>* buckets, const K* __restrict keys,\n                             V* __restrict values, S* __restrict scores,\n                             bool* __restrict found) {\n    FoundFunctorV1<K> found_functor(found);\n    SelectLookupKernelWithIOImpl<K, V, S, decltype(found_functor)>::\n        execute_kernel(load_factor, block_size, bucket_max_size, buckets_num,\n                       dim, stream, n, table, buckets, keys, values, scores,\n                       found_functor);\n  }\n};\n\ntemplate <typename K, typename V, typename S>\nstruct SelectLookupKernelWithIOV2 {\n  static void execute_kernel(const float& load_factor, const int& block_size,\n                             const size_t bucket_max_size,\n                             const size_t buckets_num, const size_t dim,\n                             cudaStream_t& stream, const size_t& n,\n                             const Table<K, V, S>* __restrict table,\n                             Bucket<K, V, S>* buckets, const K* __restrict keys,\n                             V* __restrict values, S* __restrict scores,\n                             K* __restrict missed_keys,\n                             int* __restrict missed_indices,\n                             int* __restrict missed_size) {\n    FoundFunctorV2<K> found_functor(missed_keys, missed_indices, missed_size);\n    SelectLookupKernelWithIOImpl<K, V, S, decltype(found_functor)>::\n        execute_kernel(load_factor, block_size, bucket_max_size, buckets_num,\n                       dim, stream, n, table, buckets, keys, values, scores,\n                       found_functor);\n  }\n};\n\n// Use 1 thread to deal with a KV-pair, excluding copying value.\ntemplate <typename K, typename V, typename S,\n          typename FoundFunctor = FoundFunctorV1<K>>\n__device__ void tlp_lookup_kernel_hybrid_impl(\n    Bucket<K, V, S>* __restrict__ buckets, const uint64_t buckets_num,\n    uint32_t bucket_capacity, const uint32_t dim, const K* __restrict__ 
keys,\n    V** __restrict values, S* __restrict scores, int* __restrict dst_offset,\n    FoundFunctor found_functor, uint64_t n) {\n  using BUCKET = Bucket<K, V, S>;\n\n  uint32_t tx = threadIdx.x;\n  uint32_t kv_idx = blockIdx.x * blockDim.x + tx;\n  K key{static_cast<K>(EMPTY_KEY)};\n  S score{static_cast<S>(EMPTY_SCORE)};\n  OccupyResult occupy_result{OccupyResult::INITIAL};\n  VecD_Comp target_digests{0};\n  V* bucket_values_ptr{nullptr};\n  K* bucket_keys_ptr{nullptr};\n  uint32_t key_pos = {0};\n  if (kv_idx < n) {\n    key = keys[kv_idx];\n    if (dst_offset) dst_offset[kv_idx] = kv_idx;\n    if (!IS_RESERVED_KEY<K>(key)) {\n      const K hashed_key = Murmur3HashDevice(key);\n      target_digests = digests_from_hashed<K>(hashed_key);\n      uint64_t global_idx =\n          static_cast<uint64_t>(hashed_key % (buckets_num * bucket_capacity));\n      key_pos = get_start_position(global_idx, bucket_capacity);\n      uint64_t bkt_idx = global_idx / bucket_capacity;\n      BUCKET* bucket = buckets + bkt_idx;\n      bucket_keys_ptr = reinterpret_cast<K*>(bucket->keys(0));\n      bucket_values_ptr = reinterpret_cast<V*>(bucket->vectors);\n    } else {\n      return;\n    }\n  } else {\n    return;\n  }\n\n  // Load `STRIDE` digests every time.\n  constexpr uint32_t STRIDE = sizeof(VecD_Load) / sizeof(D);\n  // One more loop to handle empty keys.\n  for (int offset = 0; offset < bucket_capacity + STRIDE; offset += STRIDE) {\n    if (occupy_result != OccupyResult::INITIAL) break;\n\n    uint32_t pos_cur = align_to<STRIDE>(key_pos);\n    pos_cur = (pos_cur + offset) & (bucket_capacity - 1);\n\n    D* digests_ptr = BUCKET::digests(bucket_keys_ptr, bucket_capacity, pos_cur);\n    VecD_Load digests_vec = *(reinterpret_cast<VecD_Load*>(digests_ptr));\n    VecD_Comp digests_arr[4] = {digests_vec.x, digests_vec.y, digests_vec.z,\n                                digests_vec.w};\n\n    for (int i = 0; i < 4; i++) {\n      VecD_Comp probe_digests = digests_arr[i];\n      
uint32_t possible_pos = 0;\n      // Perform a vectorized comparison by byte,\n      // and if they are equal, set the corresponding byte in the result to\n      // 0xff.\n      int cmp_result = __vcmpeq4(probe_digests, target_digests);\n      cmp_result &= 0x01010101;\n      do {\n        if (cmp_result == 0) break;\n        // CUDA uses little endian,\n        // and the lowest byte in register stores in the lowest address.\n        uint32_t index = (__ffs(cmp_result) - 1) >> 3;\n        cmp_result &= (cmp_result - 1);\n        possible_pos = pos_cur + i * 4 + index;\n        auto current_key = bucket_keys_ptr[possible_pos];\n        score = *BUCKET::scores(bucket_keys_ptr, bucket_capacity, possible_pos);\n        bool found = (current_key == key);\n        if (found) {\n          key_pos = possible_pos;\n          if (scores) {\n            scores[kv_idx] = score;\n          }\n          values[kv_idx] = bucket_values_ptr + key_pos * dim;\n          found_functor(kv_idx, key, true);\n          return;\n        } else {\n          values[kv_idx] = nullptr;\n        }\n      } while (true);\n      VecD_Comp empty_digests_ = empty_digests<K>();\n      cmp_result = __vcmpeq4(probe_digests, empty_digests_);\n      cmp_result &= 0x01010101;\n      do {\n        if (cmp_result == 0) break;\n        uint32_t index = (__ffs(cmp_result) - 1) >> 3;\n        cmp_result &= (cmp_result - 1);\n        possible_pos = pos_cur + i * 4 + index;\n        if (offset == 0 && possible_pos < key_pos) continue;\n        auto current_key = bucket_keys_ptr[possible_pos];\n        if (current_key == static_cast<K>(EMPTY_KEY)) {\n          found_functor(kv_idx, key, false);\n          return;\n        }\n      } while (true);\n    }\n  }\n\n  found_functor(kv_idx, key, false);\n}\n\ntemplate <typename K, typename V, typename S>\n__global__ void tlp_lookup_kernel_hybrid(\n    Bucket<K, V, S>* __restrict__ buckets, const uint64_t buckets_num,\n    uint32_t bucket_capacity, const uint32_t dim, 
const K* __restrict__ keys,\n    V** __restrict values, S* __restrict scores, int* __restrict dst_offset,\n    bool* __restrict founds, uint64_t n) {\n  FoundFunctorV1<K> found_functor(founds);\n  tlp_lookup_kernel_hybrid_impl<K, V, S, decltype(found_functor)>(\n      buckets, buckets_num, bucket_capacity, dim, keys, values, scores,\n      dst_offset, found_functor, n);\n}\n\ntemplate <typename K, typename V, typename S>\n__global__ void tlp_lookup_kernel_hybrid(\n    Bucket<K, V, S>* __restrict__ buckets, const uint64_t buckets_num,\n    uint32_t bucket_capacity, const uint32_t dim, const K* __restrict__ keys,\n    V** __restrict values, S* __restrict scores, int* __restrict dst_offset,\n    K* __restrict missed_keys, int* __restrict missed_indices,\n    int* __restrict missed_size, uint64_t n) {\n  FoundFunctorV2<K> found_functor(missed_keys, missed_indices, missed_size);\n  tlp_lookup_kernel_hybrid_impl<K, V, S, decltype(found_functor)>(\n      buckets, buckets_num, bucket_capacity, dim, keys, values, scores,\n      dst_offset, found_functor, n);\n}\n\n/* lookup kernel.\n */\ntemplate <class K, class V, class S, class FoundFunctor, uint32_t TILE_SIZE = 4>\n__device__ void lookup_kernel_impl(\n    const Table<K, V, S>* __restrict table, Bucket<K, V, S>* buckets,\n    const size_t bucket_max_size, const size_t buckets_num, const size_t dim,\n    const K* __restrict keys, V** __restrict values, S* __restrict scores,\n    FoundFunctor found_functor, int* __restrict dst_offset, size_t N) {\n  int* buckets_size = table->buckets_size;\n\n  auto g = cg::tiled_partition<TILE_SIZE>(cg::this_thread_block());\n  int rank = g.thread_rank();\n\n  for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N;\n       t += blockDim.x * gridDim.x) {\n    int key_idx = t / TILE_SIZE;\n\n    const K find_key = keys[key_idx];\n    if (IS_RESERVED_KEY<K>(find_key)) continue;\n\n    int key_pos = -1;\n    int src_lane = -1;\n    size_t bkt_idx = 0;\n    size_t start_idx = 0;\n\n    
Bucket<K, V, S>* bucket = get_key_position<K>(\n        buckets, find_key, bkt_idx, start_idx, buckets_num, bucket_max_size);\n\n    const int bucket_size = buckets_size[bkt_idx];\n    if (bucket_size >= bucket_max_size) {\n      start_idx = (start_idx / TILE_SIZE) * TILE_SIZE;\n    }\n\n    if (dst_offset != nullptr && rank == 0) {\n      *(dst_offset + key_idx) = key_idx;\n    }\n\n    OccupyResult occupy_result{OccupyResult::INITIAL};\n    occupy_result = find_without_lock<K, V, S, TILE_SIZE>(\n        g, bucket, find_key, start_idx, key_pos, src_lane, bucket_max_size);\n\n    bool found = (occupy_result == OccupyResult::DUPLICATE);\n    if (found) {\n      if (rank == src_lane) {\n        *(values + key_idx) = (bucket->vectors + key_pos * dim);\n        if (scores != nullptr) {\n          *(scores + key_idx) =\n              bucket->scores(key_pos)->load(cuda::std::memory_order_relaxed);\n        }\n      }\n    } else {\n      if (rank == 0) {\n        *(values + key_idx) = nullptr;\n      }\n    }\n\n    if (rank == 0) {\n      found_functor(key_idx, find_key, found);\n    }\n  }\n}\n\ntemplate <class K, class V, class S, uint32_t TILE_SIZE = 4>\n__global__ void lookup_kernel(const Table<K, V, S>* __restrict table,\n                              Bucket<K, V, S>* buckets,\n                              const size_t bucket_max_size,\n                              const size_t buckets_num, const size_t dim,\n                              const K* __restrict keys, V** __restrict values,\n                              S* __restrict scores, bool* __restrict founds,\n                              int* __restrict dst_offset, size_t N) {\n  FoundFunctorV1<K> found_functor(founds);\n  lookup_kernel_impl<K, V, S, decltype(found_functor), TILE_SIZE>(\n      table, buckets, bucket_max_size, buckets_num, dim, keys, values, scores,\n      found_functor, dst_offset, N);\n}\n\ntemplate <class K, class V, class S, uint32_t TILE_SIZE = 4>\n__global__ void lookup_kernel(\n    
const Table<K, V, S>* __restrict table, Bucket<K, V, S>* buckets,\n    const size_t bucket_max_size, const size_t buckets_num, const size_t dim,\n    const K* __restrict keys, V** __restrict values, S* __restrict scores,\n    K* __restrict missed_keys, int* __restrict missed_indices,\n    int* __restrict missed_size, int* __restrict dst_offset, size_t N) {\n  FoundFunctorV2<K> found_functor(missed_keys, missed_indices, missed_size);\n  lookup_kernel_impl<K, V, S, decltype(found_functor), TILE_SIZE>(\n      table, buckets, bucket_max_size, buckets_num, dim, keys, values, scores,\n      found_functor, dst_offset, N);\n}\n\n}  // namespace merlin\n}  // namespace nv\n"
  },
  {
    "path": "include/merlin/core_kernels/lookup_ptr.cuh",
    "content": "/*\n * Copyright (c) 2023, NVIDIA CORPORATION.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n#pragma once\n\n#include \"kernel_utils.cuh\"\n\nnamespace nv {\nnamespace merlin {\n\n// Use 1 thread to deal with a KV-pair, including copying value.\ntemplate <typename K, typename V, typename S, int Strategy>\n__global__ void tlp_lookup_ptr_kernel_with_filter(\n    Bucket<K, V, S>* __restrict__ buckets, const uint64_t buckets_num,\n    uint32_t bucket_capacity, const uint32_t dim, const K* __restrict__ keys,\n    V** __restrict values, S* __restrict scores, bool* __restrict founds,\n    uint64_t n, bool update_score, const S global_epoch) {\n  using BUCKET = Bucket<K, V, S>;\n  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;\n  // Load `STRIDE` digests every time.\n  constexpr uint32_t STRIDE = sizeof(VecD_Load) / sizeof(D);\n\n  uint32_t tx = threadIdx.x;\n  uint32_t kv_idx = blockIdx.x * blockDim.x + tx;\n  K key{static_cast<K>(EMPTY_KEY)};\n  S score{static_cast<S>(EMPTY_SCORE)};\n  OccupyResult occupy_result{OccupyResult::INITIAL};\n  VecD_Comp target_digests{0};\n  V* bucket_values_ptr{nullptr};\n  K* bucket_keys_ptr{nullptr};\n  uint32_t key_pos = {0};\n  if (kv_idx < n) {\n    key = keys[kv_idx];\n    if (update_score) {\n      score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);\n    }\n    if (!IS_RESERVED_KEY<K>(key)) {\n      const K hashed_key = Murmur3HashDevice(key);\n      target_digests 
= digests_from_hashed<K>(hashed_key);\n      uint64_t global_idx =\n          static_cast<uint64_t>(hashed_key % (buckets_num * bucket_capacity));\n      key_pos = get_start_position(global_idx, bucket_capacity);\n      uint64_t bkt_idx = global_idx / bucket_capacity;\n      BUCKET* bucket = buckets + bkt_idx;\n      bucket_keys_ptr = reinterpret_cast<K*>(bucket->keys(0));\n      bucket_values_ptr = reinterpret_cast<V*>(bucket->vectors);\n    } else {\n      occupy_result = OccupyResult::ILLEGAL;\n      goto WRITE_BACK;\n    }\n  } else {\n    return;\n  }\n\n  // One more loop to handle empty keys.\n  for (int offset = 0; offset < bucket_capacity + STRIDE; offset += STRIDE) {\n    uint32_t pos_cur = align_to<STRIDE>(key_pos);\n    pos_cur = (pos_cur + offset) & (bucket_capacity - 1);\n\n    D* digests_ptr = BUCKET::digests(bucket_keys_ptr, bucket_capacity, pos_cur);\n    VecD_Load digests_vec = *(reinterpret_cast<VecD_Load*>(digests_ptr));\n    VecD_Comp digests_arr[4] = {digests_vec.x, digests_vec.y, digests_vec.z,\n                                digests_vec.w};\n\n    for (int i = 0; i < 4; i++) {\n      VecD_Comp probe_digests = digests_arr[i];\n      uint32_t possible_pos = 0;\n      // Perform a vectorized comparison by byte,\n      // and if they are equal, set the corresponding byte in the result to\n      // 0xff.\n      int cmp_result = __vcmpeq4(probe_digests, target_digests);\n      cmp_result &= 0x01010101;\n      do {\n        if (cmp_result == 0) break;\n        // CUDA uses little endian,\n        // and the lowest byte in register stores in the lowest address.\n        uint32_t index = (__ffs(cmp_result) - 1) >> 3;\n        cmp_result &= (cmp_result - 1);\n        possible_pos = pos_cur + i * 4 + index;\n        if (update_score) {\n          auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);\n          K expected_key = key;\n          // Modifications to the bucket will not happen before this instruction.\n          bool result = 
current_key->compare_exchange_strong(\n              expected_key, static_cast<K>(LOCKED_KEY),\n              cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);\n          if (result) {\n            occupy_result = OccupyResult::DUPLICATE;\n            key_pos = possible_pos;\n            ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores,\n                                             kv_idx, score, bucket_capacity,\n                                             get_digest<K>(key), false);\n            current_key->store(key, cuda::std::memory_order_release);\n            score = *BUCKET::scores(bucket_keys_ptr, bucket_capacity, key_pos);\n            goto WRITE_BACK;\n          }\n        } else {\n          auto current_key = bucket_keys_ptr[possible_pos];\n          score =\n              *BUCKET::scores(bucket_keys_ptr, bucket_capacity, possible_pos);\n          if (current_key == key) {\n            key_pos = possible_pos;\n            occupy_result = OccupyResult::DUPLICATE;\n            goto WRITE_BACK;\n          }\n        }\n      } while (true);\n      VecD_Comp empty_digests_ = empty_digests<K>();\n      cmp_result = __vcmpeq4(probe_digests, empty_digests_);\n      cmp_result &= 0x01010101;\n      do {\n        if (cmp_result == 0) break;\n        uint32_t index = (__ffs(cmp_result) - 1) >> 3;\n        cmp_result &= (cmp_result - 1);\n        possible_pos = pos_cur + i * 4 + index;\n        if (offset == 0 && possible_pos < key_pos) continue;\n        auto current_key = bucket_keys_ptr[possible_pos];\n        if (current_key == static_cast<K>(EMPTY_KEY)) {\n          occupy_result = OccupyResult::OCCUPIED_EMPTY;\n          goto WRITE_BACK;\n        }\n      } while (true);\n    }\n  }\n\nWRITE_BACK:\n  bool found_ = occupy_result == OccupyResult::DUPLICATE;\n  if (founds) {\n    founds[kv_idx] = found_;\n  }\n  if (found_) {\n    if (scores) {\n      scores[kv_idx] = score;\n    }\n    values[kv_idx] = bucket_values_ptr + 
key_pos * dim;\n  } else {\n    values[kv_idx] = nullptr;\n  }\n}\n\n// Pipelined pointer-return lookup: reuses the cooperative 32-thread digest scan\n// from lookup_kernel_with_io_pipeline_v1 (the value-copy find kernel) but skips\n// the value copy stages entirely, writing only V* pointers.  This ensures find*\n// throughput at lambda=1.0 is always >= find throughput.\ntemplate <typename K, typename V, typename S>\n__global__ void lookup_ptr_kernel_with_pipeline(\n    Bucket<K, V, S>* buckets, const size_t buckets_num, const int dim,\n    const K* __restrict keys, V** __restrict values, S* __restrict scores,\n    bool* __restrict founds, size_t n) {\n  constexpr int GROUP_SIZE = 32;\n  constexpr int RESERVE = 16;\n  constexpr int BLOCK_SIZE = 128;\n  constexpr int BUCKET_SIZE = 128;\n  constexpr int GROUP_NUM = BLOCK_SIZE / GROUP_SIZE;\n  constexpr int DIGEST_SPAN = BUCKET_SIZE / 4;\n\n  __shared__ int sm_target_digests[BLOCK_SIZE];\n  __shared__ K sm_target_keys[BLOCK_SIZE];\n  __shared__ K* sm_keys_ptr[BLOCK_SIZE];\n  __shared__ V* sm_values_ptr[BLOCK_SIZE];\n  // Reuse\n  int* sm_counts = sm_target_digests;\n  int* sm_founds = sm_counts;\n  // Double buffer\n  __shared__ uint32_t sm_probing_digests[2][GROUP_NUM * DIGEST_SPAN];\n  __shared__ K sm_possible_keys[2][GROUP_NUM * RESERVE];\n  __shared__ int sm_possible_pos[2][GROUP_NUM * RESERVE];\n\n  // Initialization\n  auto g = cg::tiled_partition<GROUP_SIZE>(cg::this_thread_block());\n  int groupID = threadIdx.x / GROUP_SIZE;\n  int rank = g.thread_rank();\n  int key_idx_base = (blockIdx.x * blockDim.x) + groupID * GROUP_SIZE;\n  if (key_idx_base >= n) return;\n  int loop_num =\n      (n - key_idx_base) < GROUP_SIZE ? 
(n - key_idx_base) : GROUP_SIZE;\n  if (rank < loop_num) {\n    int idx_block = groupID * GROUP_SIZE + rank;\n    K target_key = keys[key_idx_base + rank];\n    sm_target_keys[idx_block] = target_key;\n    const K hashed_key = Murmur3HashDevice(target_key);\n    const uint8_t target_digest = static_cast<uint8_t>(hashed_key >> 32);\n    sm_target_digests[idx_block] = static_cast<uint32_t>(target_digest);\n    int global_idx = hashed_key % (buckets_num * BUCKET_SIZE);\n    int bkt_idx = global_idx / BUCKET_SIZE;\n    Bucket<K, V, S>* bucket = buckets + bkt_idx;\n    __pipeline_memcpy_async(sm_keys_ptr + idx_block, bucket->keys_addr(),\n                            sizeof(K*));\n    __pipeline_commit();\n    __pipeline_memcpy_async(sm_values_ptr + idx_block, &(bucket->vectors),\n                            sizeof(V*));\n  }\n  __pipeline_wait_prior(0);\n\n  // Pipeline loading: prefetch digests for the first key\n  uint8_t* digests_ptr =\n      reinterpret_cast<uint8_t*>(sm_keys_ptr[groupID * GROUP_SIZE]) -\n      BUCKET_SIZE;\n  __pipeline_memcpy_async(sm_probing_digests[0] + groupID * DIGEST_SPAN + rank,\n                          digests_ptr + rank * 4, sizeof(uint32_t));\n  __pipeline_commit();\n  __pipeline_commit();\n\n  for (int i = 0; i < loop_num; i++) {\n    int key_idx_block = groupID * GROUP_SIZE + i;\n\n    /* Step1: prefetch all digests in one bucket */\n    if ((i + 1) < loop_num) {\n      uint8_t* digests_ptr =\n          reinterpret_cast<uint8_t*>(sm_keys_ptr[key_idx_block + 1]) -\n          BUCKET_SIZE;\n      __pipeline_memcpy_async(\n          sm_probing_digests[diff_buf(i)] + groupID * DIGEST_SPAN + rank,\n          digests_ptr + rank * 4, sizeof(uint32_t));\n    }\n    __pipeline_commit();\n\n    /* Step2: check digests and load possible keys */\n    uint32_t target_digest = sm_target_digests[key_idx_block];\n    uint32_t target_digests = __byte_perm(target_digest, target_digest, 0x0000);\n    sm_counts[key_idx_block] = 0;\n    
__pipeline_wait_prior(2);\n    uint32_t probing_digests =\n        sm_probing_digests[same_buf(i)][groupID * DIGEST_SPAN + rank];\n    uint32_t find_result_ = __vcmpeq4(probing_digests, target_digests);\n    uint32_t find_result = 0;\n    if ((find_result_ & 0x01) != 0) find_result |= 0x01;\n    if ((find_result_ & 0x0100) != 0) find_result |= 0x02;\n    if ((find_result_ & 0x010000) != 0) find_result |= 0x04;\n    if ((find_result_ & 0x01000000) != 0) find_result |= 0x08;\n    int find_number = __popc(find_result);\n    int group_base = 0;\n    if (find_number > 0) {\n      group_base = atomicAdd(sm_counts + key_idx_block, find_number);\n    }\n    bool gt_reserve = (group_base + find_number) > RESERVE;\n    int gt_vote = g.ballot(gt_reserve);\n    K* key_ptr = sm_keys_ptr[key_idx_block];\n    if (gt_vote == 0) {\n      do {\n        int digest_idx = __ffs(find_result) - 1;\n        if (digest_idx >= 0) {\n          find_result &= (find_result - 1);\n          int key_pos = rank * 4 + digest_idx;\n          sm_possible_pos[same_buf(i)][groupID * RESERVE + group_base] =\n              key_pos;\n          __pipeline_memcpy_async(\n              sm_possible_keys[same_buf(i)] + (groupID * RESERVE + group_base),\n              key_ptr + key_pos, sizeof(K));\n          group_base += 1;\n        } else {\n          break;\n        }\n      } while (true);\n    } else {\n      K target_key = sm_target_keys[key_idx_block];\n      sm_counts[key_idx_block] = 0;\n      int found_vote = 0;\n      bool found = false;\n      do {\n        int digest_idx = __ffs(find_result) - 1;\n        if (digest_idx >= 0) {\n          find_result &= (find_result - 1);\n          int key_pos = rank * 4 + digest_idx;\n          K possible_key = key_ptr[key_pos];\n          if (possible_key == target_key) {\n            found = true;\n            sm_counts[key_idx_block] = 1;\n            sm_possible_pos[same_buf(i)][groupID * RESERVE] = key_pos;\n            
sm_possible_keys[same_buf(i)][groupID * RESERVE] = possible_key;\n          }\n        }\n        found_vote = g.ballot(found);\n        if (found_vote) {\n          break;\n        }\n        found_vote = digest_idx >= 0;\n      } while (g.any(found_vote));\n    }\n    __pipeline_commit();\n\n    /* Step3: check possible keys, write back pointer immediately */\n    if (i > 0) {\n      int prev_idx_block = key_idx_block - 1;\n      K target_key = sm_target_keys[prev_idx_block];\n      int possible_num = sm_counts[prev_idx_block];\n      sm_founds[prev_idx_block] = 0;\n      __pipeline_wait_prior(2);\n      int key_pos = -1;\n      bool found_flag = false;\n      if (rank < possible_num) {\n        K possible_key =\n            sm_possible_keys[diff_buf(i)][groupID * RESERVE + rank];\n        key_pos = sm_possible_pos[diff_buf(i)][groupID * RESERVE + rank];\n        if (possible_key == target_key) {\n          found_flag = true;\n        }\n      }\n      int found_vote = g.ballot(found_flag);\n      if (found_vote) {\n        sm_founds[prev_idx_block] = 1;\n        int src_lane = __ffs(found_vote) - 1;\n        int target_pos = g.shfl(key_pos, src_lane);\n        // Write pointer directly (no value copy needed).\n        if (rank == 0) {\n          int key_idx_grid = blockIdx.x * blockDim.x + prev_idx_block;\n          values[key_idx_grid] =\n              sm_values_ptr[prev_idx_block] + target_pos * dim;\n          if (scores) {\n            S* score_ptr =\n                reinterpret_cast<S*>(sm_keys_ptr[prev_idx_block] + BUCKET_SIZE);\n            scores[key_idx_grid] = score_ptr[target_pos];\n          }\n        }\n      }\n    }\n  }  // End loop\n\n  /* Pipeline emptying: process the last key */\n  {\n    int key_idx_block = groupID * GROUP_SIZE + (loop_num - 1);\n    K target_key = sm_target_keys[key_idx_block];\n    int possible_num = sm_counts[key_idx_block];\n    sm_founds[key_idx_block] = 0;\n    __pipeline_wait_prior(0);\n    int key_pos = -1;\n    
bool found_flag = false;\n    if (rank < possible_num) {\n      key_pos = sm_possible_pos[diff_buf(loop_num)][groupID * RESERVE + rank];\n      K possible_key =\n          sm_possible_keys[diff_buf(loop_num)][groupID * RESERVE + rank];\n      if (target_key == possible_key) {\n        found_flag = true;\n      }\n    }\n    int found_vote = g.ballot(found_flag);\n    if (found_vote) {\n      sm_founds[key_idx_block] = 1;\n      int src_lane = __ffs(found_vote) - 1;\n      int target_pos = g.shfl(key_pos, src_lane);\n      if (rank == 0) {\n        int key_idx_grid = blockIdx.x * blockDim.x + key_idx_block;\n        values[key_idx_grid] = sm_values_ptr[key_idx_block] + target_pos * dim;\n        if (scores) {\n          S* score_ptr =\n              reinterpret_cast<S*>(sm_keys_ptr[key_idx_block] + BUCKET_SIZE);\n          scores[key_idx_grid] = score_ptr[target_pos];\n        }\n      }\n    }\n  }\n\n  // Write found flags and nullptr for misses.\n  if (rank < loop_num) {\n    int key_idx_block = groupID * GROUP_SIZE + rank;\n    int key_idx_grid = blockIdx.x * blockDim.x + key_idx_block;\n    bool found_ = sm_founds[key_idx_block] > 0;\n    if (founds) founds[key_idx_grid] = found_;\n    if (!found_) values[key_idx_grid] = nullptr;\n  }\n}\n\n/* lookup with IO operation. 
This kernel is\n * usually used for the pure HBM mode for better performance.\n */\ntemplate <class K, class V, class S, uint32_t TILE_SIZE = 4>\n__global__ void lookup_ptr_kernel(const Table<K, V, S>* __restrict table,\n                                  Bucket<K, V, S>* buckets,\n                                  const size_t bucket_max_size,\n                                  const size_t buckets_num, const size_t dim,\n                                  const K* __restrict keys,\n                                  V** __restrict values, S* __restrict scores,\n                                  bool* __restrict found, size_t N) {\n  int* buckets_size = table->buckets_size;\n\n  auto g = cg::tiled_partition<TILE_SIZE>(cg::this_thread_block());\n  int rank = g.thread_rank();\n\n  for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N;\n       t += blockDim.x * gridDim.x) {\n    int key_idx = t / TILE_SIZE;\n\n    const K find_key = keys[key_idx];\n    OccupyResult occupy_result{OccupyResult::INITIAL};\n    int key_pos = -1;\n    int src_lane = -1;\n    Bucket<K, V, S>* bucket{nullptr};\n    if (!IS_RESERVED_KEY<K>(find_key)) {\n      size_t bkt_idx = 0;\n      size_t start_idx = 0;\n\n      bucket = get_key_position<K>(buckets, find_key, bkt_idx, start_idx,\n                                   buckets_num, bucket_max_size);\n\n      const int bucket_size = buckets_size[bkt_idx];\n      if (bucket_size >= bucket_max_size) {\n        start_idx = (start_idx / TILE_SIZE) * TILE_SIZE;\n      }\n\n      occupy_result = find_without_lock<K, V, S, TILE_SIZE>(\n          g, bucket, find_key, start_idx, key_pos, src_lane, bucket_max_size);\n    } else {\n      occupy_result = OccupyResult::ILLEGAL;\n    }\n\n    if (rank == src_lane) {\n      bool found_ = occupy_result == OccupyResult::DUPLICATE;\n      if (found != nullptr) {\n        *(found + key_idx) = found_;\n      }\n      if (found_) {\n        values[key_idx] = bucket->vectors + key_pos * dim;\n        if 
(scores != nullptr) {\n          *(scores + key_idx) =\n              bucket->scores(key_pos)->load(cuda::std::memory_order_relaxed);\n        }\n      } else {\n        values[key_idx] = nullptr;\n      }\n    }\n  }\n}\n\ntemplate <typename K, typename V, typename S>\nstruct SelectLookupPtrKernel {\n  static void execute_kernel(const float& load_factor, const int& block_size,\n                             const size_t bucket_max_size,\n                             const size_t buckets_num, const size_t dim,\n                             cudaStream_t& stream, const size_t& n,\n                             const Table<K, V, S>* __restrict table,\n                             Bucket<K, V, S>* buckets, const K* __restrict keys,\n                             V** __restrict values, S* __restrict scores,\n                             bool* __restrict found) {\n    if (load_factor <= 0.75) {\n      const unsigned int tile_size = 4;\n      const size_t N = n * tile_size;\n      const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);\n      lookup_ptr_kernel<K, V, S, tile_size>\n          <<<grid_size, block_size, 0, stream>>>(\n              table, buckets, bucket_max_size, buckets_num, dim, keys, values,\n              scores, found, N);\n    } else {\n      const unsigned int tile_size = 16;\n      const size_t N = n * tile_size;\n      const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);\n      lookup_ptr_kernel<K, V, S, tile_size>\n          <<<grid_size, block_size, 0, stream>>>(\n              table, buckets, bucket_max_size, buckets_num, dim, keys, values,\n              scores, found, N);\n    }\n    return;\n  }\n};\n\n}  // namespace merlin\n}  // namespace nv"
  },
  {
    "path": "include/merlin/core_kernels/update.cuh",
    "content": "/*\n * Copyright (c) 2023, NVIDIA CORPORATION.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http:///www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n#pragma once\n\n#include \"kernel_utils.cuh\"\n\nnamespace nv {\nnamespace merlin {\n\n// Use 1 thread to deal with a KV-pair, including copying value.\ntemplate <typename K = uint64_t, typename V = byte4, typename S = uint64_t,\n          typename VecV = byte16, uint32_t BLOCK_SIZE = 128, int Strategy = -1>\n__global__ void tlp_update_kernel_with_io(\n    Bucket<K, V, S>* __restrict__ buckets, const uint64_t buckets_num,\n    uint32_t bucket_capacity, const uint32_t dim, const K* __restrict__ keys,\n    const VecV* __restrict__ values, const S* __restrict__ scores, uint64_t n,\n    const S global_epoch) {\n  using BUCKET = Bucket<K, V, S>;\n  using CopyValue = CopyValueMultipleGroup<VecV, 1>;\n  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;\n\n  uint32_t tx = threadIdx.x;\n  uint32_t kv_idx = blockIdx.x * blockDim.x + tx;\n  K key{static_cast<K>(EMPTY_KEY)};\n  OccupyResult occupy_result{OccupyResult::INITIAL};\n  VecD_Comp target_digests{0};\n  VecV* bucket_values_ptr{nullptr};\n  K* bucket_keys_ptr{nullptr};\n  uint32_t key_pos = {0};\n  if (kv_idx < n) {\n    key = keys[kv_idx];\n\n    if (!IS_RESERVED_KEY<K>(key)) {\n      const K hashed_key = Murmur3HashDevice(key);\n      target_digests = digests_from_hashed<K>(hashed_key);\n      uint64_t global_idx =\n          static_cast<uint64_t>(hashed_key % (buckets_num * 
bucket_capacity));\n      key_pos = get_start_position(global_idx, bucket_capacity);\n      uint64_t bkt_idx = global_idx / bucket_capacity;\n      BUCKET* bucket = buckets + bkt_idx;\n      bucket_keys_ptr = reinterpret_cast<K*>(bucket->keys(0));\n      bucket_values_ptr = reinterpret_cast<VecV*>(bucket->vectors);\n    } else {\n      return;\n    }\n  } else {\n    return;\n  }\n\n  // Load `STRIDE` digests every time.\n  constexpr uint32_t STRIDE = sizeof(VecD_Load) / sizeof(D);\n  // One more loop to handle empty keys.\n  for (int offset = 0; offset < bucket_capacity + STRIDE; offset += STRIDE) {\n    if (occupy_result != OccupyResult::INITIAL) break;\n\n    uint32_t pos_cur = align_to<STRIDE>(key_pos);\n    pos_cur = (pos_cur + offset) & (bucket_capacity - 1);\n\n    D* digests_ptr = BUCKET::digests(bucket_keys_ptr, bucket_capacity, pos_cur);\n    VecD_Load digests_vec = *(reinterpret_cast<VecD_Load*>(digests_ptr));\n    VecD_Comp digests_arr[4] = {digests_vec.x, digests_vec.y, digests_vec.z,\n                                digests_vec.w};\n\n    for (int i = 0; i < 4; i++) {\n      VecD_Comp probe_digests = digests_arr[i];\n      uint32_t possible_pos = 0;\n      bool result = false;\n      // Perform a vectorized comparison by byte,\n      // and if they are equal, set the corresponding byte in the result to\n      // 0xff.\n      int cmp_result = __vcmpeq4(probe_digests, target_digests);\n      cmp_result &= 0x01010101;\n      do {\n        if (cmp_result == 0) break;\n        // CUDA uses little endian,\n        // and the lowest byte in register stores in the lowest address.\n        uint32_t index = (__ffs(cmp_result) - 1) >> 3;\n        cmp_result &= (cmp_result - 1);\n        possible_pos = pos_cur + i * 4 + index;\n        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);\n        K expected_key = key;\n        // Modifications to the bucket will not be reordered before this\n        // instruction.\n        result = current_key->compare_exchange_strong(\n      
      expected_key, static_cast<K>(LOCKED_KEY),\n            cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);\n      } while (!result);\n      if (result) {\n        occupy_result = OccupyResult::DUPLICATE;\n        key_pos = possible_pos;\n        ScoreFunctor::update_without_missed(bucket_keys_ptr, bucket_capacity,\n                                            key_pos, scores, kv_idx,\n                                            global_epoch);\n        VecV* bucket_value_ptr = bucket_values_ptr + key_pos * dim;\n        const VecV* param_value_ptr = values + kv_idx * dim;\n        CopyValue::ldg_stg(0, bucket_value_ptr, param_value_ptr, dim);\n        auto key_address = BUCKET::keys(bucket_keys_ptr, key_pos);\n        // memory_order_release:\n        // Modifications to the bucket will not after this instruction.\n        key_address->store(key, cuda::std::memory_order_release);\n        return;\n      }\n      VecD_Comp empty_digests_ = empty_digests<K>();\n      cmp_result = __vcmpeq4(probe_digests, empty_digests_);\n      cmp_result &= 0x01010101;\n      do {\n        if (cmp_result == 0) break;\n        uint32_t index = (__ffs(cmp_result) - 1) >> 3;\n        cmp_result &= (cmp_result - 1);\n        possible_pos = pos_cur + i * 4 + index;\n        if (offset == 0 && possible_pos < key_pos) continue;\n        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);\n        auto probe_key = current_key->load(cuda::std::memory_order_relaxed);\n        if (probe_key == static_cast<K>(EMPTY_KEY)) {\n          return;\n        }\n      } while (true);\n    }\n  }\n}\ntemplate <typename K = uint64_t, typename V = byte4, typename S = uint64_t,\n          typename VecV = byte16, uint32_t BLOCK_SIZE = 128,\n          uint32_t GROUP_SIZE = 16, int Strategy = -1>\n__global__ void pipeline_update_kernel_with_io(\n    Bucket<K, V, S>* __restrict__ buckets, const uint64_t buckets_num,\n    const uint32_t dim, const K* __restrict__ keys,\n    const 
VecV* __restrict__ values, const S* __restrict__ scores, uint64_t n,\n    const S global_epoch) {\n  constexpr uint32_t BUCKET_SIZE = 128;\n  constexpr int GROUP_NUM = BLOCK_SIZE / GROUP_SIZE;\n  constexpr uint32_t Comp_LEN = sizeof(VecD_Comp) / sizeof(D);\n  // Here, GROUP_SIZE * Load_LEN = BUCKET_SIZE.\n  using VecD_Load = byte8;\n  constexpr uint32_t Load_LEN = sizeof(VecD_Load) / sizeof(D);\n  constexpr int RESERVE = 8;\n\n  using BUCKET = Bucket<K, V, S>;\n  using CopyValue = CopyValueMultipleGroup<VecV, GROUP_SIZE>;\n  using CopyScore = CopyScoreByPassCache<S, K, BUCKET_SIZE>;\n  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;\n\n  __shared__ VecD_Comp sm_target_digests[BLOCK_SIZE];\n  __shared__ K sm_target_keys[BLOCK_SIZE];\n  __shared__ K* sm_keys_ptr[BLOCK_SIZE];\n  __shared__ VecV* sm_values_ptr[BLOCK_SIZE];\n  // Reuse\n  int* sm_counts = reinterpret_cast<int*>(sm_target_digests);\n  int* sm_position = sm_counts;\n  // Double buffer\n  __shared__ D sm_digests[GROUP_NUM][2 * BUCKET_SIZE];\n  __shared__ K sm_possible_keys[GROUP_NUM][2 * RESERVE];\n  __shared__ int sm_possible_pos[GROUP_NUM][2 * RESERVE];\n  __shared__ S sm_scores[GROUP_NUM][2];\n  __shared__ int sm_ranks[GROUP_NUM][2];\n  // __shared__ VecV sm_values_buffer[GROUP_NUM][2 * dim];\n\n  extern __shared__ __align__(alignof(byte16)) byte sm_values_buffer[];\n\n  bool CAS_res[2]{false};\n\n  // Initialization\n  auto g = cg::tiled_partition<GROUP_SIZE>(cg::this_thread_block());\n  int groupID = threadIdx.x / GROUP_SIZE;\n  int rank = g.thread_rank();\n  uint64_t key_idx_base = (blockIdx.x * blockDim.x) + groupID * GROUP_SIZE;\n  if (key_idx_base >= n) return;\n  int loop_num =\n      (n - key_idx_base) < GROUP_SIZE ? 
(n - key_idx_base) : GROUP_SIZE;\n  if (rank < loop_num) {\n    int idx_block = groupID * GROUP_SIZE + rank;\n    K key = keys[key_idx_base + rank];\n    sm_target_keys[idx_block] = key;\n    const K hashed_key = Murmur3HashDevice(key);\n    sm_target_digests[idx_block] = digests_from_hashed<K>(hashed_key);\n    uint64_t global_idx = hashed_key % (buckets_num * BUCKET_SIZE);\n    uint64_t bkt_idx = global_idx / BUCKET_SIZE;\n    Bucket<K, V, S>* bucket = buckets + bkt_idx;\n    __pipeline_memcpy_async(sm_keys_ptr + idx_block, bucket->keys_addr(),\n                            sizeof(K*));\n    __pipeline_commit();\n    __pipeline_memcpy_async(sm_values_ptr + idx_block, &(bucket->vectors),\n                            sizeof(VecV*));\n  }\n  __pipeline_wait_prior(0);\n\n  // Pipeline loading\n  K* keys_ptr = sm_keys_ptr[groupID * GROUP_SIZE];\n  D* digests_ptr = BUCKET::digests(keys_ptr, BUCKET_SIZE, rank * Load_LEN);\n  __pipeline_memcpy_async(sm_digests[groupID] + rank * Load_LEN, digests_ptr,\n                          sizeof(VecD_Load));\n  __pipeline_commit();\n  // Padding, meet the param of the first `__pipeline_wait_prior`\n  // in the first loop.\n  __pipeline_commit();\n  __pipeline_commit();\n\n  for (int i = 0; i < loop_num; i++) {\n    int key_idx_block = groupID * GROUP_SIZE + i;\n\n    /* Step1: prefetch all digests in one bucket */\n    if ((i + 1) < loop_num) {\n      K* keys_ptr = sm_keys_ptr[key_idx_block + 1];\n      D* digests_ptr = BUCKET::digests(keys_ptr, BUCKET_SIZE, rank * Load_LEN);\n      __pipeline_memcpy_async(\n          sm_digests[groupID] + diff_buf(i) * BUCKET_SIZE + rank * Load_LEN,\n          digests_ptr, sizeof(VecD_Load));\n    }\n    __pipeline_commit();\n\n    /* Step2: check digests and load possible keys */\n    VecD_Comp target_digests = sm_target_digests[key_idx_block];\n    sm_counts[key_idx_block] = 0;\n    __pipeline_wait_prior(3);\n    VecD_Comp probing_digests = *reinterpret_cast<VecD_Comp*>(\n        
&sm_digests[groupID][same_buf(i) * BUCKET_SIZE + rank * Comp_LEN]);\n    uint32_t find_result_ = __vcmpeq4(probing_digests, target_digests);\n    uint32_t find_result = 0;\n    if ((find_result_ & 0x01) != 0) find_result |= 0x01;\n    if ((find_result_ & 0x0100) != 0) find_result |= 0x02;\n    if ((find_result_ & 0x010000) != 0) find_result |= 0x04;\n    if ((find_result_ & 0x01000000) != 0) find_result |= 0x08;\n    probing_digests = *reinterpret_cast<VecD_Comp*>(\n        &sm_digests[groupID][same_buf(i) * BUCKET_SIZE +\n                             (GROUP_SIZE + rank) * Comp_LEN]);\n    find_result_ = __vcmpeq4(probing_digests, target_digests);\n    if ((find_result_ & 0x01) != 0) find_result |= 0x10;\n    if ((find_result_ & 0x0100) != 0) find_result |= 0x20;\n    if ((find_result_ & 0x010000) != 0) find_result |= 0x40;\n    if ((find_result_ & 0x01000000) != 0) find_result |= 0x80;\n    int find_number = __popc(find_result);\n    int group_base = 0;\n    if (find_number > 0) {\n      group_base = atomicAdd(sm_counts + key_idx_block, find_number);\n    }\n    bool gt_reserve = (group_base + find_number) > RESERVE;\n    int gt_vote = g.ballot(gt_reserve);\n    K* key_ptr = sm_keys_ptr[key_idx_block];\n    if (gt_vote == 0) {\n      do {\n        int digest_idx = __ffs(find_result) - 1;\n        if (digest_idx >= 0) {\n          find_result &= (find_result - 1);\n          int key_pos = digest_idx < 4\n                            ? 
(rank * 4 + digest_idx)\n                            : ((GROUP_SIZE + rank - 1) * 4 + digest_idx);\n          sm_possible_pos[groupID][same_buf(i) * RESERVE + group_base] =\n              key_pos;\n          __pipeline_memcpy_async(\n              sm_possible_keys[groupID] + same_buf(i) * RESERVE + group_base,\n              key_ptr + key_pos, sizeof(K));\n          group_base += 1;\n        } else {\n          break;\n        }\n      } while (true);\n    } else {\n      K target_key = sm_target_keys[key_idx_block];\n      sm_counts[key_idx_block] = 0;\n      int found_vote = 0;\n      bool found = false;\n      do {\n        int digest_idx = __ffs(find_result) - 1;\n        if (digest_idx >= 0) {\n          find_result &= (find_result - 1);\n          int key_pos = digest_idx < 4\n                            ? (rank * 4 + digest_idx)\n                            : ((GROUP_SIZE + rank - 1) * 4 + digest_idx);\n          K possible_key = key_ptr[key_pos];\n          if (possible_key == target_key) {\n            found = true;\n            sm_counts[key_idx_block] = 1;\n            sm_possible_pos[groupID][same_buf(i) * RESERVE] = key_pos;\n            sm_possible_keys[groupID][same_buf(i) * RESERVE] = possible_key;\n          }\n        }\n        found_vote = g.ballot(found);\n        if (found_vote) {\n          break;\n        }\n        found_vote = digest_idx >= 0;\n      } while (g.any(found_vote));\n    }\n    __pipeline_commit();\n\n    /* Step3: check possible keys, and prefetch the value and score */\n    if (i > 0) {\n      key_idx_block -= 1;\n      K target_key = sm_target_keys[key_idx_block];\n      K* keys_ptr = sm_keys_ptr[key_idx_block];\n      int possible_num = sm_counts[key_idx_block];\n      sm_position[key_idx_block] = -1;\n      __pipeline_wait_prior(3);\n      int key_pos;\n      bool found_flag = false;\n      if (rank < possible_num) {\n        K possible_key =\n            sm_possible_keys[groupID][diff_buf(i) * RESERVE + rank];\n        
key_pos = sm_possible_pos[groupID][diff_buf(i) * RESERVE + rank];\n        if (possible_key == target_key) {\n          found_flag = true;\n          if (scores) {\n            int key_idx_grid = blockIdx.x * blockDim.x + key_idx_block;\n            const S* score_ptr = scores + key_idx_grid;\n            CopyScore::ldg_sts(sm_scores[groupID] + diff_buf(i), score_ptr);\n          }\n          auto key_ptr = BUCKET::keys(keys_ptr, key_pos);\n          sm_ranks[groupID][diff_buf(i)] = rank;\n          if (diff_buf(i) == 0) {\n            CAS_res[0] = key_ptr->compare_exchange_strong(\n                possible_key, static_cast<K>(LOCKED_KEY),\n                cuda::std::memory_order_acquire,\n                cuda::std::memory_order_relaxed);\n          } else {\n            CAS_res[1] = key_ptr->compare_exchange_strong(\n                possible_key, static_cast<K>(LOCKED_KEY),\n                cuda::std::memory_order_acquire,\n                cuda::std::memory_order_relaxed);\n          }\n        }\n      }\n      int found_vote = g.ballot(found_flag);\n      if (found_vote) {\n        int src_lane = __ffs(found_vote) - 1;\n        int target_pos = g.shfl(key_pos, src_lane);\n        sm_position[key_idx_block] = target_pos;\n        int key_idx_grid = blockIdx.x * blockDim.x + key_idx_block;\n        const VecV* v_src = values + key_idx_grid * dim;\n        auto tmp = reinterpret_cast<VecV*>(sm_values_buffer);\n        VecV* v_dst = tmp + (groupID * 2 + diff_buf(i)) * dim;\n        CopyValue::ldg_sts(rank, v_dst, v_src, dim);\n      }\n    }\n    __pipeline_commit();\n\n    /* Step4: write back value and score */\n    if (i > 1) {\n      key_idx_block -= 1;\n      VecV* value_ptr = sm_values_ptr[key_idx_block];\n      int target_pos = sm_position[key_idx_block];\n      K target_key = sm_target_keys[key_idx_block];\n      K* keys_ptr = sm_keys_ptr[key_idx_block];\n      int src_lane = sm_ranks[groupID][same_buf(i)];\n      __pipeline_wait_prior(3);\n      int succ = 
0;\n      if (rank == src_lane) {\n        bool CAS_res_cur = same_buf(i) == 0 ? CAS_res[0] : CAS_res[1];\n        succ = CAS_res_cur ? 1 : 0;\n      }\n      succ = g.shfl(succ, src_lane);\n      if (target_pos >= 0 && succ == 1) {\n        auto tmp = reinterpret_cast<VecV*>(sm_values_buffer);\n        VecV* v_src = tmp + (groupID * 2 + same_buf(i)) * dim;\n        VecV* v_dst = value_ptr + target_pos * dim;\n        CopyValue::lds_stg(rank, v_dst, v_src, dim);\n        ScoreFunctor::update_without_missed(keys_ptr, BUCKET_SIZE, target_pos,\n                                            sm_scores[groupID] + same_buf(i), 0,\n                                            global_epoch);\n        if (rank == 0) {\n          auto key_address = BUCKET::keys(keys_ptr, target_pos);\n          key_address->store(target_key, cuda::std::memory_order_release);\n        }\n      }\n    }\n  }  // End loop\n\n  /* Pipeline emptying: step3, i = loop_num */\n  {\n    int key_idx_block = groupID * GROUP_SIZE + (loop_num - 1);\n    K target_key = sm_target_keys[key_idx_block];\n    K* keys_ptr = sm_keys_ptr[key_idx_block];\n    int possible_num = sm_counts[key_idx_block];\n    sm_position[key_idx_block] = -1;\n    __pipeline_wait_prior(1);\n    int key_pos;\n    bool found_flag = false;\n    if (rank < possible_num) {\n      K possible_key =\n          sm_possible_keys[groupID][diff_buf(loop_num) * RESERVE + rank];\n      key_pos = sm_possible_pos[groupID][diff_buf(loop_num) * RESERVE + rank];\n      if (possible_key == target_key) {\n        found_flag = true;\n        if (scores) {\n          int key_idx_grid = blockIdx.x * blockDim.x + key_idx_block;\n          const S* score_ptr = scores + key_idx_grid;\n          CopyScore::ldg_sts(sm_scores[groupID] + diff_buf(loop_num),\n                             score_ptr);\n        }\n        auto key_ptr = BUCKET::keys(keys_ptr, key_pos);\n        sm_ranks[groupID][diff_buf(loop_num)] = rank;\n        if (diff_buf(loop_num) == 0) {\n        
  CAS_res[0] = key_ptr->compare_exchange_strong(\n              possible_key, static_cast<K>(LOCKED_KEY),\n              cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);\n        } else {\n          CAS_res[1] = key_ptr->compare_exchange_strong(\n              possible_key, static_cast<K>(LOCKED_KEY),\n              cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);\n        }\n      }\n    }\n    int found_vote = g.ballot(found_flag);\n    if (found_vote) {\n      int src_lane = __ffs(found_vote) - 1;\n      int target_pos = g.shfl(key_pos, src_lane);\n      sm_position[key_idx_block] = target_pos;\n      int key_idx_grid = blockIdx.x * blockDim.x + key_idx_block;\n      const VecV* v_src = values + key_idx_grid * dim;\n      auto tmp = reinterpret_cast<VecV*>(sm_values_buffer);\n      VecV* v_dst = tmp + (groupID * 2 + diff_buf(loop_num)) * dim;\n      CopyValue::ldg_sts(rank, v_dst, v_src, dim);\n    }\n  }\n  __pipeline_commit();\n\n  /* Pipeline emptying: step4, i = loop_num */\n  if (loop_num > 1) {\n    int key_idx_block = groupID * GROUP_SIZE + loop_num - 2;\n    VecV* value_ptr = sm_values_ptr[key_idx_block];\n    int target_pos = sm_position[key_idx_block];\n    K target_key = sm_target_keys[key_idx_block];\n    K* keys_ptr = sm_keys_ptr[key_idx_block];\n    int src_lane = sm_ranks[groupID][same_buf(loop_num)];\n    __pipeline_wait_prior(1);\n    int succ = 0;\n    if (rank == src_lane) {\n      bool CAS_res_cur = same_buf(loop_num) == 0 ? CAS_res[0] : CAS_res[1];\n      succ = CAS_res_cur ? 
1 : 0;\n    }\n    succ = g.shfl(succ, src_lane);\n    if (target_pos >= 0 && succ == 1) {\n      auto tmp = reinterpret_cast<VecV*>(sm_values_buffer);\n      VecV* v_src = tmp + (groupID * 2 + same_buf(loop_num)) * dim;\n      VecV* v_dst = value_ptr + target_pos * dim;\n      CopyValue::lds_stg(rank, v_dst, v_src, dim);\n      ScoreFunctor::update_without_missed(\n          keys_ptr, BUCKET_SIZE, target_pos,\n          sm_scores[groupID] + same_buf(loop_num), 0, global_epoch);\n\n      auto key_ptr = BUCKET::keys(keys_ptr, target_pos);\n      if (rank == 0) {\n        auto key_address = BUCKET::keys(keys_ptr, target_pos);\n        key_address->store(target_key, cuda::std::memory_order_release);\n      }\n    }\n  }\n\n  /* Pipeline emptying: step4, i = loop_num + 1 */\n  {\n    int key_idx_block = groupID * GROUP_SIZE + loop_num - 1;\n    VecV* value_ptr = sm_values_ptr[key_idx_block];\n    int target_pos = sm_position[key_idx_block];\n    K target_key = sm_target_keys[key_idx_block];\n    K* keys_ptr = sm_keys_ptr[key_idx_block];\n    int src_lane = sm_ranks[groupID][same_buf(loop_num + 1)];\n    __pipeline_wait_prior(0);\n    int succ = 0;\n    if (rank == src_lane) {\n      bool CAS_res_cur = same_buf(loop_num + 1) == 0 ? CAS_res[0] : CAS_res[1];\n      succ = CAS_res_cur ? 
1 : 0;\n    }\n    succ = g.shfl(succ, src_lane);\n    if (target_pos >= 0 && succ == 1) {\n      auto tmp = reinterpret_cast<VecV*>(sm_values_buffer);\n      VecV* v_src = tmp + (groupID * 2 + same_buf(loop_num + 1)) * dim;\n      VecV* v_dst = value_ptr + target_pos * dim;\n      CopyValue::lds_stg(rank, v_dst, v_src, dim);\n      ScoreFunctor::update_without_missed(\n          keys_ptr, BUCKET_SIZE, target_pos,\n          sm_scores[groupID] + same_buf(loop_num + 1), 0, global_epoch);\n      if (rank == 0) {\n        auto key_address = BUCKET::keys(keys_ptr, target_pos);\n        key_address->store(target_key, cuda::std::memory_order_release);\n      }\n    }\n  }\n}  // End function\n\ntemplate <typename K = uint64_t, typename V = float, typename S = uint64_t>\nstruct Params_Update {\n  Params_Update(float load_factor_, Bucket<K, V, S>* __restrict__ buckets_,\n                size_t buckets_num_, uint32_t bucket_capacity_, uint32_t dim_,\n                const K* __restrict__ keys_, const V* __restrict__ values_,\n                const S* __restrict__ scores_, size_t n_, const S global_epoch_)\n      : load_factor(load_factor_),\n        buckets(buckets_),\n        buckets_num(buckets_num_),\n        bucket_capacity(bucket_capacity_),\n        dim(dim_),\n        keys(keys_),\n        values(values_),\n        scores(scores_),\n        n(n_),\n        global_epoch(global_epoch_) {}\n  float load_factor;\n  Bucket<K, V, S>* __restrict__ buckets;\n  size_t buckets_num;\n  uint32_t bucket_capacity;\n  uint32_t dim;\n  const K* __restrict__ keys;\n  const V* __restrict__ values;\n  const S* __restrict__ scores;\n  uint64_t n;\n  const S global_epoch;\n};\n\ntemplate <typename K, typename V, typename S, typename VecV, int Strategy>\nstruct Launch_TLP_Update {\n  using Params = Params_Update<K, V, S>;\n  inline static void launch_kernel(Params& params, cudaStream_t& stream) {\n    constexpr int BLOCK_SIZE = 128;\n    params.dim = params.dim * sizeof(V) / 
sizeof(VecV);\n    tlp_update_kernel_with_io<K, V, S, VecV, BLOCK_SIZE, Strategy>\n        <<<(params.n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(\n            params.buckets, params.buckets_num, params.bucket_capacity,\n            params.dim, params.keys,\n            reinterpret_cast<const VecV*>(params.values), params.scores,\n            params.n, params.global_epoch);\n  }\n};\n\ntemplate <typename K, typename V, typename S, typename VecV, int Strategy>\nstruct Launch_Pipeline_Update {\n  using Params = Params_Update<K, V, S>;\n  inline static void launch_kernel(Params& params, cudaStream_t& stream) {\n    constexpr int BLOCK_SIZE = 128;\n    constexpr uint32_t GROUP_SIZE = 16;\n    constexpr uint32_t GROUP_NUM = BLOCK_SIZE / GROUP_SIZE;\n\n    params.dim = params.dim * sizeof(V) / sizeof(VecV);\n    uint32_t shared_mem = GROUP_NUM * 2 * params.dim * sizeof(VecV);\n    shared_mem =\n        (shared_mem + sizeof(byte16) - 1) / sizeof(byte16) * sizeof(byte16);\n    pipeline_update_kernel_with_io<K, V, S, VecV, BLOCK_SIZE, GROUP_SIZE,\n                                   Strategy>\n        <<<(params.n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, shared_mem,\n           stream>>>(params.buckets, params.buckets_num, params.dim,\n                     params.keys, reinterpret_cast<const VecV*>(params.values),\n                     params.scores, params.n, params.global_epoch);\n  }\n};\n\ntemplate <typename ArchTag>\nstruct ValueConfig_Update;\n\n/// TODO: support more arch.\ntemplate <>\nstruct ValueConfig_Update<Sm80> {\n  // Value size greater than it will bring poor performance for TLP.\n  static constexpr uint32_t size_tlp = 8 * sizeof(byte4);\n  // Value size greater than it will reduce the occupancy for Pipeline.\n  // When the value is very high, the kernel will fail to launch.\n  static constexpr uint32_t size_pipeline = 128 * sizeof(byte4);\n};\n\ntemplate <>\nstruct ValueConfig_Update<Sm70> {\n  // Value size greater than it will bring poor 
performance for TLP.\n  static constexpr uint32_t size_tlp = 8 * sizeof(byte4);\n  // Value size greater than it will reduce the occupancy for Pipeline.\n  // When the value is very high, the kernel will fail to launch.\n  static constexpr uint32_t size_pipeline = 64 * sizeof(byte4);\n};\n\ntemplate <typename K, typename V, typename S, int Strategy, typename ArchTag>\nstruct KernelSelector_Update {\n  using ValueConfig = ValueConfig_Update<ArchTag>;\n  using Params = Params_Update<K, V, S>;\n\n  static bool callable(bool unique_key, uint32_t bucket_size, uint32_t dim) {\n    constexpr uint32_t MinBucketCap = sizeof(VecD_Load) / sizeof(D);\n    if (!unique_key || bucket_size < MinBucketCap) return false;\n    uint32_t value_size = dim * sizeof(V);\n    if (value_size <= ValueConfig::size_tlp) return true;\n    if (bucket_size == 128 && value_size <= ValueConfig::size_pipeline) {\n      return true;\n    }\n    return false;\n  }\n\n  static void select_kernel(Params& params, cudaStream_t& stream) {\n    const uint32_t total_value_size =\n        static_cast<uint32_t>(params.dim * sizeof(V));\n\n    auto launch_TLP = [&]() {\n      if (total_value_size % sizeof(byte16) == 0) {\n        using VecV = byte16;\n        Launch_TLP_Update<K, V, S, VecV, Strategy>::launch_kernel(params,\n                                                                  stream);\n      } else if (total_value_size % sizeof(byte8) == 0) {\n        using VecV = byte8;\n        Launch_TLP_Update<K, V, S, VecV, Strategy>::launch_kernel(params,\n                                                                  stream);\n      } else if (total_value_size % sizeof(byte4) == 0) {\n        using VecV = byte4;\n        Launch_TLP_Update<K, V, S, VecV, Strategy>::launch_kernel(params,\n                                                                  stream);\n      } else if (total_value_size % sizeof(byte2) == 0) {\n        using VecV = byte2;\n        Launch_TLP_Update<K, V, S, VecV, 
Strategy>::launch_kernel(params,\n                                                                  stream);\n      } else {\n        using VecV = byte;\n        Launch_TLP_Update<K, V, S, VecV, Strategy>::launch_kernel(params,\n                                                                  stream);\n      }\n    };\n\n    auto launch_Pipeline = [&]() {\n      if (total_value_size % sizeof(byte16) == 0) {\n        using VecV = byte16;\n        Launch_Pipeline_Update<K, V, S, VecV, Strategy>::launch_kernel(params,\n                                                                       stream);\n      } else if (total_value_size % sizeof(byte8) == 0) {\n        using VecV = byte8;\n        Launch_Pipeline_Update<K, V, S, VecV, Strategy>::launch_kernel(params,\n                                                                       stream);\n      } else if (total_value_size % sizeof(byte4) == 0) {\n        using VecV = byte4;\n        Launch_Pipeline_Update<K, V, S, VecV, Strategy>::launch_kernel(params,\n                                                                       stream);\n      } else if (total_value_size % sizeof(byte2) == 0) {\n        using VecV = byte2;\n        Launch_Pipeline_Update<K, V, S, VecV, Strategy>::launch_kernel(params,\n                                                                       stream);\n      } else {\n        using VecV = byte;\n        Launch_Pipeline_Update<K, V, S, VecV, Strategy>::launch_kernel(params,\n                                                                       stream);\n      }\n    };\n    // This part is according to the test on A100.\n    if (params.bucket_capacity != 128) {\n      launch_TLP();\n    } else {\n      if (total_value_size <= ValueConfig::size_tlp) {\n        if (params.load_factor <= 0.60f) {\n          launch_TLP();\n        } else {\n          launch_Pipeline();\n        }\n      } else {\n        launch_Pipeline();\n      }\n    }\n  }  // End function\n};\n\n/*\n * update with IO 
operation. This kernel is\n * usually used for the pure HBM mode for better performance.\n */\ntemplate <class K, class V, class S, int Strategy, uint32_t TILE_SIZE = 4>\n__global__ void update_kernel_with_io(\n    const Table<K, V, S>* __restrict table, Bucket<K, V, S>* buckets,\n    const size_t bucket_max_size, const size_t buckets_num, const size_t dim,\n    const K* __restrict keys, const V* __restrict values,\n    const S* __restrict scores, const S global_epoch, const size_t N) {\n  auto g = cg::tiled_partition<TILE_SIZE>(cg::this_thread_block());\n  int* buckets_size = table->buckets_size;\n\n  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;\n\n  for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N;\n       t += blockDim.x * gridDim.x) {\n    int key_pos = -1;\n    size_t key_idx = t / TILE_SIZE;\n\n    const K update_key = keys[key_idx];\n\n    if (IS_RESERVED_KEY<K>(update_key)) continue;\n\n    const V* update_value = values + key_idx * dim;\n\n    size_t bkt_idx = 0;\n    size_t start_idx = 0;\n    int src_lane = -1;\n\n    Bucket<K, V, S>* bucket = get_key_position<K>(\n        buckets, update_key, bkt_idx, start_idx, buckets_num, bucket_max_size);\n\n    OccupyResult occupy_result{OccupyResult::INITIAL};\n    const int bucket_size = buckets_size[bkt_idx];\n\n    if (bucket_size >= bucket_max_size) {\n      start_idx = (start_idx / TILE_SIZE) * TILE_SIZE;\n    }\n    occupy_result = find_and_lock_for_update<K, V, S, TILE_SIZE>(\n        g, bucket, update_key, start_idx, key_pos, src_lane, bucket_max_size);\n\n    occupy_result = g.shfl(occupy_result, src_lane);\n\n    if (occupy_result == OccupyResult::REFUSED) continue;\n\n    if (occupy_result == OccupyResult::DUPLICATE) {\n      copy_vector<V, TILE_SIZE>(g, update_value,\n                                bucket->vectors + key_pos * dim, dim);\n      if (src_lane == g.thread_rank()) {\n        ScoreFunctor::update_without_missed(bucket, key_pos, scores, key_idx,\n                       
                     global_epoch);\n      }\n    }\n\n    if (g.thread_rank() == src_lane) {\n      (bucket->keys(key_pos))\n          ->store(update_key, cuda::std::memory_order_relaxed);\n    }\n  }\n}\n\ntemplate <typename K, typename V, typename S, int Strategy>\nstruct SelectUpdateKernelWithIO {\n  static void execute_kernel(const float& load_factor, const int& block_size,\n                             const size_t bucket_max_size,\n                             const size_t buckets_num, const size_t dim,\n                             cudaStream_t& stream, const size_t& n,\n                             const Table<K, V, S>* __restrict table,\n                             Bucket<K, V, S>* buckets, const K* __restrict keys,\n                             const V* __restrict values,\n                             const S* __restrict scores, const S global_epoch) {\n    if (load_factor <= 0.75) {\n      const unsigned int tile_size = 4;\n      const size_t N = n * tile_size;\n      const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);\n      update_kernel_with_io<K, V, S, Strategy, tile_size>\n          <<<grid_size, block_size, 0, stream>>>(\n              table, buckets, bucket_max_size, buckets_num, dim, keys, values,\n              scores, global_epoch, N);\n    } else {\n      const unsigned int tile_size = 32;\n      const size_t N = n * tile_size;\n      const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);\n      update_kernel_with_io<K, V, S, Strategy, tile_size>\n          <<<grid_size, block_size, 0, stream>>>(\n              table, buckets, bucket_max_size, buckets_num, dim, keys, values,\n              scores, global_epoch, N);\n    }\n    return;\n  }\n};\n\n// Use 1 thread to deal with a KV-pair, including copying value.\ntemplate <typename K, typename V, typename S, int Strategy = -1>\n__global__ void tlp_update_kernel_hybrid(\n    Bucket<K, V, S>* __restrict__ buckets, const uint64_t buckets_num,\n    uint32_t bucket_capacity, const 
uint32_t dim, const K* __restrict__ keys,\n    V** __restrict__ values, const S* __restrict__ scores,\n    K** __restrict__ key_ptrs, int* __restrict src_offset, const S global_epoch,\n    uint64_t n) {\n  using BUCKET = Bucket<K, V, S>;\n  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;\n\n  uint32_t tx = threadIdx.x;\n  uint32_t kv_idx = blockIdx.x * blockDim.x + tx;\n  K key{static_cast<K>(EMPTY_KEY)};\n  OccupyResult occupy_result{OccupyResult::INITIAL};\n  VecD_Comp target_digests{0};\n  V* bucket_values_ptr{nullptr};\n  K* bucket_keys_ptr{nullptr};\n  uint32_t key_pos = {0};\n  if (kv_idx < n) {\n    key = keys[kv_idx];\n    if (src_offset) src_offset[kv_idx] = kv_idx;\n    if (!IS_RESERVED_KEY<K>(key)) {\n      const K hashed_key = Murmur3HashDevice(key);\n      target_digests = digests_from_hashed<K>(hashed_key);\n      uint64_t global_idx =\n          static_cast<uint64_t>(hashed_key % (buckets_num * bucket_capacity));\n      key_pos = get_start_position(global_idx, bucket_capacity);\n      uint64_t bkt_idx = global_idx / bucket_capacity;\n      BUCKET* bucket = buckets + bkt_idx;\n      bucket_keys_ptr = reinterpret_cast<K*>(bucket->keys(0));\n      bucket_values_ptr = bucket->vectors;\n    } else {\n      key_ptrs[kv_idx] = nullptr;\n      return;\n    }\n  } else {\n    return;\n  }\n\n  // Load `STRIDE` digests every time.\n  constexpr uint32_t STRIDE = sizeof(VecD_Load) / sizeof(D);\n  // One more loop to handle empty keys.\n  for (int offset = 0; offset < bucket_capacity + STRIDE; offset += STRIDE) {\n    if (occupy_result != OccupyResult::INITIAL) break;\n\n    uint32_t pos_cur = align_to<STRIDE>(key_pos);\n    pos_cur = (pos_cur + offset) & (bucket_capacity - 1);\n\n    D* digests_ptr = BUCKET::digests(bucket_keys_ptr, bucket_capacity, pos_cur);\n    VecD_Load digests_vec = *(reinterpret_cast<VecD_Load*>(digests_ptr));\n    VecD_Comp digests_arr[4] = {digests_vec.x, digests_vec.y, digests_vec.z,\n                                
digests_vec.w};\n\n    for (int i = 0; i < 4; i++) {\n      VecD_Comp probe_digests = digests_arr[i];\n      uint32_t possible_pos = 0;\n      bool result = false;\n      // Perform a vectorized comparison by byte,\n      // and if they are equal, set the corresponding byte in the result to\n      // 0xff.\n      int cmp_result = __vcmpeq4(probe_digests, target_digests);\n      cmp_result &= 0x01010101;\n      do {\n        if (cmp_result == 0) break;\n        // CUDA uses little endian,\n        // and the lowest byte in register stores in the lowest address.\n        uint32_t index = (__ffs(cmp_result) - 1) >> 3;\n        cmp_result &= (cmp_result - 1);\n        possible_pos = pos_cur + i * 4 + index;\n        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);\n        K expected_key = key;\n        // Modifications to the bucket will not happen before this instruction.\n        result = current_key->compare_exchange_strong(\n            expected_key, static_cast<K>(LOCKED_KEY),\n            cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);\n      } while (!result);\n      if (result) {\n        key_pos = possible_pos;\n        ScoreFunctor::update_without_missed(bucket_keys_ptr, bucket_capacity,\n                                            key_pos, scores, kv_idx,\n                                            global_epoch);\n        V* bucket_value_ptr = bucket_values_ptr + key_pos * dim;\n        values[kv_idx] = bucket_value_ptr;\n        key_ptrs[kv_idx] = bucket_keys_ptr + key_pos;\n        return;\n      }\n      VecD_Comp empty_digests_ = empty_digests<K>();\n      cmp_result = __vcmpeq4(probe_digests, empty_digests_);\n      cmp_result &= 0x01010101;\n      do {\n        if (cmp_result == 0) break;\n        uint32_t index = (__ffs(cmp_result) - 1) >> 3;\n        cmp_result &= (cmp_result - 1);\n        possible_pos = pos_cur + i * 4 + index;\n        if (offset == 0 && possible_pos < key_pos) continue;\n        auto current_key = 
BUCKET::keys(bucket_keys_ptr, possible_pos);\n        auto probe_key = current_key->load(cuda::std::memory_order_relaxed);\n        if (probe_key == static_cast<K>(EMPTY_KEY)) {\n          return;\n        }\n      } while (true);\n    }\n  }\n}\n\ntemplate <class K, class V, class S, int Strategy, uint32_t TILE_SIZE = 4>\n__global__ void update_kernel(const Table<K, V, S>* __restrict table,\n                              Bucket<K, V, S>* buckets,\n                              const size_t bucket_max_size,\n                              const size_t buckets_num, const size_t dim,\n                              const K* __restrict keys, V** __restrict vectors,\n                              const S* __restrict scores,\n                              int* __restrict src_offset, const S global_epoch,\n                              size_t N) {\n  auto g = cg::tiled_partition<TILE_SIZE>(cg::this_thread_block());\n  int* buckets_size = table->buckets_size;\n\n  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;\n\n  for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N;\n       t += blockDim.x * gridDim.x) {\n    int key_pos = -1;\n    size_t key_idx = t / TILE_SIZE;\n\n    const K update_key = keys[key_idx];\n\n    if (IS_RESERVED_KEY<K>(update_key)) continue;\n\n    size_t bkt_idx = 0;\n    size_t start_idx = 0;\n    int src_lane = -1;\n\n    Bucket<K, V, S>* bucket = get_key_position<K>(\n        buckets, update_key, bkt_idx, start_idx, buckets_num, bucket_max_size);\n\n    OccupyResult occupy_result{OccupyResult::INITIAL};\n    const int bucket_size = buckets_size[bkt_idx];\n    *(src_offset + key_idx) = key_idx;\n\n    if (bucket_size >= bucket_max_size) {\n      start_idx = (start_idx / TILE_SIZE) * TILE_SIZE;\n    }\n    occupy_result = find_and_lock_for_update<K, V, S, TILE_SIZE>(\n        g, bucket, update_key, start_idx, key_pos, src_lane, bucket_max_size);\n\n    occupy_result = g.shfl(occupy_result, src_lane);\n\n    if (occupy_result == 
OccupyResult::REFUSED) continue;\n\n    if (g.thread_rank() == src_lane) {\n      if (occupy_result == OccupyResult::DUPLICATE) {\n        *(vectors + key_idx) = (bucket->vectors + key_pos * dim);\n        ScoreFunctor::update_without_missed(bucket, key_pos, scores, key_idx,\n                                            global_epoch);\n      } else {\n        *(vectors + key_idx) = nullptr;\n      }\n    }\n\n    if (g.thread_rank() == src_lane) {\n      (bucket->keys(key_pos))\n          ->store(update_key, cuda::std::memory_order_relaxed);\n    }\n  }\n}\n\n}  // namespace merlin\n}  // namespace nv"
  },
  {
    "path": "include/merlin/core_kernels/update_score.cuh",
    "content": "/*\n * Copyright (c) 2023, NVIDIA CORPORATION.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n#pragma once\n\n#include \"kernel_utils.cuh\"\n\nnamespace nv {\nnamespace merlin {\n\n// Use 1 thread to deal with a KV-pair, including copying value.\ntemplate <typename K = uint64_t, typename V = byte4, typename S = uint64_t,\n          uint32_t BLOCK_SIZE = 128, int Strategy = -1>\n__global__ void tlp_update_score_kernel(Bucket<K, V, S>* __restrict__ buckets,\n                                        const uint64_t buckets_num,\n                                        uint32_t bucket_capacity,\n                                        const K* __restrict__ keys,\n                                        const S* __restrict__ scores,\n                                        uint64_t n, const S global_epoch) {\n  using BUCKET = Bucket<K, V, S>;\n  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;\n\n  uint32_t tx = threadIdx.x;\n  uint32_t kv_idx = blockIdx.x * blockDim.x + tx;\n  K key{static_cast<K>(EMPTY_KEY)};\n  OccupyResult occupy_result{OccupyResult::INITIAL};\n  VecD_Comp target_digests{0};\n  K* bucket_keys_ptr{nullptr};\n  uint32_t key_pos = {0};\n  if (kv_idx < n) {\n    key = keys[kv_idx];\n\n    if (!IS_RESERVED_KEY<K>(key)) {\n      const K hashed_key = Murmur3HashDevice(key);\n      target_digests = digests_from_hashed<K>(hashed_key);\n      uint64_t global_idx =\n          static_cast<uint64_t>(hashed_key % 
(buckets_num * bucket_capacity));\n      key_pos = global_idx & (bucket_capacity - 1);\n      uint64_t bkt_idx = global_idx / bucket_capacity;\n      BUCKET* bucket = buckets + bkt_idx;\n      bucket_keys_ptr = reinterpret_cast<K*>(bucket->keys(0));\n    } else {\n      return;\n    }\n  } else {\n    return;\n  }\n\n  // Load `STRIDE` digests every time.\n  constexpr uint32_t STRIDE = sizeof(VecD_Load) / sizeof(D);\n  // One more loop to handle empty keys.\n  for (int offset = 0; offset < bucket_capacity + STRIDE; offset += STRIDE) {\n    if (occupy_result != OccupyResult::INITIAL) break;\n\n    uint32_t pos_cur = align_to<STRIDE>(key_pos);\n    pos_cur = (pos_cur + offset) & (bucket_capacity - 1);\n\n    D* digests_ptr = BUCKET::digests(bucket_keys_ptr, bucket_capacity, pos_cur);\n    VecD_Load digests_vec = *(reinterpret_cast<VecD_Load*>(digests_ptr));\n    VecD_Comp digests_arr[4] = {digests_vec.x, digests_vec.y, digests_vec.z,\n                                digests_vec.w};\n\n    for (int i = 0; i < 4; i++) {\n      VecD_Comp probe_digests = digests_arr[i];\n      uint32_t possible_pos = 0;\n      bool result = false;\n      // Perform a vectorized comparison by byte,\n      // and if they are equal, set the corresponding byte in the result to\n      // 0xff.\n      int cmp_result = __vcmpeq4(probe_digests, target_digests);\n      cmp_result &= 0x01010101;\n      do {\n        if (cmp_result == 0) break;\n        // CUDA uses little endian,\n        // and the lowest byte in register stores in the lowest address.\n        uint32_t index = (__ffs(cmp_result) - 1) >> 3;\n        cmp_result &= (cmp_result - 1);\n        possible_pos = pos_cur + i * 4 + index;\n        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);\n        K expected_key = key;\n        // Modifications to the bucket will not happen before this instruction.\n        result = current_key->compare_exchange_strong(\n            expected_key, static_cast<K>(LOCKED_KEY),\n            
cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);\n      } while (!result);\n      if (result) {\n        occupy_result = OccupyResult::DUPLICATE;\n        key_pos = possible_pos;\n        ScoreFunctor::update_without_missed(bucket_keys_ptr, bucket_capacity,\n                                            key_pos, scores, kv_idx,\n                                            global_epoch);\n        auto key_address = BUCKET::keys(bucket_keys_ptr, key_pos);\n        // memory_order_release:\n        // Modifications to the bucket will not after this instruction.\n        key_address->store(key, cuda::std::memory_order_release);\n        return;\n      }\n      VecD_Comp empty_digests_ = empty_digests<K>();\n      cmp_result = __vcmpeq4(probe_digests, empty_digests_);\n      cmp_result &= 0x01010101;\n      do {\n        if (cmp_result == 0) break;\n        uint32_t index = (__ffs(cmp_result) - 1) >> 3;\n        cmp_result &= (cmp_result - 1);\n        possible_pos = pos_cur + i * 4 + index;\n        if (offset == 0 && possible_pos < key_pos) continue;\n        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);\n        auto probe_key = current_key->load(cuda::std::memory_order_relaxed);\n        if (probe_key == static_cast<K>(EMPTY_KEY)) {\n          return;\n        }\n      } while (true);\n    }\n  }\n}\ntemplate <typename K = uint64_t, typename V = byte4, typename S = uint64_t,\n          uint32_t BLOCK_SIZE = 128, uint32_t GROUP_SIZE = 16,\n          int Strategy = -1>\n__global__ void pipeline_update_score_kernel(\n    Bucket<K, V, S>* __restrict__ buckets, const uint64_t buckets_num,\n    const K* __restrict__ keys, const S* __restrict__ scores, uint64_t n,\n    const S global_epoch) {\n  constexpr uint32_t BUCKET_SIZE = 128;\n  constexpr int GROUP_NUM = BLOCK_SIZE / GROUP_SIZE;\n  constexpr uint32_t Comp_LEN = sizeof(VecD_Comp) / sizeof(D);\n  // Here, GROUP_SIZE * Load_LEN = BUCKET_SIZE.\n  using VecD_Load = byte8;\n  constexpr 
uint32_t Load_LEN = sizeof(VecD_Load) / sizeof(D);\n  constexpr int RESERVE = 8;\n\n  using BUCKET = Bucket<K, V, S>;\n  using CopyScore = CopyScoreByPassCache<S, K, BUCKET_SIZE>;\n  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;\n\n  __shared__ VecD_Comp sm_target_digests[BLOCK_SIZE];\n  __shared__ K sm_target_keys[BLOCK_SIZE];\n  __shared__ K* sm_keys_ptr[BLOCK_SIZE];\n  // Reuse\n  int* sm_counts = reinterpret_cast<int*>(sm_target_digests);\n  int* sm_position = sm_counts;\n  // Double buffer\n  __shared__ D sm_digests[GROUP_NUM][2 * BUCKET_SIZE];\n  __shared__ K sm_possible_keys[GROUP_NUM][2 * RESERVE];\n  __shared__ int sm_possible_pos[GROUP_NUM][2 * RESERVE];\n  __shared__ S sm_scores[GROUP_NUM][2];\n  __shared__ int sm_ranks[GROUP_NUM][2];\n\n  bool CAS_res[2]{false};\n\n  // Initialization\n  auto g = cg::tiled_partition<GROUP_SIZE>(cg::this_thread_block());\n  int groupID = threadIdx.x / GROUP_SIZE;\n  int rank = g.thread_rank();\n  uint64_t key_idx_base = (blockIdx.x * blockDim.x) + groupID * GROUP_SIZE;\n  if (key_idx_base >= n) return;\n  int loop_num =\n      (n - key_idx_base) < GROUP_SIZE ? 
(n - key_idx_base) : GROUP_SIZE;\n  if (rank < loop_num) {\n    int idx_block = groupID * GROUP_SIZE + rank;\n    K key = keys[key_idx_base + rank];\n    sm_target_keys[idx_block] = key;\n    const K hashed_key = Murmur3HashDevice(key);\n    sm_target_digests[idx_block] = digests_from_hashed<K>(hashed_key);\n    uint64_t global_idx = hashed_key % (buckets_num * BUCKET_SIZE);\n    uint64_t bkt_idx = global_idx / BUCKET_SIZE;\n    Bucket<K, V, S>* bucket = buckets + bkt_idx;\n    __pipeline_memcpy_async(sm_keys_ptr + idx_block, bucket->keys_addr(),\n                            sizeof(K*));\n    __pipeline_commit();\n  }\n  __pipeline_wait_prior(0);\n\n  // Pipeline loading\n  K* keys_ptr = sm_keys_ptr[groupID * GROUP_SIZE];\n  D* digests_ptr = BUCKET::digests(keys_ptr, BUCKET_SIZE, rank * Load_LEN);\n  __pipeline_memcpy_async(sm_digests[groupID] + rank * Load_LEN, digests_ptr,\n                          sizeof(VecD_Load));\n  __pipeline_commit();\n  // Padding, meet the param of the first `__pipeline_wait_prior`\n  // in the first loop.\n  __pipeline_commit();\n  __pipeline_commit();\n\n  for (int i = 0; i < loop_num; i++) {\n    int key_idx_block = groupID * GROUP_SIZE + i;\n\n    /* Step1: prefetch all digests in one bucket */\n    if ((i + 1) < loop_num) {\n      K* keys_ptr = sm_keys_ptr[key_idx_block + 1];\n      D* digests_ptr = BUCKET::digests(keys_ptr, BUCKET_SIZE, rank * Load_LEN);\n      __pipeline_memcpy_async(\n          sm_digests[groupID] + diff_buf(i) * BUCKET_SIZE + rank * Load_LEN,\n          digests_ptr, sizeof(VecD_Load));\n    }\n    __pipeline_commit();\n\n    /* Step2: check digests and load possible keys */\n    VecD_Comp target_digests = sm_target_digests[key_idx_block];\n    sm_counts[key_idx_block] = 0;\n    __pipeline_wait_prior(3);\n    VecD_Comp probing_digests = *reinterpret_cast<VecD_Comp*>(\n        &sm_digests[groupID][same_buf(i) * BUCKET_SIZE + rank * Comp_LEN]);\n    uint32_t find_result_ = __vcmpeq4(probing_digests, 
target_digests);\n    uint32_t find_result = 0;\n    if ((find_result_ & 0x01) != 0) find_result |= 0x01;\n    if ((find_result_ & 0x0100) != 0) find_result |= 0x02;\n    if ((find_result_ & 0x010000) != 0) find_result |= 0x04;\n    if ((find_result_ & 0x01000000) != 0) find_result |= 0x08;\n    probing_digests = *reinterpret_cast<VecD_Comp*>(\n        &sm_digests[groupID][same_buf(i) * BUCKET_SIZE +\n                             (GROUP_SIZE + rank) * Comp_LEN]);\n    find_result_ = __vcmpeq4(probing_digests, target_digests);\n    if ((find_result_ & 0x01) != 0) find_result |= 0x10;\n    if ((find_result_ & 0x0100) != 0) find_result |= 0x20;\n    if ((find_result_ & 0x010000) != 0) find_result |= 0x40;\n    if ((find_result_ & 0x01000000) != 0) find_result |= 0x80;\n    int find_number = __popc(find_result);\n    int group_base = 0;\n    if (find_number > 0) {\n      group_base = atomicAdd(sm_counts + key_idx_block, find_number);\n    }\n    bool gt_reserve = (group_base + find_number) > RESERVE;\n    int gt_vote = g.ballot(gt_reserve);\n    K* key_ptr = sm_keys_ptr[key_idx_block];\n    if (gt_vote == 0) {\n      do {\n        int digest_idx = __ffs(find_result) - 1;\n        if (digest_idx >= 0) {\n          find_result &= (find_result - 1);\n          int key_pos = digest_idx < 4\n                            ? 
(rank * 4 + digest_idx)\n                            : ((GROUP_SIZE + rank - 1) * 4 + digest_idx);\n          sm_possible_pos[groupID][same_buf(i) * RESERVE + group_base] =\n              key_pos;\n          __pipeline_memcpy_async(\n              sm_possible_keys[groupID] + same_buf(i) * RESERVE + group_base,\n              key_ptr + key_pos, sizeof(K));\n          group_base += 1;\n        } else {\n          break;\n        }\n      } while (true);\n    } else {\n      K target_key = sm_target_keys[key_idx_block];\n      sm_counts[key_idx_block] = 0;\n      int found_vote = 0;\n      bool found = false;\n      do {\n        int digest_idx = __ffs(find_result) - 1;\n        if (digest_idx >= 0) {\n          find_result &= (find_result - 1);\n          int key_pos = digest_idx < 4\n                            ? (rank * 4 + digest_idx)\n                            : ((GROUP_SIZE + rank - 1) * 4 + digest_idx);\n          K possible_key = key_ptr[key_pos];\n          if (possible_key == target_key) {\n            found = true;\n            sm_counts[key_idx_block] = 1;\n            sm_possible_pos[groupID][same_buf(i) * RESERVE] = key_pos;\n            sm_possible_keys[groupID][same_buf(i) * RESERVE] = possible_key;\n          }\n        }\n        found_vote = g.ballot(found);\n        if (found_vote) {\n          break;\n        }\n        found_vote = digest_idx >= 0;\n      } while (g.any(found_vote));\n    }\n    __pipeline_commit();\n\n    /* Step3: check possible keys, and prefetch the value and score */\n    if (i > 0) {\n      key_idx_block -= 1;\n      K target_key = sm_target_keys[key_idx_block];\n      K* keys_ptr = sm_keys_ptr[key_idx_block];\n      int possible_num = sm_counts[key_idx_block];\n      sm_position[key_idx_block] = -1;\n      __pipeline_wait_prior(3);\n      int key_pos;\n      bool found_flag = false;\n      if (rank < possible_num) {\n        K possible_key =\n            sm_possible_keys[groupID][diff_buf(i) * RESERVE + rank];\n        
key_pos = sm_possible_pos[groupID][diff_buf(i) * RESERVE + rank];\n        if (possible_key == target_key) {\n          found_flag = true;\n          if (scores) {\n            int key_idx_grid = blockIdx.x * blockDim.x + key_idx_block;\n            const S* score_ptr = scores + key_idx_grid;\n            CopyScore::ldg_sts(sm_scores[groupID] + diff_buf(i), score_ptr);\n          }\n          auto key_ptr = BUCKET::keys(keys_ptr, key_pos);\n          sm_ranks[groupID][diff_buf(i)] = rank;\n          if (diff_buf(i) == 0) {\n            CAS_res[0] = key_ptr->compare_exchange_strong(\n                possible_key, static_cast<K>(LOCKED_KEY),\n                cuda::std::memory_order_acquire,\n                cuda::std::memory_order_relaxed);\n          } else {\n            CAS_res[1] = key_ptr->compare_exchange_strong(\n                possible_key, static_cast<K>(LOCKED_KEY),\n                cuda::std::memory_order_acquire,\n                cuda::std::memory_order_relaxed);\n          }\n        }\n      }\n      int found_vote = g.ballot(found_flag);\n      if (found_vote) {\n        int src_lane = __ffs(found_vote) - 1;\n        int target_pos = g.shfl(key_pos, src_lane);\n        sm_position[key_idx_block] = target_pos;\n      }\n    }\n    __pipeline_commit();\n\n    /* Step4: write back value and score */\n    if (i > 1) {\n      key_idx_block -= 1;\n      int target_pos = sm_position[key_idx_block];\n      K target_key = sm_target_keys[key_idx_block];\n      K* keys_ptr = sm_keys_ptr[key_idx_block];\n      int src_lane = sm_ranks[groupID][same_buf(i)];\n      __pipeline_wait_prior(3);\n      int succ = 0;\n      if (rank == src_lane) {\n        bool CAS_res_cur = same_buf(i) == 0 ? CAS_res[0] : CAS_res[1];\n        succ = CAS_res_cur ? 
1 : 0;\n      }\n      succ = g.shfl(succ, src_lane);\n      if (target_pos >= 0 && succ == 1) {\n        ScoreFunctor::update_without_missed(keys_ptr, BUCKET_SIZE, target_pos,\n                                            sm_scores[groupID] + same_buf(i), 0,\n                                            global_epoch);\n        if (rank == 0) {\n          auto key_address = BUCKET::keys(keys_ptr, target_pos);\n          key_address->store(target_key, cuda::std::memory_order_release);\n        }\n      }\n    }\n  }  // End loop\n\n  /* Pipeline emptying: step3, i = loop_num */\n  {\n    int key_idx_block = groupID * GROUP_SIZE + (loop_num - 1);\n    K target_key = sm_target_keys[key_idx_block];\n    K* keys_ptr = sm_keys_ptr[key_idx_block];\n    int possible_num = sm_counts[key_idx_block];\n    sm_position[key_idx_block] = -1;\n    __pipeline_wait_prior(1);\n    int key_pos;\n    bool found_flag = false;\n    if (rank < possible_num) {\n      K possible_key =\n          sm_possible_keys[groupID][diff_buf(loop_num) * RESERVE + rank];\n      key_pos = sm_possible_pos[groupID][diff_buf(loop_num) * RESERVE + rank];\n      if (possible_key == target_key) {\n        found_flag = true;\n        if (scores) {\n          int key_idx_grid = blockIdx.x * blockDim.x + key_idx_block;\n          const S* score_ptr = scores + key_idx_grid;\n          CopyScore::ldg_sts(sm_scores[groupID] + diff_buf(loop_num),\n                             score_ptr);\n        }\n        auto key_ptr = BUCKET::keys(keys_ptr, key_pos);\n        sm_ranks[groupID][diff_buf(loop_num)] = rank;\n        if (diff_buf(loop_num) == 0) {\n          CAS_res[0] = key_ptr->compare_exchange_strong(\n              possible_key, static_cast<K>(LOCKED_KEY),\n              cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);\n        } else {\n          CAS_res[1] = key_ptr->compare_exchange_strong(\n              possible_key, static_cast<K>(LOCKED_KEY),\n              cuda::std::memory_order_acquire, 
cuda::std::memory_order_relaxed);\n        }\n      }\n    }\n    int found_vote = g.ballot(found_flag);\n    if (found_vote) {\n      int src_lane = __ffs(found_vote) - 1;\n      int target_pos = g.shfl(key_pos, src_lane);\n      sm_position[key_idx_block] = target_pos;\n    }\n  }\n  __pipeline_commit();\n\n  /* Pipeline emptying: step4, i = loop_num */\n  if (loop_num > 1) {\n    int key_idx_block = groupID * GROUP_SIZE + loop_num - 2;\n    int target_pos = sm_position[key_idx_block];\n    K target_key = sm_target_keys[key_idx_block];\n    K* keys_ptr = sm_keys_ptr[key_idx_block];\n    int src_lane = sm_ranks[groupID][same_buf(loop_num)];\n    __pipeline_wait_prior(1);\n    int succ = 0;\n    if (rank == src_lane) {\n      bool CAS_res_cur = same_buf(loop_num) == 0 ? CAS_res[0] : CAS_res[1];\n      succ = CAS_res_cur ? 1 : 0;\n    }\n    succ = g.shfl(succ, src_lane);\n    if (target_pos >= 0 && succ == 1) {\n      ScoreFunctor::update_without_missed(\n          keys_ptr, BUCKET_SIZE, target_pos,\n          sm_scores[groupID] + same_buf(loop_num), 0, global_epoch);\n\n      auto key_ptr = BUCKET::keys(keys_ptr, target_pos);\n      if (rank == 0) {\n        auto key_address = BUCKET::keys(keys_ptr, target_pos);\n        key_address->store(target_key, cuda::std::memory_order_release);\n      }\n    }\n  }\n\n  /* Pipeline emptying: step4, i = loop_num + 1 */\n  {\n    int key_idx_block = groupID * GROUP_SIZE + loop_num - 1;\n    int target_pos = sm_position[key_idx_block];\n    K target_key = sm_target_keys[key_idx_block];\n    K* keys_ptr = sm_keys_ptr[key_idx_block];\n    int src_lane = sm_ranks[groupID][same_buf(loop_num + 1)];\n    __pipeline_wait_prior(0);\n    int succ = 0;\n    if (rank == src_lane) {\n      bool CAS_res_cur = same_buf(loop_num + 1) == 0 ? CAS_res[0] : CAS_res[1];\n      succ = CAS_res_cur ? 
1 : 0;\n    }\n    succ = g.shfl(succ, src_lane);\n    if (target_pos >= 0 && succ == 1) {\n      ScoreFunctor::update_without_missed(\n          keys_ptr, BUCKET_SIZE, target_pos,\n          sm_scores[groupID] + same_buf(loop_num + 1), 0, global_epoch);\n      if (rank == 0) {\n        auto key_address = BUCKET::keys(keys_ptr, target_pos);\n        key_address->store(target_key, cuda::std::memory_order_release);\n      }\n    }\n  }\n}  // End function\n\ntemplate <typename K = uint64_t, typename V = float, typename S = uint64_t>\nstruct Params_UpdateScore {\n  Params_UpdateScore(float load_factor_, Bucket<K, V, S>* __restrict__ buckets_,\n                     size_t buckets_num_, uint32_t bucket_capacity_,\n                     const K* __restrict__ keys_, const S* __restrict__ scores_,\n                     size_t n_, const S global_epoch_)\n      : load_factor(load_factor_),\n        buckets(buckets_),\n        buckets_num(buckets_num_),\n        bucket_capacity(bucket_capacity_),\n        keys(keys_),\n        scores(scores_),\n        n(n_),\n        global_epoch(global_epoch_) {}\n  float load_factor;\n  Bucket<K, V, S>* __restrict__ buckets;\n  size_t buckets_num;\n  uint32_t bucket_capacity;\n  const K* __restrict__ keys;\n  const S* __restrict__ scores;\n  uint64_t n;\n  const S global_epoch;\n};\n\ntemplate <typename K, typename V, typename S, int Strategy>\nstruct Launch_TLP_UpdateScore {\n  using Params = Params_UpdateScore<K, V, S>;\n  inline static void launch_kernel(Params& params, cudaStream_t& stream) {\n    constexpr int BLOCK_SIZE = 128;\n    tlp_update_score_kernel<K, V, S, BLOCK_SIZE, Strategy>\n        <<<(params.n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(\n            params.buckets, params.buckets_num, params.bucket_capacity,\n            params.keys, params.scores, params.n, params.global_epoch);\n  }\n};\n\ntemplate <typename K, typename V, typename S, int Strategy>\nstruct Launch_Pipeline_UpdateScore {\n  using Params = 
Params_UpdateScore<K, V, S>;\n  inline static void launch_kernel(Params& params, cudaStream_t& stream) {\n    constexpr int BLOCK_SIZE = 128;\n    constexpr uint32_t GROUP_SIZE = 16;\n\n    pipeline_update_score_kernel<K, V, S, BLOCK_SIZE, GROUP_SIZE, Strategy>\n        <<<(params.n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(\n            params.buckets, params.buckets_num, params.keys, params.scores,\n            params.n, params.global_epoch);\n  }\n};\n\ntemplate <typename K, typename V, typename S, int Strategy>\nstruct KernelSelector_UpdateScore {\n  using Params = Params_UpdateScore<K, V, S>;\n\n  static bool callable(bool unique_key, uint32_t bucket_size) {\n    constexpr uint32_t MinBucketCap = sizeof(VecD_Load) / sizeof(D);\n    return (unique_key && bucket_size >= MinBucketCap);\n  }\n\n  static void select_kernel(Params& params, cudaStream_t& stream) {\n    // This part is according to the test on A100.\n    if (params.bucket_capacity != 128) {\n      Launch_TLP_UpdateScore<K, V, S, Strategy>::launch_kernel(params, stream);\n    } else {\n      if (params.load_factor <= 0.60f) {\n        Launch_TLP_UpdateScore<K, V, S, Strategy>::launch_kernel(params,\n                                                                 stream);\n      } else {\n        Launch_Pipeline_UpdateScore<K, V, S, Strategy>::launch_kernel(params,\n                                                                      stream);\n      }\n    }\n  }  // End function\n};\n\n/*\n * update with IO operation. 
This kernel is\n * usually used for the pure HBM mode for better performance.\n */\ntemplate <class K, class V, class S, int Strategy, uint32_t TILE_SIZE = 4>\n__global__ void update_score_kernel(const Table<K, V, S>* __restrict table,\n                                    Bucket<K, V, S>* buckets,\n                                    const size_t bucket_max_size,\n                                    const size_t buckets_num,\n                                    const K* __restrict keys,\n                                    const S* __restrict scores,\n                                    const S global_epoch, const size_t N) {\n  auto g = cg::tiled_partition<TILE_SIZE>(cg::this_thread_block());\n  int* buckets_size = table->buckets_size;\n\n  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;\n\n  for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N;\n       t += blockDim.x * gridDim.x) {\n    int key_pos = -1;\n    size_t key_idx = t / TILE_SIZE;\n\n    const K update_key = keys[key_idx];\n\n    if (IS_RESERVED_KEY<K>(update_key)) continue;\n\n    size_t bkt_idx = 0;\n    size_t start_idx = 0;\n    int src_lane = -1;\n\n    Bucket<K, V, S>* bucket = get_key_position<K>(\n        buckets, update_key, bkt_idx, start_idx, buckets_num, bucket_max_size);\n\n    OccupyResult occupy_result{OccupyResult::INITIAL};\n    const int bucket_size = buckets_size[bkt_idx];\n\n    if (bucket_size >= bucket_max_size) {\n      start_idx = (start_idx / TILE_SIZE) * TILE_SIZE;\n    }\n    occupy_result = find_and_lock_for_update<K, V, S, TILE_SIZE>(\n        g, bucket, update_key, start_idx, key_pos, src_lane, bucket_max_size);\n\n    occupy_result = g.shfl(occupy_result, src_lane);\n\n    if (occupy_result == OccupyResult::REFUSED) continue;\n\n    if (occupy_result == OccupyResult::DUPLICATE) {\n      if (src_lane == g.thread_rank()) {\n        ScoreFunctor::update_without_missed(bucket, key_pos, scores, key_idx,\n                                            
global_epoch);\n      }\n    }\n\n    if (g.thread_rank() == src_lane) {\n      (bucket->keys(key_pos))\n          ->store(update_key, cuda::std::memory_order_relaxed);\n    }\n  }\n}\n\ntemplate <typename K, typename V, typename S, int Strategy>\nstruct SelectUpdateScoreKernel {\n  static void execute_kernel(const float& load_factor, const int& block_size,\n                             const size_t bucket_max_size,\n                             const size_t buckets_num, cudaStream_t& stream,\n                             const size_t& n,\n                             const Table<K, V, S>* __restrict table,\n                             Bucket<K, V, S>* buckets, const K* __restrict keys,\n                             const S* __restrict scores, const S global_epoch) {\n    if (load_factor <= 0.75) {\n      const unsigned int tile_size = 4;\n      const size_t N = n * tile_size;\n      const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);\n      update_score_kernel<K, V, S, Strategy, tile_size>\n          <<<grid_size, block_size, 0, stream>>>(table, buckets,\n                                                 bucket_max_size, buckets_num,\n                                                 keys, scores, global_epoch, N);\n    } else {\n      const unsigned int tile_size = 32;\n      const size_t N = n * tile_size;\n      const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);\n      update_score_kernel<K, V, S, Strategy, tile_size>\n          <<<grid_size, block_size, 0, stream>>>(table, buckets,\n                                                 bucket_max_size, buckets_num,\n                                                 keys, scores, global_epoch, N);\n    }\n    return;\n  }\n};\n\n}  // namespace merlin\n}  // namespace nv"
  },
  {
    "path": "include/merlin/core_kernels/update_values.cuh",
    "content": "/*\n * Copyright (c) 2023, NVIDIA CORPORATION.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http:///www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n#pragma once\n\n#include \"kernel_utils.cuh\"\n\nnamespace nv {\nnamespace merlin {\n\n// Use 1 thread to deal with a KV-pair, including copying value.\ntemplate <typename K = uint64_t, typename V = byte4, typename S = uint64_t,\n          typename VecV = byte16, uint32_t BLOCK_SIZE = 128>\n__global__ void tlp_update_values_kernel_with_io(\n    Bucket<K, V, S>* __restrict__ buckets, const uint64_t buckets_num,\n    uint32_t bucket_capacity, const uint32_t dim, const K* __restrict__ keys,\n    const VecV* __restrict__ values, uint64_t n) {\n  using BUCKET = Bucket<K, V, S>;\n  using CopyValue = CopyValueMultipleGroup<VecV, 1>;\n\n  uint32_t tx = threadIdx.x;\n  uint32_t kv_idx = blockIdx.x * blockDim.x + tx;\n  K key{static_cast<K>(EMPTY_KEY)};\n  OccupyResult occupy_result{OccupyResult::INITIAL};\n  VecD_Comp target_digests{0};\n  VecV* bucket_values_ptr{nullptr};\n  K* bucket_keys_ptr{nullptr};\n  uint32_t key_pos = {0};\n  if (kv_idx < n) {\n    key = keys[kv_idx];\n\n    if (!IS_RESERVED_KEY<K>(key)) {\n      const K hashed_key = Murmur3HashDevice(key);\n      target_digests = digests_from_hashed<K>(hashed_key);\n      uint64_t global_idx =\n          static_cast<uint64_t>(hashed_key % (buckets_num * bucket_capacity));\n      key_pos = get_start_position(global_idx, bucket_capacity);\n      uint64_t bkt_idx = global_idx / 
bucket_capacity;\n      BUCKET* bucket = buckets + bkt_idx;\n      bucket_keys_ptr = reinterpret_cast<K*>(bucket->keys(0));\n      bucket_values_ptr = reinterpret_cast<VecV*>(bucket->vectors);\n    } else {\n      return;\n    }\n  } else {\n    return;\n  }\n\n  // Load `STRIDE` digests every time.\n  constexpr uint32_t STRIDE = sizeof(VecD_Load) / sizeof(D);\n  // One more loop to handle empty keys.\n  for (int offset = 0; offset < bucket_capacity + STRIDE; offset += STRIDE) {\n    if (occupy_result != OccupyResult::INITIAL) break;\n\n    uint32_t pos_cur = align_to<STRIDE>(key_pos);\n    pos_cur = (pos_cur + offset) & (bucket_capacity - 1);\n\n    D* digests_ptr = BUCKET::digests(bucket_keys_ptr, bucket_capacity, pos_cur);\n    VecD_Load digests_vec = *(reinterpret_cast<VecD_Load*>(digests_ptr));\n    VecD_Comp digests_arr[4] = {digests_vec.x, digests_vec.y, digests_vec.z,\n                                digests_vec.w};\n\n    for (int i = 0; i < 4; i++) {\n      VecD_Comp probe_digests = digests_arr[i];\n      uint32_t possible_pos = 0;\n      bool result = false;\n      // Perform a vectorized comparison by byte,\n      // and if they are equal, set the corresponding byte in the result to\n      // 0xff.\n      int cmp_result = __vcmpeq4(probe_digests, target_digests);\n      cmp_result &= 0x01010101;\n      do {\n        if (cmp_result == 0) break;\n        // CUDA uses little endian,\n        // and the lowest byte of the register is stored at the lowest address.\n        uint32_t index = (__ffs(cmp_result) - 1) >> 3;\n        cmp_result &= (cmp_result - 1);\n        possible_pos = pos_cur + i * 4 + index;\n        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);\n        K expected_key = key;\n        // Modifications to the bucket will not be reordered before this\n        // instruction.\n        result = current_key->compare_exchange_strong(\n            expected_key, static_cast<K>(LOCKED_KEY),\n            cuda::std::memory_order_acquire, 
cuda::std::memory_order_relaxed);\n      } while (!result);\n      if (result) {\n        occupy_result = OccupyResult::DUPLICATE;\n        key_pos = possible_pos;\n        VecV* bucket_value_ptr = bucket_values_ptr + key_pos * dim;\n        const VecV* param_value_ptr = values + kv_idx * dim;\n        CopyValue::ldg_stg(0, bucket_value_ptr, param_value_ptr, dim);\n        auto key_address = BUCKET::keys(bucket_keys_ptr, key_pos);\n        // memory_order_release:\n        // Modifications to the bucket will not be reordered after this\n        // instruction.\n        key_address->store(key, cuda::std::memory_order_release);\n        return;\n      }\n      VecD_Comp empty_digests_ = empty_digests<K>();\n      cmp_result = __vcmpeq4(probe_digests, empty_digests_);\n      cmp_result &= 0x01010101;\n      do {\n        if (cmp_result == 0) break;\n        uint32_t index = (__ffs(cmp_result) - 1) >> 3;\n        cmp_result &= (cmp_result - 1);\n        possible_pos = pos_cur + i * 4 + index;\n        if (offset == 0 && possible_pos < key_pos) continue;\n        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);\n        auto probe_key = current_key->load(cuda::std::memory_order_relaxed);\n        if (probe_key == static_cast<K>(EMPTY_KEY)) {\n          return;\n        }\n      } while (true);\n    }\n  }\n}\ntemplate <typename K = uint64_t, typename V = byte4, typename S = uint64_t,\n          typename VecV = byte16, uint32_t BLOCK_SIZE = 128,\n          uint32_t GROUP_SIZE = 16>\n__global__ void pipeline_update_values_kernel_with_io(\n    Bucket<K, V, S>* __restrict__ buckets, const uint64_t buckets_num,\n    const uint32_t dim, const K* __restrict__ keys,\n    const VecV* __restrict__ values, uint64_t n) {\n  constexpr uint32_t BUCKET_SIZE = 128;\n  constexpr int GROUP_NUM = BLOCK_SIZE / GROUP_SIZE;\n  constexpr uint32_t Comp_LEN = sizeof(VecD_Comp) / sizeof(D);\n  // Here, GROUP_SIZE * Load_LEN = BUCKET_SIZE.\n  using VecD_Load = byte8;\n  constexpr uint32_t Load_LEN = 
sizeof(VecD_Load) / sizeof(D);\n  constexpr int RESERVE = 8;\n\n  using BUCKET = Bucket<K, V, S>;\n  using CopyValue = CopyValueMultipleGroup<VecV, GROUP_SIZE>;\n\n  __shared__ VecD_Comp sm_target_digests[BLOCK_SIZE];\n  __shared__ K sm_target_keys[BLOCK_SIZE];\n  __shared__ K* sm_keys_ptr[BLOCK_SIZE];\n  __shared__ VecV* sm_values_ptr[BLOCK_SIZE];\n  // Reuse\n  int* sm_counts = reinterpret_cast<int*>(sm_target_digests);\n  int* sm_position = sm_counts;\n  // Double buffer\n  __shared__ D sm_digests[GROUP_NUM][2 * BUCKET_SIZE];\n  __shared__ K sm_possible_keys[GROUP_NUM][2 * RESERVE];\n  __shared__ int sm_possible_pos[GROUP_NUM][2 * RESERVE];\n  __shared__ int sm_ranks[GROUP_NUM][2];\n  // __shared__ VecV sm_values_buffer[GROUP_NUM][2 * dim];\n\n  extern __shared__ __align__(alignof(byte16)) byte sm_values_buffer[];\n\n  bool CAS_res[2]{false};\n\n  // Initialization\n  auto g = cg::tiled_partition<GROUP_SIZE>(cg::this_thread_block());\n  int groupID = threadIdx.x / GROUP_SIZE;\n  int rank = g.thread_rank();\n  uint64_t key_idx_base = (blockIdx.x * blockDim.x) + groupID * GROUP_SIZE;\n  if (key_idx_base >= n) return;\n  int loop_num =\n      (n - key_idx_base) < GROUP_SIZE ? 
(n - key_idx_base) : GROUP_SIZE;\n  if (rank < loop_num) {\n    int idx_block = groupID * GROUP_SIZE + rank;\n    K key = keys[key_idx_base + rank];\n    sm_target_keys[idx_block] = key;\n    const K hashed_key = Murmur3HashDevice(key);\n    sm_target_digests[idx_block] = digests_from_hashed<K>(hashed_key);\n    uint64_t global_idx = hashed_key % (buckets_num * BUCKET_SIZE);\n    uint64_t bkt_idx = global_idx / BUCKET_SIZE;\n    Bucket<K, V, S>* bucket = buckets + bkt_idx;\n    __pipeline_memcpy_async(sm_keys_ptr + idx_block, bucket->keys_addr(),\n                            sizeof(K*));\n    __pipeline_commit();\n    __pipeline_memcpy_async(sm_values_ptr + idx_block, &(bucket->vectors),\n                            sizeof(VecV*));\n  }\n  __pipeline_wait_prior(0);\n\n  // Pipeline loading\n  K* keys_ptr = sm_keys_ptr[groupID * GROUP_SIZE];\n  D* digests_ptr = BUCKET::digests(keys_ptr, BUCKET_SIZE, rank * Load_LEN);\n  __pipeline_memcpy_async(sm_digests[groupID] + rank * Load_LEN, digests_ptr,\n                          sizeof(VecD_Load));\n  __pipeline_commit();\n  // Padding, meet the param of the first `__pipeline_wait_prior`\n  // in the first loop.\n  __pipeline_commit();\n  __pipeline_commit();\n\n  for (int i = 0; i < loop_num; i++) {\n    int key_idx_block = groupID * GROUP_SIZE + i;\n\n    /* Step1: prefetch all digests in one bucket */\n    if ((i + 1) < loop_num) {\n      K* keys_ptr = sm_keys_ptr[key_idx_block + 1];\n      D* digests_ptr = BUCKET::digests(keys_ptr, BUCKET_SIZE, rank * Load_LEN);\n      __pipeline_memcpy_async(\n          sm_digests[groupID] + diff_buf(i) * BUCKET_SIZE + rank * Load_LEN,\n          digests_ptr, sizeof(VecD_Load));\n    }\n    __pipeline_commit();\n\n    /* Step2: check digests and load possible keys */\n    VecD_Comp target_digests = sm_target_digests[key_idx_block];\n    sm_counts[key_idx_block] = 0;\n    __pipeline_wait_prior(3);\n    VecD_Comp probing_digests = *reinterpret_cast<VecD_Comp*>(\n        
&sm_digests[groupID][same_buf(i) * BUCKET_SIZE + rank * Comp_LEN]);\n    uint32_t find_result_ = __vcmpeq4(probing_digests, target_digests);\n    uint32_t find_result = 0;\n    if ((find_result_ & 0x01) != 0) find_result |= 0x01;\n    if ((find_result_ & 0x0100) != 0) find_result |= 0x02;\n    if ((find_result_ & 0x010000) != 0) find_result |= 0x04;\n    if ((find_result_ & 0x01000000) != 0) find_result |= 0x08;\n    probing_digests = *reinterpret_cast<VecD_Comp*>(\n        &sm_digests[groupID][same_buf(i) * BUCKET_SIZE +\n                             (GROUP_SIZE + rank) * Comp_LEN]);\n    find_result_ = __vcmpeq4(probing_digests, target_digests);\n    if ((find_result_ & 0x01) != 0) find_result |= 0x10;\n    if ((find_result_ & 0x0100) != 0) find_result |= 0x20;\n    if ((find_result_ & 0x010000) != 0) find_result |= 0x40;\n    if ((find_result_ & 0x01000000) != 0) find_result |= 0x80;\n    int find_number = __popc(find_result);\n    int group_base = 0;\n    if (find_number > 0) {\n      group_base = atomicAdd(sm_counts + key_idx_block, find_number);\n    }\n    bool gt_reserve = (group_base + find_number) > RESERVE;\n    int gt_vote = g.ballot(gt_reserve);\n    K* key_ptr = sm_keys_ptr[key_idx_block];\n    if (gt_vote == 0) {\n      do {\n        int digest_idx = __ffs(find_result) - 1;\n        if (digest_idx >= 0) {\n          find_result &= (find_result - 1);\n          int key_pos = digest_idx < 4\n                            ? 
(rank * 4 + digest_idx)\n                            : ((GROUP_SIZE + rank - 1) * 4 + digest_idx);\n          sm_possible_pos[groupID][same_buf(i) * RESERVE + group_base] =\n              key_pos;\n          __pipeline_memcpy_async(\n              sm_possible_keys[groupID] + same_buf(i) * RESERVE + group_base,\n              key_ptr + key_pos, sizeof(K));\n          group_base += 1;\n        } else {\n          break;\n        }\n      } while (true);\n    } else {\n      K target_key = sm_target_keys[key_idx_block];\n      sm_counts[key_idx_block] = 0;\n      int found_vote = 0;\n      bool found = false;\n      do {\n        int digest_idx = __ffs(find_result) - 1;\n        if (digest_idx >= 0) {\n          find_result &= (find_result - 1);\n          int key_pos = digest_idx < 4\n                            ? (rank * 4 + digest_idx)\n                            : ((GROUP_SIZE + rank - 1) * 4 + digest_idx);\n          K possible_key = key_ptr[key_pos];\n          if (possible_key == target_key) {\n            found = true;\n            sm_counts[key_idx_block] = 1;\n            sm_possible_pos[groupID][same_buf(i) * RESERVE] = key_pos;\n            sm_possible_keys[groupID][same_buf(i) * RESERVE] = possible_key;\n          }\n        }\n        found_vote = g.ballot(found);\n        if (found_vote) {\n          break;\n        }\n        found_vote = digest_idx >= 0;\n      } while (g.any(found_vote));\n    }\n    __pipeline_commit();\n\n    /* Step3: check possible keys, and prefetch the value */\n    if (i > 0) {\n      key_idx_block -= 1;\n      K target_key = sm_target_keys[key_idx_block];\n      K* keys_ptr = sm_keys_ptr[key_idx_block];\n      int possible_num = sm_counts[key_idx_block];\n      sm_position[key_idx_block] = -1;\n      __pipeline_wait_prior(3);\n      int key_pos;\n      bool found_flag = false;\n      if (rank < possible_num) {\n        K possible_key =\n            sm_possible_keys[groupID][diff_buf(i) * RESERVE + rank];\n        key_pos = 
sm_possible_pos[groupID][diff_buf(i) * RESERVE + rank];\n        if (possible_key == target_key) {\n          found_flag = true;\n          auto key_ptr = BUCKET::keys(keys_ptr, key_pos);\n          sm_ranks[groupID][diff_buf(i)] = rank;\n          if (diff_buf(i) == 0) {\n            CAS_res[0] = key_ptr->compare_exchange_strong(\n                possible_key, static_cast<K>(LOCKED_KEY),\n                cuda::std::memory_order_acquire,\n                cuda::std::memory_order_relaxed);\n          } else {\n            CAS_res[1] = key_ptr->compare_exchange_strong(\n                possible_key, static_cast<K>(LOCKED_KEY),\n                cuda::std::memory_order_acquire,\n                cuda::std::memory_order_relaxed);\n          }\n        }\n      }\n      int found_vote = g.ballot(found_flag);\n      if (found_vote) {\n        int src_lane = __ffs(found_vote) - 1;\n        int target_pos = g.shfl(key_pos, src_lane);\n        sm_position[key_idx_block] = target_pos;\n        int key_idx_grid = blockIdx.x * blockDim.x + key_idx_block;\n        const VecV* v_src = values + key_idx_grid * dim;\n        auto tmp = reinterpret_cast<VecV*>(sm_values_buffer);\n        VecV* v_dst = tmp + (groupID * 2 + diff_buf(i)) * dim;\n        CopyValue::ldg_sts(rank, v_dst, v_src, dim);\n      }\n    }\n    __pipeline_commit();\n\n    /* Step4: write back value */\n    if (i > 1) {\n      key_idx_block -= 1;\n      VecV* value_ptr = sm_values_ptr[key_idx_block];\n      int target_pos = sm_position[key_idx_block];\n      K target_key = sm_target_keys[key_idx_block];\n      K* keys_ptr = sm_keys_ptr[key_idx_block];\n      int src_lane = sm_ranks[groupID][same_buf(i)];\n      __pipeline_wait_prior(3);\n      int succ = 0;\n      if (rank == src_lane) {\n        bool CAS_res_cur = same_buf(i) == 0 ? CAS_res[0] : CAS_res[1];\n        succ = CAS_res_cur ? 
1 : 0;\n      }\n      succ = g.shfl(succ, src_lane);\n      if (target_pos >= 0 && succ == 1) {\n        auto tmp = reinterpret_cast<VecV*>(sm_values_buffer);\n        VecV* v_src = tmp + (groupID * 2 + same_buf(i)) * dim;\n        VecV* v_dst = value_ptr + target_pos * dim;\n        CopyValue::lds_stg(rank, v_dst, v_src, dim);\n        if (rank == 0) {\n          auto key_address = BUCKET::keys(keys_ptr, target_pos);\n          key_address->store(target_key, cuda::std::memory_order_release);\n        }\n      }\n    }\n  }  // End loop\n\n  /* Pipeline emptying: step3, i = loop_num */\n  {\n    int key_idx_block = groupID * GROUP_SIZE + (loop_num - 1);\n    K target_key = sm_target_keys[key_idx_block];\n    K* keys_ptr = sm_keys_ptr[key_idx_block];\n    int possible_num = sm_counts[key_idx_block];\n    sm_position[key_idx_block] = -1;\n    __pipeline_wait_prior(1);\n    int key_pos;\n    bool found_flag = false;\n    if (rank < possible_num) {\n      K possible_key =\n          sm_possible_keys[groupID][diff_buf(loop_num) * RESERVE + rank];\n      key_pos = sm_possible_pos[groupID][diff_buf(loop_num) * RESERVE + rank];\n      if (possible_key == target_key) {\n        found_flag = true;\n        auto key_ptr = BUCKET::keys(keys_ptr, key_pos);\n        sm_ranks[groupID][diff_buf(loop_num)] = rank;\n        if (diff_buf(loop_num) == 0) {\n          CAS_res[0] = key_ptr->compare_exchange_strong(\n              possible_key, static_cast<K>(LOCKED_KEY),\n              cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);\n        } else {\n          CAS_res[1] = key_ptr->compare_exchange_strong(\n              possible_key, static_cast<K>(LOCKED_KEY),\n              cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);\n        }\n      }\n    }\n    int found_vote = g.ballot(found_flag);\n    if (found_vote) {\n      int src_lane = __ffs(found_vote) - 1;\n      int target_pos = g.shfl(key_pos, src_lane);\n      sm_position[key_idx_block] = 
target_pos;\n      int key_idx_grid = blockIdx.x * blockDim.x + key_idx_block;\n      const VecV* v_src = values + key_idx_grid * dim;\n      auto tmp = reinterpret_cast<VecV*>(sm_values_buffer);\n      VecV* v_dst = tmp + (groupID * 2 + diff_buf(loop_num)) * dim;\n      CopyValue::ldg_sts(rank, v_dst, v_src, dim);\n    }\n  }\n  __pipeline_commit();\n\n  /* Pipeline emptying: step4, i = loop_num */\n  if (loop_num > 1) {\n    int key_idx_block = groupID * GROUP_SIZE + loop_num - 2;\n    VecV* value_ptr = sm_values_ptr[key_idx_block];\n    int target_pos = sm_position[key_idx_block];\n    K target_key = sm_target_keys[key_idx_block];\n    K* keys_ptr = sm_keys_ptr[key_idx_block];\n    int src_lane = sm_ranks[groupID][same_buf(loop_num)];\n    __pipeline_wait_prior(1);\n    int succ = 0;\n    if (rank == src_lane) {\n      bool CAS_res_cur = same_buf(loop_num) == 0 ? CAS_res[0] : CAS_res[1];\n      succ = CAS_res_cur ? 1 : 0;\n    }\n    succ = g.shfl(succ, src_lane);\n    if (target_pos >= 0 && succ == 1) {\n      auto tmp = reinterpret_cast<VecV*>(sm_values_buffer);\n      VecV* v_src = tmp + (groupID * 2 + same_buf(loop_num)) * dim;\n      VecV* v_dst = value_ptr + target_pos * dim;\n      CopyValue::lds_stg(rank, v_dst, v_src, dim);\n\n      auto key_ptr = BUCKET::keys(keys_ptr, target_pos);\n      if (rank == 0) {\n        auto key_address = BUCKET::keys(keys_ptr, target_pos);\n        key_address->store(target_key, cuda::std::memory_order_release);\n      }\n    }\n  }\n\n  /* Pipeline emptying: step4, i = loop_num + 1 */\n  {\n    int key_idx_block = groupID * GROUP_SIZE + loop_num - 1;\n    VecV* value_ptr = sm_values_ptr[key_idx_block];\n    int target_pos = sm_position[key_idx_block];\n    K target_key = sm_target_keys[key_idx_block];\n    K* keys_ptr = sm_keys_ptr[key_idx_block];\n    int src_lane = sm_ranks[groupID][same_buf(loop_num + 1)];\n    __pipeline_wait_prior(0);\n    int succ = 0;\n    if (rank == src_lane) {\n      bool CAS_res_cur = 
same_buf(loop_num + 1) == 0 ? CAS_res[0] : CAS_res[1];\n      succ = CAS_res_cur ? 1 : 0;\n    }\n    succ = g.shfl(succ, src_lane);\n    if (target_pos >= 0 && succ == 1) {\n      auto tmp = reinterpret_cast<VecV*>(sm_values_buffer);\n      VecV* v_src = tmp + (groupID * 2 + same_buf(loop_num + 1)) * dim;\n      VecV* v_dst = value_ptr + target_pos * dim;\n      CopyValue::lds_stg(rank, v_dst, v_src, dim);\n      if (rank == 0) {\n        auto key_address = BUCKET::keys(keys_ptr, target_pos);\n        key_address->store(target_key, cuda::std::memory_order_release);\n      }\n    }\n  }\n}  // End function\n\ntemplate <typename K = uint64_t, typename V = float, typename S = uint64_t>\nstruct Params_UpdateValues {\n  Params_UpdateValues(float load_factor_,\n                      Bucket<K, V, S>* __restrict__ buckets_,\n                      size_t buckets_num_, uint32_t bucket_capacity_,\n                      uint32_t dim_, const K* __restrict__ keys_,\n                      const V* __restrict__ values_, size_t n_)\n      : load_factor(load_factor_),\n        buckets(buckets_),\n        buckets_num(buckets_num_),\n        bucket_capacity(bucket_capacity_),\n        dim(dim_),\n        keys(keys_),\n        values(values_),\n        n(n_) {}\n  float load_factor;\n  Bucket<K, V, S>* __restrict__ buckets;\n  size_t buckets_num;\n  uint32_t bucket_capacity;\n  uint32_t dim;\n  const K* __restrict__ keys;\n  const V* __restrict__ values;\n  uint64_t n;\n};\n\ntemplate <typename K, typename V, typename S, typename VecV>\nstruct Launch_TLP_UpdateValues {\n  using Params = Params_UpdateValues<K, V, S>;\n  inline static void launch_kernel(Params& params, cudaStream_t& stream) {\n    constexpr int BLOCK_SIZE = 128;\n    params.dim = params.dim * sizeof(V) / sizeof(VecV);\n    tlp_update_values_kernel_with_io<K, V, S, VecV, BLOCK_SIZE>\n        <<<(params.n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(\n            params.buckets, params.buckets_num, 
params.bucket_capacity,\n            params.dim, params.keys,\n            reinterpret_cast<const VecV*>(params.values), params.n);\n  }\n};\n\ntemplate <typename K, typename V, typename S, typename VecV>\nstruct Launch_Pipeline_UpdateValues {\n  using Params = Params_UpdateValues<K, V, S>;\n  inline static void launch_kernel(Params& params, cudaStream_t& stream) {\n    constexpr int BLOCK_SIZE = 128;\n    constexpr uint32_t GROUP_SIZE = 16;\n    constexpr uint32_t GROUP_NUM = BLOCK_SIZE / GROUP_SIZE;\n\n    params.dim = params.dim * sizeof(V) / sizeof(VecV);\n    uint32_t shared_mem = GROUP_NUM * 2 * params.dim * sizeof(VecV);\n    shared_mem =\n        (shared_mem + sizeof(byte16) - 1) / sizeof(byte16) * sizeof(byte16);\n    pipeline_update_values_kernel_with_io<K, V, S, VecV, BLOCK_SIZE, GROUP_SIZE>\n        <<<(params.n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, shared_mem,\n           stream>>>(params.buckets, params.buckets_num, params.dim,\n                     params.keys, reinterpret_cast<const VecV*>(params.values),\n                     params.n);\n  }\n};\n\ntemplate <typename ArchTag>\nstruct ValueConfig_UpdateValues;\n\n/// TODO: support more arch.\ntemplate <>\nstruct ValueConfig_UpdateValues<Sm80> {\n  // Value size greater than it will bring poor performance for TLP.\n  static constexpr uint32_t size_tlp = 8 * sizeof(byte4);\n  // Value size greater than it will reduce the occupancy for Pipeline.\n  // When the value is very high, the kernel will fail to launch.\n  static constexpr uint32_t size_pipeline = 128 * sizeof(byte4);\n};\n\ntemplate <>\nstruct ValueConfig_UpdateValues<Sm70> {\n  // Value size greater than it will bring poor performance for TLP.\n  static constexpr uint32_t size_tlp = 8 * sizeof(byte4);\n  // Value size greater than it will reduce the occupancy for Pipeline.\n  // When the value is very high, the kernel will fail to launch.\n  static constexpr uint32_t size_pipeline = 64 * sizeof(byte4);\n};\n\ntemplate <typename K, 
typename V, typename S, typename ArchTag>\nstruct KernelSelector_UpdateValues {\n  using ValueConfig = ValueConfig_UpdateValues<ArchTag>;\n  using Params = Params_UpdateValues<K, V, S>;\n\n  static bool callable(bool unique_key, uint32_t bucket_size, uint32_t dim) {\n    constexpr uint32_t MinBucketCap = sizeof(VecD_Load) / sizeof(D);\n    if (!unique_key || bucket_size < MinBucketCap) return false;\n    uint32_t value_size = dim * sizeof(V);\n    if (value_size <= ValueConfig::size_tlp) return true;\n    if (bucket_size == 128 && value_size <= ValueConfig::size_pipeline) {\n      return true;\n    }\n    return false;\n  }\n\n  static void select_kernel(Params& params, cudaStream_t& stream) {\n    const uint32_t total_value_size =\n        static_cast<uint32_t>(params.dim * sizeof(V));\n\n    auto launch_TLP = [&]() {\n      if (total_value_size % sizeof(byte16) == 0) {\n        using VecV = byte16;\n        Launch_TLP_UpdateValues<K, V, S, VecV>::launch_kernel(params, stream);\n      } else if (total_value_size % sizeof(byte8) == 0) {\n        using VecV = byte8;\n        Launch_TLP_UpdateValues<K, V, S, VecV>::launch_kernel(params, stream);\n      } else if (total_value_size % sizeof(byte4) == 0) {\n        using VecV = byte4;\n        Launch_TLP_UpdateValues<K, V, S, VecV>::launch_kernel(params, stream);\n      } else if (total_value_size % sizeof(byte2) == 0) {\n        using VecV = byte2;\n        Launch_TLP_UpdateValues<K, V, S, VecV>::launch_kernel(params, stream);\n      } else {\n        using VecV = byte;\n        Launch_TLP_UpdateValues<K, V, S, VecV>::launch_kernel(params, stream);\n      }\n    };\n\n    auto launch_Pipeline = [&]() {\n      if (total_value_size % sizeof(byte16) == 0) {\n        using VecV = byte16;\n        Launch_Pipeline_UpdateValues<K, V, S, VecV>::launch_kernel(params,\n                                                                   stream);\n      } else if (total_value_size % sizeof(byte8) == 0) {\n        using VecV = 
byte8;\n        Launch_Pipeline_UpdateValues<K, V, S, VecV>::launch_kernel(params,\n                                                                   stream);\n      } else if (total_value_size % sizeof(byte4) == 0) {\n        using VecV = byte4;\n        Launch_Pipeline_UpdateValues<K, V, S, VecV>::launch_kernel(params,\n                                                                   stream);\n      } else if (total_value_size % sizeof(byte2) == 0) {\n        using VecV = byte2;\n        Launch_Pipeline_UpdateValues<K, V, S, VecV>::launch_kernel(params,\n                                                                   stream);\n      } else {\n        using VecV = byte;\n        Launch_Pipeline_UpdateValues<K, V, S, VecV>::launch_kernel(params,\n                                                                   stream);\n      }\n    };\n    // This part is according to the test on A100.\n    if (params.bucket_capacity != 128) {\n      launch_TLP();\n    } else {\n      if (total_value_size <= ValueConfig::size_tlp) {\n        if (params.load_factor <= 0.60f) {\n          launch_TLP();\n        } else {\n          launch_Pipeline();\n        }\n      } else {\n        launch_Pipeline();\n      }\n    }\n  }  // End function\n};\n\n/*\n * update with IO operation. 
This kernel is\n * usually used for the pure HBM mode for better performance.\n */\ntemplate <class K, class V, class S, uint32_t TILE_SIZE = 4>\n__global__ void update_values_kernel_with_io(\n    const Table<K, V, S>* __restrict table, Bucket<K, V, S>* buckets,\n    const size_t bucket_max_size, const size_t buckets_num, const size_t dim,\n    const K* __restrict keys, const V* __restrict values, const size_t N) {\n  auto g = cg::tiled_partition<TILE_SIZE>(cg::this_thread_block());\n  int* buckets_size = table->buckets_size;\n\n  for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N;\n       t += blockDim.x * gridDim.x) {\n    int key_pos = -1;\n    size_t key_idx = t / TILE_SIZE;\n\n    const K update_key = keys[key_idx];\n\n    if (IS_RESERVED_KEY<K>(update_key)) continue;\n\n    const V* update_value = values + key_idx * dim;\n\n    size_t bkt_idx = 0;\n    size_t start_idx = 0;\n    int src_lane = -1;\n\n    Bucket<K, V, S>* bucket = get_key_position<K>(\n        buckets, update_key, bkt_idx, start_idx, buckets_num, bucket_max_size);\n\n    OccupyResult occupy_result{OccupyResult::INITIAL};\n    const int bucket_size = buckets_size[bkt_idx];\n\n    if (bucket_size >= bucket_max_size) {\n      start_idx = (start_idx / TILE_SIZE) * TILE_SIZE;\n    }\n    occupy_result = find_and_lock_for_update<K, V, S, TILE_SIZE>(\n        g, bucket, update_key, start_idx, key_pos, src_lane, bucket_max_size);\n\n    occupy_result = g.shfl(occupy_result, src_lane);\n\n    if (occupy_result == OccupyResult::REFUSED) continue;\n\n    if (occupy_result == OccupyResult::DUPLICATE) {\n      copy_vector<V, TILE_SIZE>(g, update_value,\n                                bucket->vectors + key_pos * dim, dim);\n    }\n\n    if (g.thread_rank() == src_lane) {\n      (bucket->keys(key_pos))\n          ->store(update_key, cuda::std::memory_order_relaxed);\n    }\n  }\n}\n\ntemplate <typename K, typename V, typename S>\nstruct SelectUpdateValuesKernelWithIO {\n  static void 
execute_kernel(const float& load_factor, const int& block_size,\n                             const size_t bucket_max_size,\n                             const size_t buckets_num, const size_t dim,\n                             cudaStream_t& stream, const size_t& n,\n                             const Table<K, V, S>* __restrict table,\n                             Bucket<K, V, S>* buckets, const K* __restrict keys,\n                             const V* __restrict values) {\n    if (load_factor <= 0.75) {\n      const unsigned int tile_size = 4;\n      const size_t N = n * tile_size;\n      const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);\n      update_values_kernel_with_io<K, V, S, tile_size>\n          <<<grid_size, block_size, 0, stream>>>(table, buckets,\n                                                 bucket_max_size, buckets_num,\n                                                 dim, keys, values, N);\n    } else {\n      const unsigned int tile_size = 32;\n      const size_t N = n * tile_size;\n      const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);\n      update_values_kernel_with_io<K, V, S, tile_size>\n          <<<grid_size, block_size, 0, stream>>>(table, buckets,\n                                                 bucket_max_size, buckets_num,\n                                                 dim, keys, values, N);\n    }\n    return;\n  }\n};\n\n// Use 1 thread to deal with a KV-pair, including copying value.\ntemplate <typename K, typename V, typename S>\n__global__ void tlp_update_values_kernel_hybrid(\n    Bucket<K, V, S>* __restrict__ buckets, const uint64_t buckets_num,\n    uint32_t bucket_capacity, const uint32_t dim, const K* __restrict__ keys,\n    V** __restrict__ values, K** __restrict__ key_ptrs,\n    int* __restrict src_offset, uint64_t n) {\n  using BUCKET = Bucket<K, V, S>;\n\n  uint32_t tx = threadIdx.x;\n  uint32_t kv_idx = blockIdx.x * blockDim.x + tx;\n  K key{static_cast<K>(EMPTY_KEY)};\n  OccupyResult 
occupy_result{OccupyResult::INITIAL};\n  VecD_Comp target_digests{0};\n  V* bucket_values_ptr{nullptr};\n  K* bucket_keys_ptr{nullptr};\n  uint32_t key_pos = {0};\n  if (kv_idx < n) {\n    key = keys[kv_idx];\n    if (src_offset) src_offset[kv_idx] = kv_idx;\n    if (!IS_RESERVED_KEY<K>(key)) {\n      const K hashed_key = Murmur3HashDevice(key);\n      target_digests = digests_from_hashed<K>(hashed_key);\n      uint64_t global_idx =\n          static_cast<uint64_t>(hashed_key % (buckets_num * bucket_capacity));\n      key_pos = get_start_position(global_idx, bucket_capacity);\n      uint64_t bkt_idx = global_idx / bucket_capacity;\n      BUCKET* bucket = buckets + bkt_idx;\n      bucket_keys_ptr = reinterpret_cast<K*>(bucket->keys(0));\n      bucket_values_ptr = bucket->vectors;\n    } else {\n      key_ptrs[kv_idx] = nullptr;\n      return;\n    }\n  } else {\n    return;\n  }\n\n  // Load `STRIDE` digests every time.\n  constexpr uint32_t STRIDE = sizeof(VecD_Load) / sizeof(D);\n  // One more loop to handle empty keys.\n  for (int offset = 0; offset < bucket_capacity + STRIDE; offset += STRIDE) {\n    if (occupy_result != OccupyResult::INITIAL) break;\n\n    uint32_t pos_cur = align_to<STRIDE>(key_pos);\n    pos_cur = (pos_cur + offset) & (bucket_capacity - 1);\n\n    D* digests_ptr = BUCKET::digests(bucket_keys_ptr, bucket_capacity, pos_cur);\n    VecD_Load digests_vec = *(reinterpret_cast<VecD_Load*>(digests_ptr));\n    VecD_Comp digests_arr[4] = {digests_vec.x, digests_vec.y, digests_vec.z,\n                                digests_vec.w};\n\n    for (int i = 0; i < 4; i++) {\n      VecD_Comp probe_digests = digests_arr[i];\n      uint32_t possible_pos = 0;\n      bool result = false;\n      // Perform a vectorized comparison by byte,\n      // and if they are equal, set the corresponding byte in the result to\n      // 0xff.\n      int cmp_result = __vcmpeq4(probe_digests, target_digests);\n      cmp_result &= 0x01010101;\n      do {\n        if (cmp_result == 
0) break;\n        // CUDA uses little endian,\n        // and the lowest byte in register stores in the lowest address.\n        uint32_t index = (__ffs(cmp_result) - 1) >> 3;\n        cmp_result &= (cmp_result - 1);\n        possible_pos = pos_cur + i * 4 + index;\n        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);\n        K expected_key = key;\n        // Modifications to the bucket will not happen before this instruction.\n        result = current_key->compare_exchange_strong(\n            expected_key, static_cast<K>(LOCKED_KEY),\n            cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);\n      } while (!result);\n      if (result) {\n        key_pos = possible_pos;\n        V* bucket_value_ptr = bucket_values_ptr + key_pos * dim;\n        values[kv_idx] = bucket_value_ptr;\n        key_ptrs[kv_idx] = bucket_keys_ptr + key_pos;\n        return;\n      }\n      VecD_Comp empty_digests_ = empty_digests<K>();\n      cmp_result = __vcmpeq4(probe_digests, empty_digests_);\n      cmp_result &= 0x01010101;\n      do {\n        if (cmp_result == 0) break;\n        uint32_t index = (__ffs(cmp_result) - 1) >> 3;\n        cmp_result &= (cmp_result - 1);\n        possible_pos = pos_cur + i * 4 + index;\n        if (offset == 0 && possible_pos < key_pos) continue;\n        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);\n        auto probe_key = current_key->load(cuda::std::memory_order_relaxed);\n        if (probe_key == static_cast<K>(EMPTY_KEY)) {\n          return;\n        }\n      } while (true);\n    }\n  }\n}\n\ntemplate <class K, class V, class S, uint32_t TILE_SIZE = 4>\n__global__ void update_values_kernel(const Table<K, V, S>* __restrict table,\n                                     Bucket<K, V, S>* buckets,\n                                     const size_t bucket_max_size,\n                                     const size_t buckets_num, const size_t dim,\n                                     const K* 
__restrict keys,\n                                     V** __restrict vectors,\n                                     int* __restrict src_offset, size_t N) {\n  auto g = cg::tiled_partition<TILE_SIZE>(cg::this_thread_block());\n  int* buckets_size = table->buckets_size;\n\n  for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N;\n       t += blockDim.x * gridDim.x) {\n    int key_pos = -1;\n    size_t key_idx = t / TILE_SIZE;\n\n    const K update_key = keys[key_idx];\n\n    if (IS_RESERVED_KEY<K>(update_key)) continue;\n\n    size_t bkt_idx = 0;\n    size_t start_idx = 0;\n    int src_lane = -1;\n\n    Bucket<K, V, S>* bucket = get_key_position<K>(\n        buckets, update_key, bkt_idx, start_idx, buckets_num, bucket_max_size);\n\n    OccupyResult occupy_result{OccupyResult::INITIAL};\n    const int bucket_size = buckets_size[bkt_idx];\n    *(src_offset + key_idx) = key_idx;\n\n    if (bucket_size >= bucket_max_size) {\n      start_idx = (start_idx / TILE_SIZE) * TILE_SIZE;\n    }\n    occupy_result = find_and_lock_for_update<K, V, S, TILE_SIZE>(\n        g, bucket, update_key, start_idx, key_pos, src_lane, bucket_max_size);\n\n    occupy_result = g.shfl(occupy_result, src_lane);\n\n    if (occupy_result == OccupyResult::REFUSED) continue;\n\n    if (g.thread_rank() == src_lane) {\n      if (occupy_result == OccupyResult::DUPLICATE) {\n        *(vectors + key_idx) = (bucket->vectors + key_pos * dim);\n      } else {\n        *(vectors + key_idx) = nullptr;\n      }\n    }\n\n    if (g.thread_rank() == src_lane) {\n      (bucket->keys(key_pos))\n          ->store(update_key, cuda::std::memory_order_relaxed);\n    }\n  }\n}\n\n}  // namespace merlin\n}  // namespace nv"
  },
  {
    "path": "include/merlin/core_kernels/upsert.cuh",
    "content": "/*\n * Copyright (c) 2023, NVIDIA CORPORATION.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http:///www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n#pragma once\n\n#include \"kernel_utils.cuh\"\n\nnamespace nv {\nnamespace merlin {\n\n// Use 1 thread to deal with a KV-pair, including copying value.\ntemplate <typename K = uint64_t, typename V = byte4, typename S = uint64_t,\n          typename VecV = byte16, uint32_t BLOCK_SIZE = 128, int Strategy = -1>\n__global__ void tlp_v1_upsert_kernel_with_io(\n    Bucket<K, V, S>* __restrict__ buckets, int32_t* __restrict__ buckets_size,\n    const uint64_t buckets_num, uint32_t bucket_capacity, const uint32_t dim,\n    const K* __restrict__ keys, const VecV* __restrict__ values,\n    const S* __restrict__ scores, uint64_t n, const S global_epoch) {\n  using BUCKET = Bucket<K, V, S>;\n  using CopyValue = CopyValueMultipleGroup<VecV, 1>;\n  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;\n\n  // bucket_capacity is a multiple of 4.\n  constexpr uint32_t STRIDE_S = 4;\n  constexpr uint32_t Load_LEN_S = sizeof(byte16) / sizeof(S);\n  __shared__ S sm_bucket_scores[BLOCK_SIZE][2 * STRIDE_S];\n\n  uint32_t tx = threadIdx.x;\n  uint32_t kv_idx = blockIdx.x * blockDim.x + tx;\n  K key{static_cast<K>(EMPTY_KEY)};\n  S score{static_cast<S>(EMPTY_SCORE)};\n  OccupyResult occupy_result{OccupyResult::INITIAL};\n  VecD_Comp target_digests{0};\n  VecV* bucket_values_ptr{nullptr};\n  K* bucket_keys_ptr{nullptr};\n  int32_t* 
bucket_size_ptr{nullptr};\n  uint32_t key_pos = {0};\n  uint32_t bucket_size{0};\n  if (kv_idx < n) {\n    key = keys[kv_idx];\n    score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);\n\n    if (!IS_RESERVED_KEY<K>(key)) {\n      const K hashed_key = Murmur3HashDevice(key);\n      target_digests = digests_from_hashed<K>(hashed_key);\n      uint64_t global_idx =\n          static_cast<uint64_t>(hashed_key % (buckets_num * bucket_capacity));\n      key_pos = get_start_position(global_idx, bucket_capacity);\n      uint64_t bkt_idx = global_idx / bucket_capacity;\n      bucket_size_ptr = buckets_size + bkt_idx;\n      BUCKET* bucket = buckets + bkt_idx;\n      bucket_size = *bucket_size_ptr;\n      bucket_keys_ptr = reinterpret_cast<K*>(bucket->keys(0));\n      bucket_values_ptr = reinterpret_cast<VecV*>(bucket->vectors);\n    } else {\n      return;\n    }\n  } else {\n    return;\n  }\n\n  // Load `STRIDE` digests every time.\n  constexpr uint32_t STRIDE = sizeof(VecD_Load) / sizeof(D);\n  // One more loop to handle empty keys.\n  for (int offset = 0; offset < bucket_capacity + STRIDE; offset += STRIDE) {\n    if (occupy_result != OccupyResult::INITIAL) break;\n\n    uint32_t pos_cur = align_to<STRIDE>(key_pos);\n    pos_cur = (pos_cur + offset) & (bucket_capacity - 1);\n\n    D* digests_ptr = BUCKET::digests(bucket_keys_ptr, bucket_capacity, pos_cur);\n    VecD_Load digests_vec = *(reinterpret_cast<VecD_Load*>(digests_ptr));\n    VecD_Comp digests_arr[4] = {digests_vec.x, digests_vec.y, digests_vec.z,\n                                digests_vec.w};\n\n    for (int i = 0; i < 4; i++) {\n      VecD_Comp probe_digests = digests_arr[i];\n      uint32_t possible_pos = 0;\n      bool result = false;\n      // Perform a vectorized comparison by byte,\n      // and if they are equal, set the corresponding byte in the result to\n      // 0xff.\n      int cmp_result = __vcmpeq4(probe_digests, target_digests);\n      cmp_result &= 0x01010101;\n      do 
{\n        if (cmp_result == 0) break;\n        // CUDA uses little endian,\n        // and the lowest byte in register stores in the lowest address.\n        uint32_t index = (__ffs(cmp_result) - 1) >> 3;\n        cmp_result &= (cmp_result - 1);\n        possible_pos = pos_cur + i * 4 + index;\n        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);\n        K expected_key = key;\n        // Modifications to the bucket will not happen before this instruction.\n        result = current_key->compare_exchange_strong(\n            expected_key, static_cast<K>(LOCKED_KEY),\n            cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);\n      } while (!result);\n      if (result) {\n        occupy_result = OccupyResult::DUPLICATE;\n        key_pos = possible_pos;\n        ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores,\n                                         kv_idx, score, bucket_capacity,\n                                         get_digest<K>(key), false);\n\n        break;\n      } else if (bucket_size == bucket_capacity) {\n        continue;\n      }\n      VecD_Comp empty_digests_ = empty_digests<K>();\n      cmp_result = __vcmpeq4(probe_digests, empty_digests_);\n      cmp_result &= 0x01010101;\n      do {\n        if (cmp_result == 0) break;\n        uint32_t index = (__ffs(cmp_result) - 1) >> 3;\n        cmp_result &= (cmp_result - 1);\n        possible_pos = pos_cur + i * 4 + index;\n        if (offset == 0 && possible_pos < key_pos) continue;\n        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);\n        K expected_key = static_cast<K>(EMPTY_KEY);\n        result = current_key->compare_exchange_strong(\n            expected_key, static_cast<K>(LOCKED_KEY),\n            cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);\n      } while (!result);\n      if (result) {\n        occupy_result = OccupyResult::OCCUPIED_EMPTY;\n        key_pos = possible_pos;\n        
ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores,\n                                         kv_idx, score, bucket_capacity,\n                                         get_digest<K>(key), true);\n        atomicAdd(bucket_size_ptr, 1);\n        break;\n      }\n    }\n  }\n  while (occupy_result == OccupyResult::INITIAL) {\n    S* bucket_scores_ptr = BUCKET::scores(bucket_keys_ptr, bucket_capacity, 0);\n    S min_score = static_cast<S>(MAX_SCORE);\n    int min_pos = -1;\n#pragma unroll\n    for (int j = 0; j < STRIDE_S; j += Load_LEN_S) {\n      __pipeline_memcpy_async(sm_bucket_scores[tx] + j, bucket_scores_ptr + j,\n                              sizeof(S) * Load_LEN_S);\n    }\n    __pipeline_commit();\n    for (int i = 0; i < bucket_capacity; i += STRIDE_S) {\n      if (i < bucket_capacity - STRIDE_S) {\n#pragma unroll\n        for (int j = 0; j < STRIDE_S; j += Load_LEN_S) {\n          __pipeline_memcpy_async(\n              sm_bucket_scores[tx] + diff_buf(i / STRIDE_S) * STRIDE_S + j,\n              bucket_scores_ptr + i + STRIDE_S + j, sizeof(S) * Load_LEN_S);\n        }\n      }\n      __pipeline_commit();\n      __pipeline_wait_prior(1);\n      S temp_scores[Load_LEN_S];\n      S* src = sm_bucket_scores[tx] + same_buf(i / STRIDE_S) * STRIDE_S;\n#pragma unroll\n      for (int k = 0; k < STRIDE_S; k += Load_LEN_S) {\n        *reinterpret_cast<byte16*>(temp_scores) =\n            *reinterpret_cast<byte16*>(src + k);\n#pragma unroll\n        for (int j = 0; j < Load_LEN_S; j += 1) {\n          S temp_score = temp_scores[j];\n          if (temp_score < min_score) {\n            min_score = temp_score;\n            min_pos = i + k + j;\n          }\n        }\n      }\n    }\n\n    score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);\n    if (score < min_score) {\n      occupy_result = OccupyResult::REFUSED;\n      break;\n    }\n    auto min_score_key = BUCKET::keys(bucket_keys_ptr, min_pos);\n    auto expected_key = 
min_score_key->load(cuda::std::memory_order_relaxed);\n    if (expected_key != static_cast<K>(LOCKED_KEY) &&\n        expected_key != static_cast<K>(EMPTY_KEY)) {\n      bool result = min_score_key->compare_exchange_strong(\n          expected_key, static_cast<K>(LOCKED_KEY),\n          cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);\n      if (result) {\n        S* min_score_ptr =\n            BUCKET::scores(bucket_keys_ptr, bucket_capacity, min_pos);\n        auto verify_score_ptr =\n            reinterpret_cast<AtomicScore<S>*>(min_score_ptr);\n        auto verify_score =\n            verify_score_ptr->load(cuda::std::memory_order_relaxed);\n        if (verify_score <= min_score) {\n          key_pos = min_pos;\n          ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores,\n                                           kv_idx, score, bucket_capacity,\n                                           get_digest<K>(key), true);\n\n          if (expected_key == static_cast<K>(RECLAIM_KEY)) {\n            occupy_result = OccupyResult::OCCUPIED_RECLAIMED;\n            atomicAdd(bucket_size_ptr, 1);\n          } else {\n            occupy_result = OccupyResult::EVICT;\n          }\n        } else {\n          min_score_key->store(expected_key, cuda::std::memory_order_release);\n        }\n      }\n    }\n  }\n  VecV* bucket_value_ptr = bucket_values_ptr + key_pos * dim;\n  const VecV* param_value_ptr = values + kv_idx * dim;\n\n  if (occupy_result != OccupyResult::REFUSED) {\n    CopyValue::ldg_stg(0, bucket_value_ptr, param_value_ptr, dim);\n    auto key_address = BUCKET::keys(bucket_keys_ptr, key_pos);\n    // memory_order_release:\n    // Modifications to the bucket will not happen after this instruction.\n    key_address->store(key, cuda::std::memory_order_release);\n  }\n}\n\n// Use 1 thread to deal with a KV-pair, including copying value.\ntemplate <typename K = uint64_t, typename V = byte4, typename S = uint64_t,\n          typename VecV = 
byte16, uint32_t BLOCK_SIZE = 128,\n          uint32_t GROUP_SIZE = 16, int Strategy = -1>\n__global__ void tlp_v2_upsert_kernel_with_io(\n    Bucket<K, V, S>* __restrict__ buckets, int32_t* __restrict__ buckets_size,\n    const uint64_t buckets_num, uint32_t bucket_capacity, const uint32_t dim,\n    const K* __restrict__ keys, const VecV* __restrict__ values,\n    const S* __restrict__ scores, uint64_t n, const S global_epoch) {\n  using BUCKET = Bucket<K, V, S>;\n  using CopyValue = CopyValueMultipleGroup<VecV, GROUP_SIZE>;\n  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;\n\n  auto g = cg::tiled_partition<GROUP_SIZE>(cg::this_thread_block());\n\n  // bucket_capacity is a multiple of 4.\n  constexpr uint32_t STRIDE_S = 4;\n  constexpr uint32_t Load_LEN_S = sizeof(byte16) / sizeof(S);\n  __shared__ S sm_bucket_scores[BLOCK_SIZE][2 * STRIDE_S];\n\n  uint32_t tx = threadIdx.x;\n  uint32_t kv_idx = blockIdx.x * blockDim.x + tx;\n  K key{static_cast<K>(EMPTY_KEY)};\n  S score{static_cast<S>(EMPTY_SCORE)};\n  OccupyResult occupy_result{OccupyResult::INITIAL};\n  VecD_Comp target_digests{0};\n  VecV* bucket_values_ptr{nullptr};\n  K* bucket_keys_ptr{nullptr};\n  int32_t* bucket_size_ptr{nullptr};\n  uint32_t key_pos = {0};\n  uint32_t bucket_size{0};\n  if (kv_idx < n) {\n    key = keys[kv_idx];\n    score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);\n\n    if (!IS_RESERVED_KEY<K>(key)) {\n      const K hashed_key = Murmur3HashDevice(key);\n      target_digests = digests_from_hashed<K>(hashed_key);\n      uint64_t global_idx =\n          static_cast<uint64_t>(hashed_key % (buckets_num * bucket_capacity));\n      key_pos = get_start_position(global_idx, bucket_capacity);\n      uint64_t bkt_idx = global_idx / bucket_capacity;\n      bucket_size_ptr = buckets_size + bkt_idx;\n      BUCKET* bucket = buckets + bkt_idx;\n      bucket_size = *bucket_size_ptr;\n      bucket_keys_ptr = reinterpret_cast<K*>(bucket->keys(0));\n      bucket_values_ptr 
= reinterpret_cast<VecV*>(bucket->vectors);\n    } else {\n      occupy_result = OccupyResult::ILLEGAL;\n    }\n  } else {\n    occupy_result = OccupyResult::ILLEGAL;\n  }\n\n  // Load `STRIDE` digests every time.\n  constexpr uint32_t STRIDE = sizeof(VecD_Load) / sizeof(D);\n  // One more loop to handle empty keys.\n  for (int offset = 0; offset < bucket_capacity + STRIDE; offset += STRIDE) {\n    if (occupy_result != OccupyResult::INITIAL) break;\n\n    uint32_t pos_cur = align_to<STRIDE>(key_pos);\n    pos_cur = (pos_cur + offset) & (bucket_capacity - 1);\n\n    D* digests_ptr = BUCKET::digests(bucket_keys_ptr, bucket_capacity, pos_cur);\n    VecD_Load digests_vec = *(reinterpret_cast<VecD_Load*>(digests_ptr));\n    VecD_Comp digests_arr[4] = {digests_vec.x, digests_vec.y, digests_vec.z,\n                                digests_vec.w};\n\n    for (int i = 0; i < 4; i++) {\n      VecD_Comp probe_digests = digests_arr[i];\n      uint32_t possible_pos = 0;\n      bool result = false;\n      // Perform a vectorized comparison by byte,\n      // and if they are equal, set the corresponding byte in the result to\n      // 0xff.\n      int cmp_result = __vcmpeq4(probe_digests, target_digests);\n      cmp_result &= 0x01010101;\n      do {\n        if (cmp_result == 0) break;\n        // CUDA uses little endian,\n        // and the lowest byte in register stores in the lowest address.\n        uint32_t index = (__ffs(cmp_result) - 1) >> 3;\n        cmp_result &= (cmp_result - 1);\n        possible_pos = pos_cur + i * 4 + index;\n        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);\n        K expected_key = key;\n        // Modifications to the bucket will not happen before this instruction.\n        result = current_key->compare_exchange_strong(\n            expected_key, static_cast<K>(LOCKED_KEY),\n            cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);\n      } while (!result);\n      if (result) {\n        occupy_result = 
OccupyResult::DUPLICATE;\n        key_pos = possible_pos;\n        ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores,\n                                         kv_idx, score, bucket_capacity,\n                                         get_digest<K>(key), false);\n        break;\n      } else if (bucket_size == bucket_capacity) {\n        continue;\n      }\n      VecD_Comp empty_digests_ = empty_digests<K>();\n      cmp_result = __vcmpeq4(probe_digests, empty_digests_);\n      cmp_result &= 0x01010101;\n      do {\n        if (cmp_result == 0) break;\n        uint32_t index = (__ffs(cmp_result) - 1) >> 3;\n        cmp_result &= (cmp_result - 1);\n        possible_pos = pos_cur + i * 4 + index;\n        if (offset == 0 && possible_pos < key_pos) continue;\n        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);\n        K expected_key = static_cast<K>(EMPTY_KEY);\n        result = current_key->compare_exchange_strong(\n            expected_key, static_cast<K>(LOCKED_KEY),\n            cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);\n      } while (!result);\n      if (result) {\n        occupy_result = OccupyResult::OCCUPIED_EMPTY;\n        key_pos = possible_pos;\n        ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores,\n                                         kv_idx, score, bucket_capacity,\n                                         get_digest<K>(key), true);\n        atomicAdd(bucket_size_ptr, 1);\n        break;\n      }\n    }\n  }\n  while (occupy_result == OccupyResult::INITIAL) {\n    S* bucket_scores_ptr = BUCKET::scores(bucket_keys_ptr, bucket_capacity, 0);\n    S min_score = static_cast<S>(MAX_SCORE);\n    int min_pos = -1;\n#pragma unroll\n    for (int j = 0; j < STRIDE_S; j += Load_LEN_S) {\n      __pipeline_memcpy_async(sm_bucket_scores[tx] + j, bucket_scores_ptr + j,\n                              sizeof(S) * Load_LEN_S);\n    }\n    __pipeline_commit();\n    for (int i = 0; i < 
bucket_capacity; i += STRIDE_S) {\n      if (i < bucket_capacity - STRIDE_S) {\n#pragma unroll\n        for (int j = 0; j < STRIDE_S; j += Load_LEN_S) {\n          __pipeline_memcpy_async(\n              sm_bucket_scores[tx] + diff_buf(i / STRIDE_S) * STRIDE_S + j,\n              bucket_scores_ptr + i + STRIDE_S + j, sizeof(S) * Load_LEN_S);\n        }\n      }\n      __pipeline_commit();\n      __pipeline_wait_prior(1);\n      S temp_scores[Load_LEN_S];\n      S* src = sm_bucket_scores[tx] + same_buf(i / STRIDE_S) * STRIDE_S;\n#pragma unroll\n      for (int k = 0; k < STRIDE_S; k += Load_LEN_S) {\n        *reinterpret_cast<byte16*>(temp_scores) =\n            *reinterpret_cast<byte16*>(src + k);\n#pragma unroll\n        for (int j = 0; j < Load_LEN_S; j += 1) {\n          S temp_score = temp_scores[j];\n          if (temp_score < min_score) {\n            auto verify_key_ptr = BUCKET::keys(bucket_keys_ptr, i + k + j);\n            auto verify_key =\n                verify_key_ptr->load(cuda::std::memory_order_relaxed);\n            if (verify_key != static_cast<K>(LOCKED_KEY) &&\n                verify_key != static_cast<K>(EMPTY_KEY)) {\n              min_score = temp_score;\n              min_pos = i + k + j;\n            }\n          }\n        }\n      }\n    }\n\n    score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);\n    if (score < min_score) {\n      occupy_result = OccupyResult::REFUSED;\n      break;\n    }\n    auto min_score_key = BUCKET::keys(bucket_keys_ptr, min_pos);\n    auto expected_key = min_score_key->load(cuda::std::memory_order_relaxed);\n    if (expected_key != static_cast<K>(LOCKED_KEY) &&\n        expected_key != static_cast<K>(EMPTY_KEY)) {\n      bool result = min_score_key->compare_exchange_strong(\n          expected_key, static_cast<K>(LOCKED_KEY),\n          cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);\n      if (result) {\n        S* min_score_ptr =\n            
BUCKET::scores(bucket_keys_ptr, bucket_capacity, min_pos);\n        auto verify_score_ptr =\n            reinterpret_cast<AtomicScore<S>*>(min_score_ptr);\n        auto verify_score =\n            verify_score_ptr->load(cuda::std::memory_order_relaxed);\n        if (verify_score <= min_score) {\n          key_pos = min_pos;\n          ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores,\n                                           kv_idx, score, bucket_capacity,\n                                           get_digest<K>(key), true);\n          if (expected_key == static_cast<K>(RECLAIM_KEY)) {\n            occupy_result = OccupyResult::OCCUPIED_RECLAIMED;\n            atomicAdd(bucket_size_ptr, 1);\n          } else {\n            occupy_result = OccupyResult::EVICT;\n          }\n        } else {\n          min_score_key->store(expected_key, cuda::std::memory_order_release);\n        }\n      }\n    }\n  }\n\n  VecV* bucket_value_ptr{nullptr};\n  if ((occupy_result != OccupyResult::ILLEGAL) &&\n      (occupy_result != OccupyResult::REFUSED)) {\n    bucket_value_ptr = bucket_values_ptr + key_pos * dim;\n  }\n  __syncthreads();\n  uint32_t rank = g.thread_rank();\n  uint32_t groupID = threadIdx.x / GROUP_SIZE;\n\n  // Shared memory reuse:\n  // __shared__ S sm_bucket_scores[BLOCK_SIZE][2 * STRIDE_S];\n  // __shared__ VecV sm_values_buffer[GROUP_NUM][2][GROUP_BUF];\n  // assert(GROUP_BUF >= 2 * dim);\n  constexpr uint32_t GROUP_BUFs =\n      GROUP_SIZE * 2 * STRIDE_S * sizeof(S) / sizeof(VecV);\n  constexpr uint32_t GROUP_BUF = GROUP_BUFs / 2;\n  auto sm_values_buffer =\n      reinterpret_cast<VecV*>(&(sm_bucket_scores[0][0])) + groupID * GROUP_BUFs;\n\n  auto occupy_result_next = g.shfl(occupy_result, 0);\n  if ((occupy_result_next != OccupyResult::ILLEGAL) &&\n      (occupy_result_next != OccupyResult::REFUSED)) {\n    VecV* dst = sm_values_buffer;\n    auto kv_idx_next = g.shfl(kv_idx, 0);\n    const VecV* src = values + kv_idx_next * dim;\n    
CopyValue::ldg_sts(rank, dst, src, dim);\n  }\n  __pipeline_commit();\n\n  for (int i = 0; i < GROUP_SIZE; i++) {\n    if (i + 1 < GROUP_SIZE) {\n      auto occupy_result_next = g.shfl(occupy_result, i + 1);\n      if ((occupy_result_next != OccupyResult::ILLEGAL) &&\n          (occupy_result_next != OccupyResult::REFUSED)) {\n        VecV* dst = sm_values_buffer + diff_buf(i) * GROUP_BUF;\n        auto kv_idx_next = g.shfl(kv_idx, i + 1);\n        const VecV* src = values + kv_idx_next * dim;\n        CopyValue::ldg_sts(rank, dst, src, dim);\n      }\n    }\n    __pipeline_commit();\n    auto occupy_result_cur = g.shfl(occupy_result, i);\n    if ((occupy_result_cur != OccupyResult::ILLEGAL) &&\n        (occupy_result_cur != OccupyResult::REFUSED)) {\n      VecV* src = sm_values_buffer + same_buf(i) * GROUP_BUF;\n      __pipeline_wait_prior(0);\n      VecV* dst = g.shfl(bucket_value_ptr, i);\n      __pipeline_wait_prior(1);\n      CopyValue::lds_stg(rank, dst, src, dim);\n    }\n  }\n\n  if ((occupy_result != OccupyResult::ILLEGAL) &&\n      (occupy_result != OccupyResult::REFUSED)) {\n    auto key_address = BUCKET::keys(bucket_keys_ptr, key_pos);\n    // memory_order_release:\n    // Modifications to the bucket will not be reordered after this\n    // instruction.\n    key_address->store(key, cuda::std::memory_order_release);\n  }\n}\n\ntemplate <\n    typename K, typename V, typename S, typename VecV, uint32_t BLOCK_SIZE,\n    uint32_t GROUP_SIZE, uint32_t BUCKET_SIZE,\n    uint32_t GROUP_NUM = BLOCK_SIZE / GROUP_SIZE, uint32_t OFST_ParamScores = 0,\n    uint32_t OFST_BucketValuesPtr = OFST_ParamScores + sizeof(S) * BLOCK_SIZE,\n    uint32_t OFST_BucketsSizePtr =\n        OFST_BucketValuesPtr + sizeof(VecV*) * BLOCK_SIZE,\n    uint32_t OFST_BucketDigests =\n        OFST_BucketsSizePtr + sizeof(int*) * BLOCK_SIZE,\n    uint32_t OFST_BucketScores =\n        OFST_BucketDigests + sizeof(D) * GROUP_NUM * 2 * BUCKET_SIZE,\n    uint32_t OFST_BucketValues =\n        OFST_BucketScores + 
sizeof(S) * GROUP_NUM * 2 * BUCKET_SIZE>\nstruct SharedMemoryManager_Pipeline_Upsert {\n  /*\n    __shared__ S sm_param_scores[BLOCK_SIZE];\n    __shared__ VecV* sm_bucket_values_ptr[BLOCK_SIZE];\n    __shared__ int* sm_buckets_size_ptr[BLOCK_SIZE];\n    __shared__ D sm_bucket_digests[GROUP_NUM][2][BUCKET_SIZE];\n    __shared__ S sm_bucket_scores[GROUP_NUM][2][BUCKET_SIZE];\n    __shared__ VecV sm_values_buffer[GROUP_NUM][2][dim];\n  */\n  static inline uint32_t total_size(uint32_t dim) {\n    return BLOCK_SIZE * (sizeof(S) + sizeof(VecV*) + sizeof(int*)) +\n           GROUP_NUM * 2 *\n               (BUCKET_SIZE * (sizeof(D) + sizeof(S)) + dim * sizeof(VecV));\n  }\n  static __forceinline__ __device__ S* param_scores(byte* smem) {\n    return reinterpret_cast<S*>(smem + OFST_ParamScores);\n  }\n  static __forceinline__ __device__ VecV** bucket_values_ptr(byte* smem) {\n    return reinterpret_cast<VecV**>(smem + OFST_BucketValuesPtr);\n  }\n  static __forceinline__ __device__ int** buckets_size_ptr(byte* smem) {\n    return reinterpret_cast<int**>(smem + OFST_BucketsSizePtr);\n  }\n  static __forceinline__ __device__ D* bucket_digests(byte* smem,\n                                                      uint32_t groupID,\n                                                      uint32_t buf) {\n    return reinterpret_cast<D*>(smem + OFST_BucketDigests) +\n           BUCKET_SIZE * (groupID * 2 + buf);\n  }\n  static __forceinline__ __device__ S* bucket_scores(byte* smem,\n                                                     uint32_t groupID,\n                                                     uint32_t buf) {\n    return reinterpret_cast<S*>(smem + OFST_BucketScores) +\n           BUCKET_SIZE * (groupID * 2 + buf);\n  }\n  static __forceinline__ __device__ VecV* values_buffer(byte* smem,\n                                                        uint32_t groupID,\n                                                        uint32_t buf,\n                                        
                uint32_t dim) {\n    return reinterpret_cast<VecV*>(smem + OFST_BucketValues) +\n           dim * (groupID * 2 + buf);\n  }\n};\n\ntemplate <typename K = uint64_t, typename V = byte4, typename S = uint64_t,\n          typename VecV = byte16, uint32_t BLOCK_SIZE = 128, int Strategy = -1>\n__global__ void pipeline_upsert_kernel_with_io(\n    Bucket<K, V, S>* __restrict__ buckets, int32_t* __restrict__ buckets_size,\n    const uint64_t buckets_num, const uint32_t dim, const K* __restrict__ keys,\n    const VecV* __restrict__ values, const S* __restrict__ scores, uint64_t n,\n    const S global_epoch) {\n  // Here, GROUP_SIZE * Comp_LEN = BUCKET_SIZE.\n  constexpr uint32_t BUCKET_SIZE = 128;\n  constexpr uint32_t GROUP_SIZE = 32;\n  constexpr uint32_t Comp_LEN = sizeof(VecD_Comp) / sizeof(D);\n  constexpr uint32_t Load_LEN = sizeof(VecD_Load) / sizeof(D);\n  constexpr uint32_t Load_LEN_S = sizeof(byte16) / sizeof(S);\n\n  using BUCKET = Bucket<K, V, S>;\n  using CopyValue = CopyValueMultipleGroup<VecV, GROUP_SIZE>;\n  using SMM = SharedMemoryManager_Pipeline_Upsert<K, V, S, VecV, BLOCK_SIZE,\n                                                  GROUP_SIZE, BUCKET_SIZE>;\n  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;\n\n  __shared__ extern __align__(alignof(byte16)) byte smem[];\n\n  // Initialization.\n  auto g = cg::tiled_partition<GROUP_SIZE>(cg::this_thread_block());\n  uint32_t tx = threadIdx.x;\n  uint32_t kv_idx = blockIdx.x * blockDim.x + tx;\n  K key{static_cast<K>(EMPTY_KEY)};\n  VecD_Comp target_digests;\n  K* bucket_keys_ptr{nullptr};\n  OccupyResult occupy_result{OccupyResult::INITIAL};\n  uint32_t key_pos = 0;\n  if (kv_idx < n) {\n    key = keys[kv_idx];\n    if (scores != nullptr) {\n      S* sm_param_scores = SMM::param_scores(smem);\n      __pipeline_memcpy_async(sm_param_scores + tx, scores + kv_idx, sizeof(S));\n    }\n    if (!IS_RESERVED_KEY<K>(key)) {\n      const K hashed_key = Murmur3HashDevice(key);\n      target_digests 
= digests_from_hashed<K>(hashed_key);\n      uint64_t global_idx =\n          static_cast<uint64_t>(hashed_key % (buckets_num * BUCKET_SIZE));\n      uint64_t bkt_idx = global_idx / BUCKET_SIZE;\n      key_pos = get_start_position(global_idx, BUCKET_SIZE);\n      int** sm_buckets_size_ptr = SMM::buckets_size_ptr(smem);\n      sm_buckets_size_ptr[tx] = buckets_size + bkt_idx;\n      BUCKET* bucket = buckets + bkt_idx;\n      bucket_keys_ptr = reinterpret_cast<K*>(bucket->keys(0));\n      VecV** sm_bucket_values_ptr = SMM::bucket_values_ptr(smem);\n      __pipeline_memcpy_async(sm_bucket_values_ptr + tx, &(bucket->vectors),\n                              sizeof(VecV*));\n    } else {\n      occupy_result = OccupyResult::ILLEGAL;\n    }\n  } else {\n    occupy_result = OccupyResult::ILLEGAL;\n  }\n\n  uint32_t rank = g.thread_rank();\n  uint32_t groupID = threadIdx.x / GROUP_SIZE;\n\n  // Pipeline loading.\n  auto occupy_result_next = g.shfl(occupy_result, 0);\n  auto keys_ptr_next = g.shfl(bucket_keys_ptr, 0);\n  if (occupy_result_next == OccupyResult::INITIAL) {\n    D* sm_bucket_digests = SMM::bucket_digests(smem, groupID, 0);\n    D* dst = sm_bucket_digests + rank * Load_LEN;\n    D* src = BUCKET::digests(keys_ptr_next, BUCKET_SIZE, rank * Load_LEN);\n    if (rank * Load_LEN < BUCKET_SIZE) {\n      __pipeline_memcpy_async(dst, src, sizeof(VecD_Load));\n    }\n  }\n  __pipeline_commit();\n  // Padding, meet the param of the first `__pipeline_wait_prior`\n  // in the first loop.\n  __pipeline_commit();\n  __pipeline_commit();\n  for (int32_t i = 0; i < GROUP_SIZE; i++) {\n    // Step1: load digests from global memory to shared memory.\n    if (i + 1 < GROUP_SIZE) {\n      auto occupy_result_next = g.shfl(occupy_result, i + 1);\n      auto keys_ptr_next = g.shfl(bucket_keys_ptr, i + 1);\n      if (occupy_result_next == OccupyResult::INITIAL) {\n        D* sm_bucket_digests = SMM::bucket_digests(smem, groupID, diff_buf(i));\n        D* dst = sm_bucket_digests + rank * 
Load_LEN;\n        D* src = BUCKET::digests(keys_ptr_next, BUCKET_SIZE, rank * Load_LEN);\n        if (rank * Load_LEN < BUCKET_SIZE) {\n          __pipeline_memcpy_async(dst, src, sizeof(VecD_Load));\n        }\n      }\n    }\n    __pipeline_commit();\n    // Step2: to lock the target_key or empty_key by querying digests.\n    auto occupy_result_cur = g.shfl(occupy_result, i);\n    if (occupy_result_cur == OccupyResult::INITIAL) {\n      uint32_t tx_cur = groupID * GROUP_SIZE + i;\n      int** sm_buckets_size_ptr = SMM::buckets_size_ptr(smem);\n      auto bucket_size_ptr = sm_buckets_size_ptr[tx_cur];\n      K key_cur = g.shfl(key, i);\n      auto target_digests_cur = g.shfl(target_digests, i);\n      auto start_pos_cur = g.shfl(key_pos, i);\n      auto keys_ptr_cur = g.shfl(bucket_keys_ptr, i);\n      auto bucket_size_cur = *bucket_size_ptr;\n      __pipeline_wait_prior(3);\n      D* src = SMM::bucket_digests(smem, groupID, same_buf(i));\n      uint32_t start_offset = start_pos_cur / Comp_LEN;\n      uint32_t probe_offset =\n          Comp_LEN * ((start_offset + rank) & (GROUP_SIZE - 1));\n      VecD_Comp probe_digests =\n          *reinterpret_cast<VecD_Comp*>(src + probe_offset);\n      uint32_t cmp_result = __vcmpeq4(probe_digests, target_digests_cur);\n      cmp_result &= 0x01010101;\n      uint32_t possible_pos = 0;\n      bool result = false;\n      do {\n        if (cmp_result == 0) break;\n        int32_t index = (__ffs(cmp_result) - 1) >> 3;\n        cmp_result &= (cmp_result - 1);\n        possible_pos = probe_offset + index;\n        auto current_key = BUCKET::keys(keys_ptr_cur, possible_pos);\n        K expected_key = key_cur;\n        result = current_key->compare_exchange_strong(\n            expected_key, static_cast<K>(LOCKED_KEY),\n            cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);\n      } while (!result);\n      uint32_t found_vote = g.ballot(result);\n      if (found_vote) {\n        int32_t src_lane = 
__ffs(found_vote) - 1;\n        possible_pos = g.shfl(possible_pos, src_lane);\n        if (rank == i) {\n          occupy_result = OccupyResult::DUPLICATE;\n          key_pos = possible_pos;\n          S* sm_param_scores = SMM::param_scores(smem);\n          S score = ScoreFunctor::desired_when_missed(sm_param_scores, tx,\n                                                      global_epoch);\n          ScoreFunctor::update_with_digest(\n              bucket_keys_ptr, key_pos, sm_param_scores, tx, score, BUCKET_SIZE,\n              get_digest<K>(key), false);\n        }\n      } else if (bucket_size_cur < BUCKET_SIZE) {\n        VecD_Comp empty_digests_ = empty_digests<K>();\n        cmp_result = __vcmpeq4(probe_digests, empty_digests_);\n        cmp_result &= 0x01010101;\n        for (int32_t offset = 0; offset < GROUP_SIZE; offset += 1) {\n          if (rank == offset) {\n            do {\n              if (cmp_result == 0) break;\n              int32_t index = (__ffs(cmp_result) - 1) >> 3;\n              cmp_result &= (cmp_result - 1);\n              possible_pos = probe_offset + index;\n              auto current_key = BUCKET::keys(keys_ptr_cur, possible_pos);\n              K expected_key = static_cast<K>(EMPTY_KEY);\n              result = current_key->compare_exchange_strong(\n                  expected_key, static_cast<K>(LOCKED_KEY),\n                  cuda::std::memory_order_acquire,\n                  cuda::std::memory_order_relaxed);\n            } while (!result);\n          }\n          uint32_t found_vote = g.ballot(result);\n          if (found_vote) {\n            int32_t src_lane = __ffs(found_vote) - 1;\n            possible_pos = g.shfl(possible_pos, src_lane);\n            if (rank == i) {\n              occupy_result = OccupyResult::OCCUPIED_EMPTY;\n              S* sm_param_scores = SMM::param_scores(smem);\n              S score = ScoreFunctor::desired_when_missed(sm_param_scores, tx,\n                                                          
global_epoch);\n              key_pos = possible_pos;\n              ScoreFunctor::update_with_digest(\n                  bucket_keys_ptr, key_pos, sm_param_scores, tx, score,\n                  BUCKET_SIZE, get_digest<K>(key), true);\n              atomicAdd(bucket_size_ptr, 1);\n            }\n            break;\n          }\n        }\n      }\n      occupy_result_cur = g.shfl(occupy_result, i);\n      if (occupy_result_cur == OccupyResult::INITIAL) {\n        S* sm_bucket_scores = SMM::bucket_scores(smem, groupID, same_buf(i));\n        S* dst = sm_bucket_scores + rank * Load_LEN_S;\n        S* src = BUCKET::scores(keys_ptr_cur, BUCKET_SIZE, rank * Load_LEN_S);\n#pragma unroll\n        for (int32_t k = 0; k < BUCKET_SIZE; k += GROUP_SIZE * Load_LEN_S) {\n          __pipeline_memcpy_async(dst + k, src + k, sizeof(S) * Load_LEN_S);\n        }\n      }\n    }\n    __pipeline_commit();\n    // Step 3: reduce to get the key with the minimum score.\n    if (i > 0) {\n      occupy_result_cur = g.shfl(occupy_result, i - 1);\n      uint32_t tx_cur = groupID * GROUP_SIZE + i - 1;\n      S* sm_param_scores = SMM::param_scores(smem);\n      S score_cur = ScoreFunctor::desired_when_missed(sm_param_scores, tx_cur,\n                                                      global_epoch);\n      int** sm_buckets_size_ptr = SMM::buckets_size_ptr(smem);\n      auto bucket_size_ptr = sm_buckets_size_ptr[tx_cur];\n      __pipeline_wait_prior(3);\n      S* src = SMM::bucket_scores(smem, groupID, diff_buf(i));\n      while (occupy_result_cur == OccupyResult::INITIAL) {\n        int min_pos_local = -1;\n        S min_score_local = static_cast<S>(MAX_SCORE);\n#pragma unroll\n        for (int j = 0; j < BUCKET_SIZE; j += GROUP_SIZE * Load_LEN_S) {\n          S temp_scores[Load_LEN_S];\n          *reinterpret_cast<byte16*>(temp_scores) =\n              *reinterpret_cast<byte16*>(src + rank * Load_LEN_S + j);\n#pragma unroll\n          for (int k = 0; k < Load_LEN_S; k++) {\n            S 
temp_score = temp_scores[k];\n            if (temp_score < min_score_local) {\n              min_score_local = temp_score;\n              min_pos_local = rank * Load_LEN_S + j + k;\n            }\n          }\n        }\n        const S min_score_global =\n            cg::reduce(g, min_score_local, cg::less<S>());\n        if (score_cur < min_score_global) {\n          if (rank == i - 1) {\n            occupy_result = OccupyResult::REFUSED;\n          }\n          occupy_result_cur = g.shfl(occupy_result, i - 1);\n          break;\n        }\n        uint32_t vote = g.ballot(min_score_local <= min_score_global);\n        if (vote) {\n          int src_lane = __ffs(vote) - 1;\n          int min_pos_global = g.shfl(min_pos_local, src_lane);\n          if (rank == i - 1) {\n            src[min_pos_global] = static_cast<S>(MAX_SCORE);  // Mark visited.\n            auto min_score_key = BUCKET::keys(bucket_keys_ptr, min_pos_global);\n            auto expected_key =\n                min_score_key->load(cuda::std::memory_order_relaxed);\n            if (expected_key != static_cast<K>(LOCKED_KEY) &&\n                expected_key != static_cast<K>(EMPTY_KEY)) {\n              bool result = min_score_key->compare_exchange_strong(\n                  expected_key, static_cast<K>(LOCKED_KEY),\n                  cuda::std::memory_order_acquire,\n                  cuda::std::memory_order_relaxed);\n              if (result) {\n                S* score_ptr = BUCKET::scores(bucket_keys_ptr, BUCKET_SIZE,\n                                              min_pos_global);\n                auto verify_score_ptr =\n                    reinterpret_cast<AtomicScore<S>*>(score_ptr);\n                auto verify_score =\n                    verify_score_ptr->load(cuda::std::memory_order_relaxed);\n                if (verify_score <= min_score_global) {\n                  if (expected_key == static_cast<K>(RECLAIM_KEY)) {\n                    occupy_result = OccupyResult::OCCUPIED_RECLAIMED;\n  
                  atomicAdd(bucket_size_ptr, 1);\n                  } else {\n                    occupy_result = OccupyResult::EVICT;\n                  }\n                  key_pos = min_pos_global;\n                  ScoreFunctor::update_with_digest(\n                      bucket_keys_ptr, key_pos, sm_param_scores, tx_cur,\n                      score_cur, BUCKET_SIZE, get_digest<K>(key), true);\n                } else {\n                  min_score_key->store(expected_key,\n                                       cuda::std::memory_order_release);\n                }\n              }\n            }\n          }\n          occupy_result_cur = g.shfl(occupy_result, i - 1);\n        }\n      }\n      // Prefetch values to shared memory.\n      if (occupy_result_cur != OccupyResult::ILLEGAL &&\n          occupy_result_cur != OccupyResult::REFUSED) {\n        VecV* dst = SMM::values_buffer(smem, groupID, diff_buf(i), dim);\n        auto kv_idx_cur = g.shfl(kv_idx, i - 1);\n        const VecV* src = values + kv_idx_cur * dim;\n        CopyValue::ldg_sts(rank, dst, src, dim);\n      }\n    }\n    __pipeline_commit();\n\n    // Step 4: write values to bucket or param buffer.\n    if (i > 1) {\n      occupy_result_cur = g.shfl(occupy_result, i - 2);\n      if (occupy_result_cur != OccupyResult::ILLEGAL &&\n          occupy_result_cur != OccupyResult::REFUSED) {\n        VecV* src = SMM::values_buffer(smem, groupID, same_buf(i), dim);\n        VecV** sm_bucket_values_ptr = SMM::bucket_values_ptr(smem);\n        auto bucket_values_ptr =\n            sm_bucket_values_ptr[groupID * GROUP_SIZE + i - 2];\n        auto key_pos_cur = g.shfl(key_pos, i - 2);\n        VecV* dst = bucket_values_ptr + key_pos_cur * dim;\n        __pipeline_wait_prior(3);\n        CopyValue::lds_stg(rank, dst, src, dim);\n        if (rank == i - 2) {\n          auto key_address = BUCKET::keys(bucket_keys_ptr, key_pos);\n          key_address->store(key, cuda::std::memory_order_release);\n        }\n    
  }\n    }\n  }\n  auto occupy_result_cur = g.shfl(occupy_result, GROUP_SIZE - 1);\n  uint32_t tx_cur = groupID * GROUP_SIZE + GROUP_SIZE - 1;\n  S* sm_param_scores = SMM::param_scores(smem);\n  S score_cur =\n      ScoreFunctor::desired_when_missed(sm_param_scores, tx_cur, global_epoch);\n  int** sm_buckets_size_ptr = SMM::buckets_size_ptr(smem);\n  auto bucket_size_ptr = sm_buckets_size_ptr[tx_cur];\n  __pipeline_wait_prior(1);\n  S* src = SMM::bucket_scores(smem, groupID, diff_buf(GROUP_SIZE));\n  while (occupy_result_cur == OccupyResult::INITIAL) {\n    int min_pos_local = -1;\n    S min_score_local = static_cast<S>(MAX_SCORE);\n#pragma unroll\n    for (int j = 0; j < BUCKET_SIZE; j += GROUP_SIZE * Load_LEN_S) {\n      S temp_scores[Load_LEN_S];\n      *reinterpret_cast<byte16*>(temp_scores) =\n          *reinterpret_cast<byte16*>(src + rank * Load_LEN_S + j);\n#pragma unroll\n      for (int k = 0; k < Load_LEN_S; k++) {\n        S temp_score = temp_scores[k];\n        if (temp_score < min_score_local) {\n          min_score_local = temp_score;\n          min_pos_local = rank * Load_LEN_S + j + k;\n        }\n      }\n    }\n    const S min_score_global = cg::reduce(g, min_score_local, cg::less<S>());\n    if (score_cur < min_score_global) {\n      if (rank == GROUP_SIZE - 1) {\n        occupy_result = OccupyResult::REFUSED;\n      }\n      occupy_result_cur = g.shfl(occupy_result, GROUP_SIZE - 1);\n      break;\n    }\n    uint32_t vote = g.ballot(min_score_local <= min_score_global);\n    if (vote) {\n      int src_lane = __ffs(vote) - 1;\n      int min_pos_global = g.shfl(min_pos_local, src_lane);\n      if (rank == GROUP_SIZE - 1) {\n        src[min_pos_global] = static_cast<S>(MAX_SCORE);  // Mark visited.\n        auto min_score_key = BUCKET::keys(bucket_keys_ptr, min_pos_global);\n        auto expected_key =\n            min_score_key->load(cuda::std::memory_order_relaxed);\n        if (expected_key != static_cast<K>(LOCKED_KEY) &&\n            
expected_key != static_cast<K>(EMPTY_KEY)) {\n          auto min_score_ptr =\n              BUCKET::scores(bucket_keys_ptr, BUCKET_SIZE, min_pos_global);\n          bool result = min_score_key->compare_exchange_strong(\n              expected_key, static_cast<K>(LOCKED_KEY),\n              cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);\n          if (result) {\n            S* score_ptr =\n                BUCKET::scores(bucket_keys_ptr, BUCKET_SIZE, min_pos_global);\n            auto verify_score_ptr =\n                reinterpret_cast<AtomicScore<S>*>(score_ptr);\n            auto verify_score =\n                verify_score_ptr->load(cuda::std::memory_order_relaxed);\n            if (verify_score <= min_score_global) {\n              if (expected_key == static_cast<K>(RECLAIM_KEY)) {\n                atomicAdd(bucket_size_ptr, 1);\n                occupy_result = OccupyResult::OCCUPIED_RECLAIMED;\n              } else {\n                occupy_result = OccupyResult::EVICT;\n              }\n              key_pos = min_pos_global;\n              ScoreFunctor::update_with_digest(\n                  bucket_keys_ptr, key_pos, sm_param_scores, tx_cur, score_cur,\n                  BUCKET_SIZE, get_digest<K>(key), true);\n            } else {\n              min_score_key->store(expected_key,\n                                   cuda::std::memory_order_release);\n            }\n          }\n        }\n      }\n      occupy_result_cur = g.shfl(occupy_result, GROUP_SIZE - 1);\n    }\n  }\n  // Prefetch values to shared memory.\n  if (occupy_result_cur != OccupyResult::ILLEGAL &&\n      occupy_result_cur != OccupyResult::REFUSED) {\n    VecV* dst = SMM::values_buffer(smem, groupID, diff_buf(GROUP_SIZE), dim);\n    auto kv_idx_cur = g.shfl(kv_idx, GROUP_SIZE - 1);\n    const VecV* src = values + kv_idx_cur * dim;\n    CopyValue::ldg_sts(rank, dst, src, dim);\n  }\n  __pipeline_commit();\n\n  // Step 4: write values to bucket or param buffer.\n  
occupy_result_cur = g.shfl(occupy_result, GROUP_SIZE - 2);\n  if (occupy_result_cur != OccupyResult::ILLEGAL &&\n      occupy_result_cur != OccupyResult::REFUSED) {\n    VecV* src = SMM::values_buffer(smem, groupID, same_buf(GROUP_SIZE), dim);\n    VecV** sm_bucket_values_ptr = SMM::bucket_values_ptr(smem);\n    auto bucket_values_ptr =\n        sm_bucket_values_ptr[groupID * GROUP_SIZE + GROUP_SIZE - 2];\n    auto key_pos_cur = g.shfl(key_pos, GROUP_SIZE - 2);\n    VecV* dst = bucket_values_ptr + key_pos_cur * dim;\n    __pipeline_wait_prior(1);\n    CopyValue::lds_stg(rank, dst, src, dim);\n    if (rank == GROUP_SIZE - 2) {\n      auto key_address = BUCKET::keys(bucket_keys_ptr, key_pos);\n      key_address->store(key, cuda::std::memory_order_release);\n    }\n  }\n\n  // Step 4: write values to bucket or param buffer.\n  occupy_result_cur = g.shfl(occupy_result, GROUP_SIZE - 1);\n  if (occupy_result_cur != OccupyResult::ILLEGAL &&\n      occupy_result_cur != OccupyResult::REFUSED) {\n    VecV* src =\n        SMM::values_buffer(smem, groupID, same_buf(GROUP_SIZE + 1), dim);\n    VecV** sm_bucket_values_ptr = SMM::bucket_values_ptr(smem);\n    auto bucket_values_ptr =\n        sm_bucket_values_ptr[groupID * GROUP_SIZE + GROUP_SIZE - 1];\n    auto key_pos_cur = g.shfl(key_pos, GROUP_SIZE - 1);\n    VecV* dst = bucket_values_ptr + key_pos_cur * dim;\n    __pipeline_wait_prior(0);\n    CopyValue::lds_stg(rank, dst, src, dim);\n    if (rank == GROUP_SIZE - 1) {\n      auto key_address = BUCKET::keys(bucket_keys_ptr, key_pos);\n      key_address->store(key, cuda::std::memory_order_release);\n    }\n  }\n}\n\ntemplate <typename K = uint64_t, typename V = float, typename S = uint64_t>\nstruct Params_Upsert {\n  Params_Upsert(float load_factor_, Bucket<K, V, S>* __restrict__ buckets_,\n                int* buckets_size_, size_t buckets_num_,\n                uint32_t bucket_capacity_, uint32_t dim_,\n                const K* __restrict__ keys_, const V* __restrict__ 
values_,\n                const S* __restrict__ scores_, size_t n_, const S global_epoch_)\n      : load_factor(load_factor_),\n        buckets(buckets_),\n        buckets_size(buckets_size_),\n        buckets_num(buckets_num_),\n        bucket_capacity(bucket_capacity_),\n        dim(dim_),\n        keys(keys_),\n        values(values_),\n        scores(scores_),\n        n(n_),\n        global_epoch(global_epoch_) {}\n  float load_factor;\n  Bucket<K, V, S>* __restrict__ buckets;\n  int* buckets_size;\n  size_t buckets_num;\n  uint32_t bucket_capacity;\n  uint32_t dim;\n  const K* __restrict__ keys;\n  const V* __restrict__ values;\n  const S* __restrict__ scores;\n  uint64_t n;\n  const S global_epoch;\n};\n\ntemplate <typename K, typename V, typename S, typename VecV, int Strategy>\nstruct Launch_TLPv1_Upsert {\n  using Params = Params_Upsert<K, V, S>;\n  inline static void launch_kernel(Params& params, cudaStream_t& stream) {\n    constexpr int BLOCK_SIZE = 128;\n    params.dim = params.dim * sizeof(V) / sizeof(VecV);\n    tlp_v1_upsert_kernel_with_io<K, V, S, VecV, BLOCK_SIZE, Strategy>\n        <<<(params.n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(\n            params.buckets, params.buckets_size, params.buckets_num,\n            params.bucket_capacity, params.dim, params.keys,\n            reinterpret_cast<const VecV*>(params.values), params.scores,\n            params.n, params.global_epoch);\n  }\n};\n\ntemplate <typename K, typename V, typename S, typename VecV, int Strategy>\nstruct Launch_TLPv2_Upsert {\n  using Params = Params_Upsert<K, V, S>;\n  inline static void launch_kernel(Params& params, cudaStream_t& stream) {\n    constexpr int BLOCK_SIZE = 128;\n    const uint32_t value_size = params.dim * sizeof(V);\n    params.dim = value_size / sizeof(VecV);\n\n    if (value_size <= 256) {\n      constexpr int GROUP_SIZE = 8;\n      tlp_v2_upsert_kernel_with_io<K, V, S, VecV, BLOCK_SIZE, GROUP_SIZE,\n                                   
Strategy>\n          <<<(params.n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(\n              params.buckets, params.buckets_size, params.buckets_num,\n              params.bucket_capacity, params.dim, params.keys,\n              reinterpret_cast<const VecV*>(params.values), params.scores,\n              params.n, params.global_epoch);\n    } else {\n      constexpr int GROUP_SIZE = 16;\n      tlp_v2_upsert_kernel_with_io<K, V, S, VecV, BLOCK_SIZE, GROUP_SIZE,\n                                   Strategy>\n          <<<(params.n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(\n              params.buckets, params.buckets_size, params.buckets_num,\n              params.bucket_capacity, params.dim, params.keys,\n              reinterpret_cast<const VecV*>(params.values), params.scores,\n              params.n, params.global_epoch);\n    }\n  }\n};\n\ntemplate <typename K, typename V, typename S, typename VecV, int Strategy>\nstruct Launch_Pipeline_Upsert {\n  using Params = Params_Upsert<K, V, S>;\n  inline static void launch_kernel(Params& params, cudaStream_t& stream) {\n    constexpr int BLOCK_SIZE = 128;\n    constexpr uint32_t GROUP_SIZE = 32;\n    constexpr uint32_t BUCKET_SIZE = 128;\n    using SMM = SharedMemoryManager_Pipeline_Upsert<K, V, S, VecV, BLOCK_SIZE,\n                                                    GROUP_SIZE, BUCKET_SIZE>;\n\n    params.dim = params.dim * sizeof(V) / sizeof(VecV);\n    uint32_t shared_mem = SMM::total_size(params.dim);\n    shared_mem =\n        (shared_mem + sizeof(byte16) - 1) / sizeof(byte16) * sizeof(byte16);\n    pipeline_upsert_kernel_with_io<K, V, S, VecV, BLOCK_SIZE, Strategy>\n        <<<(params.n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, shared_mem,\n           stream>>>(params.buckets, params.buckets_size, params.buckets_num,\n                     params.dim, params.keys,\n                     reinterpret_cast<const VecV*>(params.values),\n                     params.scores, params.n, 
params.global_epoch);\n  }\n};\n\ntemplate <typename ArchTag>\nstruct ValueConfig_Upsert;\n\ntemplate <>\nstruct ValueConfig_Upsert<Sm80> {\n  // Value sizes larger than this bring poor performance for TLPv1.\n  static constexpr uint32_t size_tlp_v1 = 8 * sizeof(byte4);\n  static constexpr uint32_t size_tlp_v2 = 128 * sizeof(byte4);\n};\n\ntemplate <>\nstruct ValueConfig_Upsert<Sm70> {\n  // Value sizes larger than this bring poor performance for TLPv1.\n  static constexpr uint32_t size_tlp_v1 = 8 * sizeof(byte4);\n  static constexpr uint32_t size_tlp_v2 = 128 * sizeof(byte4);\n};\n\ntemplate <typename K, typename V, typename S, int Strategy, typename ArchTag>\nstruct KernelSelector_Upsert {\n  using ValueConfig = ValueConfig_Upsert<ArchTag>;\n  using Params = Params_Upsert<K, V, S>;\n\n  static bool callable(bool unique_key, uint32_t bucket_size, uint32_t dim) {\n    constexpr uint32_t MinBucketCap = sizeof(VecD_Load) / sizeof(D);\n    if (!unique_key || bucket_size < MinBucketCap) return false;\n    uint32_t value_size = dim * sizeof(V);\n#if defined(CUDART_VERSION) && (CUDART_VERSION >= 11030)\n    if (value_size <= ValueConfig::size_tlp_v2) return true;\n#else\n    if (value_size <= ValueConfig::size_tlp_v1) return true;\n#endif\n    return false;\n  }\n\n  static void select_kernel(Params& params, cudaStream_t& stream) {\n    const uint32_t total_value_size =\n        static_cast<uint32_t>(params.dim * sizeof(V));\n\n    auto launch_TLPv1 = [&]() {\n      if (total_value_size % sizeof(byte16) == 0) {\n        using VecV = byte16;\n        Launch_TLPv1_Upsert<K, V, S, VecV, Strategy>::launch_kernel(params,\n                                                                    stream);\n      } else if (total_value_size % sizeof(byte8) == 0) {\n        using VecV = byte8;\n        Launch_TLPv1_Upsert<K, V, S, VecV, Strategy>::launch_kernel(params,\n                                                                    stream);\n      } else if (total_value_size 
% sizeof(byte4) == 0) {\n        using VecV = byte4;\n        Launch_TLPv1_Upsert<K, V, S, VecV, Strategy>::launch_kernel(params,\n                                                                    stream);\n      } else if (total_value_size % sizeof(byte2) == 0) {\n        using VecV = byte2;\n        Launch_TLPv1_Upsert<K, V, S, VecV, Strategy>::launch_kernel(params,\n                                                                    stream);\n      } else {\n        using VecV = byte;\n        Launch_TLPv1_Upsert<K, V, S, VecV, Strategy>::launch_kernel(params,\n                                                                    stream);\n      }\n    };\n\n#if defined(CUDART_VERSION) && (CUDART_VERSION >= 11030)\n    auto launch_TLPv2 = [&]() {\n      if (total_value_size % sizeof(byte16) == 0) {\n        using VecV = byte16;\n        Launch_TLPv2_Upsert<K, V, S, VecV, Strategy>::launch_kernel(params,\n                                                                    stream);\n      } else if (total_value_size % sizeof(byte8) == 0) {\n        using VecV = byte8;\n        Launch_TLPv2_Upsert<K, V, S, VecV, Strategy>::launch_kernel(params,\n                                                                    stream);\n      } else if (total_value_size % sizeof(byte4) == 0) {\n        using VecV = byte4;\n        Launch_TLPv2_Upsert<K, V, S, VecV, Strategy>::launch_kernel(params,\n                                                                    stream);\n      } else if (total_value_size % sizeof(byte2) == 0) {\n        using VecV = byte2;\n        Launch_TLPv2_Upsert<K, V, S, VecV, Strategy>::launch_kernel(params,\n                                                                    stream);\n      } else {\n        using VecV = byte;\n        Launch_TLPv2_Upsert<K, V, S, VecV, Strategy>::launch_kernel(params,\n                                                                    stream);\n      }\n    };\n#endif\n\n    auto launch_Pipeline = [&]() {\n      if 
(total_value_size % sizeof(byte16) == 0) {\n        using VecV = byte16;\n        Launch_Pipeline_Upsert<K, V, S, VecV, Strategy>::launch_kernel(params,\n                                                                       stream);\n      } else if (total_value_size % sizeof(byte8) == 0) {\n        using VecV = byte8;\n        Launch_Pipeline_Upsert<K, V, S, VecV, Strategy>::launch_kernel(params,\n                                                                       stream);\n      } else if (total_value_size % sizeof(byte4) == 0) {\n        using VecV = byte4;\n        Launch_Pipeline_Upsert<K, V, S, VecV, Strategy>::launch_kernel(params,\n                                                                       stream);\n      } else if (total_value_size % sizeof(byte2) == 0) {\n        using VecV = byte2;\n        Launch_Pipeline_Upsert<K, V, S, VecV, Strategy>::launch_kernel(params,\n                                                                       stream);\n      } else {\n        using VecV = byte;\n        Launch_Pipeline_Upsert<K, V, S, VecV, Strategy>::launch_kernel(params,\n                                                                       stream);\n      }\n    };\n\n    // This part is according to the test on A100.\n    if (params.bucket_capacity != 128) {\n      if (total_value_size <= ValueConfig::size_tlp_v1) {\n        launch_TLPv1();\n      } else {\n#if defined(CUDART_VERSION) && (CUDART_VERSION >= 11030)\n        launch_TLPv2();\n#else\n        launch_TLPv1();\n#endif\n      }\n    } else {\n      if (total_value_size <= ValueConfig::size_tlp_v1) {\n        if (params.load_factor <= 0.98f) {\n          launch_TLPv1();\n        } else {\n          launch_Pipeline();\n        }\n      } else {\n        if (params.load_factor <= 0.95f) {\n#if defined(CUDART_VERSION) && (CUDART_VERSION >= 11030)\n          launch_TLPv2();\n#else\n          launch_Pipeline();\n#endif\n        } else {\n          launch_Pipeline();\n        }\n      }\n    
}\n  }  // End function\n};\n\ntemplate <class K, class V, class S, int Strategy, uint32_t TILE_SIZE = 4>\n__global__ void upsert_kernel_with_io_core(\n    const Table<K, V, S>* __restrict table, Bucket<K, V, S>* buckets,\n    const size_t bucket_max_size, const size_t buckets_num, const size_t dim,\n    const K* __restrict keys, const V* __restrict values,\n    const S* __restrict scores, const S global_epoch, size_t N) {\n  auto g = cg::tiled_partition<TILE_SIZE>(cg::this_thread_block());\n  int* buckets_size = table->buckets_size;\n\n  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;\n\n  for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N;\n       t += blockDim.x * gridDim.x) {\n    int key_pos = -1;\n    size_t key_idx = t / TILE_SIZE;\n\n    const K insert_key = keys[key_idx];\n\n    if (IS_RESERVED_KEY<K>(insert_key)) {\n      continue;\n    }\n\n    const S insert_score =\n        ScoreFunctor::desired_when_missed(scores, key_idx, global_epoch);\n    const V* insert_value = values + key_idx * dim;\n\n    size_t bkt_idx = 0;\n    size_t start_idx = 0;\n    int src_lane = -1;\n    K evicted_key;\n\n    Bucket<K, V, S>* bucket = get_key_position<K>(\n        buckets, insert_key, bkt_idx, start_idx, buckets_num, bucket_max_size);\n\n    OccupyResult occupy_result{OccupyResult::INITIAL};\n    const int bucket_size = buckets_size[bkt_idx];\n    do {\n      if (bucket_size < bucket_max_size) {\n        occupy_result = find_and_lock_when_vacant<K, V, S, TILE_SIZE>(\n            g, bucket, insert_key, insert_score, evicted_key, start_idx,\n            key_pos, src_lane, bucket_max_size);\n      } else {\n        start_idx = (start_idx / TILE_SIZE) * TILE_SIZE;\n        occupy_result = find_and_lock_when_full<K, V, S, TILE_SIZE,\n                                                ScoreFunctor::LOCK_MEM_ORDER,\n                                                ScoreFunctor::UNLOCK_MEM_ORDER>(\n            g, bucket, insert_key, insert_score, evicted_key, 
start_idx,\n            key_pos, src_lane, bucket_max_size);\n      }\n\n      occupy_result = g.shfl(occupy_result, src_lane);\n    } while (occupy_result == OccupyResult::CONTINUE);\n\n    if (occupy_result == OccupyResult::REFUSED) {\n      continue;\n    }\n\n    if ((occupy_result == OccupyResult::OCCUPIED_EMPTY ||\n         occupy_result == OccupyResult::OCCUPIED_RECLAIMED) &&\n        g.thread_rank() == src_lane) {\n      atomicAdd(&(buckets_size[bkt_idx]), 1);\n    }\n\n    copy_vector<V, TILE_SIZE>(g, insert_value, bucket->vectors + key_pos * dim,\n                              dim);\n    if (g.thread_rank() == src_lane) {\n      ScoreFunctor::update(bucket, key_pos, scores, key_idx, insert_score,\n                           (occupy_result != OccupyResult::DUPLICATE));\n      bucket->digests(key_pos)[0] = get_digest<K>(insert_key);\n      (bucket->keys(key_pos))\n          ->store(insert_key, ScoreFunctor::UNLOCK_MEM_ORDER);\n    }\n  }\n}\n\ntemplate <typename K, typename V, typename S, int Strategy>\nstruct SelectUpsertKernelWithIO {\n  static void execute_kernel(const float& load_factor, const int& block_size,\n                             const size_t bucket_max_size,\n                             const size_t buckets_num, const size_t dim,\n                             cudaStream_t& stream, const size_t& n,\n                             const Table<K, V, S>* __restrict table,\n                             Bucket<K, V, S>* buckets, const K* __restrict keys,\n                             const V* __restrict values,\n                             const S* __restrict scores, const S global_epoch) {\n    if (load_factor <= 0.5) {\n      const unsigned int tile_size = 4;\n      const size_t N = n * tile_size;\n      const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);\n      upsert_kernel_with_io_core<K, V, S, Strategy, tile_size>\n          <<<grid_size, block_size, 0, stream>>>(\n              table, buckets, bucket_max_size, buckets_num, dim, keys, 
values,\n              scores, global_epoch, N);\n\n    } else if (load_factor <= 0.875) {\n      const unsigned int tile_size = 8;\n      const size_t N = n * tile_size;\n      const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);\n      upsert_kernel_with_io_core<K, V, S, Strategy, tile_size>\n          <<<grid_size, block_size, 0, stream>>>(\n              table, buckets, bucket_max_size, buckets_num, dim, keys, values,\n              scores, global_epoch, N);\n    } else {\n      const unsigned int tile_size = 32;\n      const size_t N = n * tile_size;\n      const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);\n      upsert_kernel_with_io_core<K, V, S, Strategy, tile_size>\n          <<<grid_size, block_size, 0, stream>>>(\n              table, buckets, bucket_max_size, buckets_num, dim, keys, values,\n              scores, global_epoch, N);\n    }\n    return;\n  }\n};\n\n// Use 1 thread to deal with a KV-pair.\ntemplate <typename K = uint64_t, typename V = byte4, typename S = uint64_t,\n          uint32_t BLOCK_SIZE = 128, int Strategy = -1>\n__global__ void upsert_kernel_lock_key_hybrid(\n    Bucket<K, V, S>* __restrict__ buckets, int32_t* __restrict__ buckets_size,\n    const uint64_t buckets_num, uint32_t bucket_capacity, const uint32_t dim,\n    const K* __restrict__ keys, V** __restrict__ value_ptrs,\n    const S* __restrict__ scores, K** __restrict__ key_ptrs,\n    int* __restrict keys_index, uint64_t n, const S global_epoch) {\n  using BUCKET = Bucket<K, V, S>;\n  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;\n\n  // bucket_capacity is a multiple of 4.\n  constexpr uint32_t STRIDE_S = 4;\n  constexpr uint32_t Load_LEN_S = sizeof(byte16) / sizeof(S);\n  __shared__ S sm_bucket_scores[BLOCK_SIZE][2 * STRIDE_S];\n\n  uint32_t tx = threadIdx.x;\n  uint32_t kv_idx = blockIdx.x * blockDim.x + tx;\n  K key{static_cast<K>(EMPTY_KEY)};\n  S score{static_cast<S>(EMPTY_SCORE)};\n  OccupyResult occupy_result{OccupyResult::INITIAL};\n  
VecD_Comp target_digests{0};\n  V* bucket_values_ptr{nullptr};\n  K* bucket_keys_ptr{nullptr};\n  int32_t* bucket_size_ptr{nullptr};\n  uint32_t key_pos = {0};\n  uint32_t bucket_size{0};\n  if (kv_idx < n) {\n    key = keys[kv_idx];\n\n    // help to address the original key after sorting value pointers.\n    if (keys_index) {\n      keys_index[kv_idx] = kv_idx;\n    }\n\n    score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);\n    if (!IS_RESERVED_KEY<K>(key)) {\n      const K hashed_key = Murmur3HashDevice(key);\n      target_digests = digests_from_hashed<K>(hashed_key);\n      uint64_t global_idx =\n          static_cast<uint64_t>(hashed_key % (buckets_num * bucket_capacity));\n      key_pos = get_start_position(global_idx, bucket_capacity);\n      uint64_t bkt_idx = global_idx / bucket_capacity;\n      bucket_size_ptr = buckets_size + bkt_idx;\n      BUCKET* bucket = buckets + bkt_idx;\n      bucket_size = *bucket_size_ptr;\n      bucket_keys_ptr = reinterpret_cast<K*>(bucket->keys(0));\n      bucket_values_ptr = reinterpret_cast<V*>(bucket->vectors);\n    } else {\n      key_ptrs[kv_idx] = nullptr;\n      value_ptrs[kv_idx] = nullptr;\n      return;\n    }\n  } else {\n    return;\n  }\n\n  // Load `STRIDE` digests every time.\n  constexpr uint32_t STRIDE = sizeof(VecD_Load) / sizeof(D);\n  // One more loop to handle empty keys.\n  for (int offset = 0; offset < bucket_capacity + STRIDE; offset += STRIDE) {\n    if (occupy_result != OccupyResult::INITIAL) break;\n\n    uint32_t pos_cur = align_to<STRIDE>(key_pos);\n    pos_cur = (pos_cur + offset) & (bucket_capacity - 1);\n\n    D* digests_ptr = BUCKET::digests(bucket_keys_ptr, bucket_capacity, pos_cur);\n    VecD_Load digests_vec = *(reinterpret_cast<VecD_Load*>(digests_ptr));\n    VecD_Comp digests_arr[4] = {digests_vec.x, digests_vec.y, digests_vec.z,\n                                digests_vec.w};\n\n    for (int i = 0; i < 4; i++) {\n      VecD_Comp probe_digests = digests_arr[i];\n  
    uint32_t possible_pos = 0;\n      bool result = false;\n      // Perform a vectorized comparison by byte,\n      // and if they are equal, set the corresponding byte in the result to\n      // 0xff.\n      int cmp_result = __vcmpeq4(probe_digests, target_digests);\n      cmp_result &= 0x01010101;\n      do {\n        if (cmp_result == 0) break;\n        // CUDA uses little endian,\n        // and the lowest byte in register stores in the lowest address.\n        uint32_t index = (__ffs(cmp_result) - 1) >> 3;\n        cmp_result &= (cmp_result - 1);\n        possible_pos = pos_cur + i * 4 + index;\n        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);\n        K expected_key = key;\n        // Modifications to the bucket will not happen before this instruction.\n        result = current_key->compare_exchange_strong(\n            expected_key, static_cast<K>(LOCKED_KEY),\n            cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);\n      } while (!result);\n      if (result) {\n        occupy_result = OccupyResult::DUPLICATE;\n        key_pos = possible_pos;\n        ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores,\n                                         kv_idx, score, bucket_capacity,\n                                         get_digest<K>(key), false);\n        break;\n      } else if (bucket_size == bucket_capacity) {\n        continue;\n      }\n      VecD_Comp empty_digests_ = empty_digests<K>();\n      cmp_result = __vcmpeq4(probe_digests, empty_digests_);\n      cmp_result &= 0x01010101;\n      do {\n        if (cmp_result == 0) break;\n        uint32_t index = (__ffs(cmp_result) - 1) >> 3;\n        cmp_result &= (cmp_result - 1);\n        possible_pos = pos_cur + i * 4 + index;\n        if (offset == 0 && possible_pos < key_pos) continue;\n        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);\n        K expected_key = static_cast<K>(EMPTY_KEY);\n        result = 
current_key->compare_exchange_strong(\n            expected_key, static_cast<K>(LOCKED_KEY),\n            cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);\n      } while (!result);\n      if (result) {\n        occupy_result = OccupyResult::OCCUPIED_EMPTY;\n        key_pos = possible_pos;\n        ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores,\n                                         kv_idx, score, bucket_capacity,\n                                         get_digest<K>(key), true);\n        atomicAdd(bucket_size_ptr, 1);\n        break;\n      }\n    }\n  }\n\n  while (occupy_result == OccupyResult::INITIAL) {\n    S* bucket_scores_ptr = BUCKET::scores(bucket_keys_ptr, bucket_capacity, 0);\n    S min_score = static_cast<S>(MAX_SCORE);\n    int min_pos = -1;\n#pragma unroll\n    for (int j = 0; j < STRIDE_S; j += Load_LEN_S) {\n      __pipeline_memcpy_async(sm_bucket_scores[tx] + j, bucket_scores_ptr + j,\n                              sizeof(S) * Load_LEN_S);\n    }\n    __pipeline_commit();\n    for (int i = 0; i < bucket_capacity; i += STRIDE_S) {\n      if (i < bucket_capacity - STRIDE_S) {\n#pragma unroll\n        for (int j = 0; j < STRIDE_S; j += Load_LEN_S) {\n          __pipeline_memcpy_async(\n              sm_bucket_scores[tx] + diff_buf(i / STRIDE_S) * STRIDE_S + j,\n              bucket_scores_ptr + i + STRIDE_S + j, sizeof(S) * Load_LEN_S);\n        }\n      }\n      __pipeline_commit();\n      __pipeline_wait_prior(1);\n      S temp_scores[Load_LEN_S];\n      S* src = sm_bucket_scores[tx] + same_buf(i / STRIDE_S) * STRIDE_S;\n#pragma unroll\n      for (int k = 0; k < STRIDE_S; k += Load_LEN_S) {\n        *reinterpret_cast<byte16*>(temp_scores) =\n            *reinterpret_cast<byte16*>(src + k);\n#pragma unroll\n        for (int j = 0; j < Load_LEN_S; j += 1) {\n          S temp_score = temp_scores[j];\n          if (temp_score < min_score) {\n            auto verify_key_ptr = BUCKET::keys(bucket_keys_ptr, i + k 
+ j);\n            auto verify_key =\n                verify_key_ptr->load(cuda::std::memory_order_relaxed);\n            if (verify_key != static_cast<K>(LOCKED_KEY) &&\n                verify_key != static_cast<K>(EMPTY_KEY)) {\n              min_score = temp_score;\n              min_pos = i + k + j;\n            }\n          }\n        }\n      }\n    }\n\n    score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);\n    if (score <= min_score) {\n      occupy_result = OccupyResult::REFUSED;\n      break;\n    }\n    auto min_score_key = BUCKET::keys(bucket_keys_ptr, min_pos);\n    auto expected_key = min_score_key->load(cuda::std::memory_order_relaxed);\n    if (expected_key != static_cast<K>(LOCKED_KEY) &&\n        expected_key != static_cast<K>(EMPTY_KEY)) {\n      bool result = min_score_key->compare_exchange_strong(\n          expected_key, static_cast<K>(LOCKED_KEY),\n          cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);\n      if (result) {\n        S* min_score_ptr =\n            BUCKET::scores(bucket_keys_ptr, bucket_capacity, min_pos);\n        auto verify_score_ptr =\n            reinterpret_cast<AtomicScore<S>*>(min_score_ptr);\n        auto verify_score =\n            verify_score_ptr->load(cuda::std::memory_order_relaxed);\n        if (verify_score <= min_score) {\n          key_pos = min_pos;\n          ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores,\n                                           kv_idx, score, bucket_capacity,\n                                           get_digest<K>(key), true);\n          if (expected_key == static_cast<K>(RECLAIM_KEY)) {\n            occupy_result = OccupyResult::OCCUPIED_RECLAIMED;\n            atomicAdd(bucket_size_ptr, 1);\n          } else {\n            occupy_result = OccupyResult::EVICT;\n          }\n        } else {\n          min_score_key->store(expected_key, cuda::std::memory_order_release);\n        }\n      }\n    }\n  }\n\n  if (kv_idx < n) 
{\n    if (occupy_result == OccupyResult::REFUSED) {\n      value_ptrs[kv_idx] = nullptr;\n      key_ptrs[kv_idx] = nullptr;\n    } else {\n      value_ptrs[kv_idx] = bucket_values_ptr + key_pos * dim;\n      key_ptrs[kv_idx] = bucket_keys_ptr + key_pos;\n    }\n  }\n}\n\ntemplate <class K, class V, class S>\n__global__ void write_kernel_unlock_key(const V* __restrict src,\n                                        V** __restrict dst,\n                                        const int* __restrict src_offset,\n                                        const size_t dim, const K* keys,\n                                        K** __restrict__ key_ptrs,\n                                        const size_t N) {\n  size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x;\n\n  for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) {\n    int vec_index = int(t / dim);\n    int dim_index = t % dim;\n    int real_idx = src_offset ? src_offset[vec_index] : vec_index;\n\n    K* key_ptr = key_ptrs[real_idx];\n    K key = keys[real_idx];\n    V* value_ptr = dst[vec_index];\n\n    if (key_ptr && dim_index == 0) *key_ptr = key;\n\n    if (value_ptr) {\n      value_ptr[dim_index] = src[real_idx * dim + dim_index];\n    }\n  }\n}\n\n/* Upsert with the end-user specified score.\n */\ntemplate <class K, class V, class S, int Strategy, uint32_t TILE_SIZE = 4>\n__global__ void upsert_kernel(const Table<K, V, S>* __restrict table,\n                              Bucket<K, V, S>* buckets,\n                              const size_t bucket_max_size,\n                              const size_t buckets_num, const size_t dim,\n                              const K* __restrict keys, V** __restrict vectors,\n                              const S* __restrict scores,\n                              int* __restrict src_offset, const S global_epoch,\n                              size_t N) {\n  auto g = cg::tiled_partition<TILE_SIZE>(cg::this_thread_block());\n  int* buckets_size = 
table->buckets_size;\n\n  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;\n\n  for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N;\n       t += blockDim.x * gridDim.x) {\n    int key_pos = -1;\n    size_t key_idx = t / TILE_SIZE;\n\n    const K insert_key = keys[key_idx];\n    if (IS_RESERVED_KEY<K>(insert_key)) continue;\n\n    const S insert_score =\n        ScoreFunctor::desired_when_missed(scores, key_idx, global_epoch);\n\n    size_t bkt_idx = 0;\n    size_t start_idx = 0;\n    int src_lane = -1;\n    K evicted_key;\n\n    Bucket<K, V, S>* bucket = get_key_position<K>(\n        buckets, insert_key, bkt_idx, start_idx, buckets_num, bucket_max_size);\n\n    if (src_offset != nullptr && g.thread_rank() == 0) {\n      *(src_offset + key_idx) = key_idx;\n    }\n\n    OccupyResult occupy_result{OccupyResult::INITIAL};\n    const int bucket_size = buckets_size[bkt_idx];\n    do {\n      if (bucket_size < bucket_max_size) {\n        occupy_result = find_and_lock_when_vacant<K, V, S, TILE_SIZE>(\n            g, bucket, insert_key, insert_score, evicted_key, start_idx,\n            key_pos, src_lane, bucket_max_size);\n      } else {\n        start_idx = (start_idx / TILE_SIZE) * TILE_SIZE;\n        occupy_result = find_and_lock_when_vacant<K, V, S, TILE_SIZE>(\n            g, bucket, insert_key, insert_score, evicted_key, start_idx,\n            key_pos, src_lane, bucket_max_size);\n      }\n\n      occupy_result = g.shfl(occupy_result, src_lane);\n    } while (occupy_result == OccupyResult::CONTINUE);\n\n    if (occupy_result == OccupyResult::REFUSED) continue;\n\n    if ((occupy_result == OccupyResult::OCCUPIED_EMPTY ||\n         occupy_result == OccupyResult::OCCUPIED_RECLAIMED) &&\n        g.thread_rank() == src_lane) {\n      atomicAdd(&(buckets_size[bkt_idx]), 1);\n    }\n\n    if (g.thread_rank() == src_lane) {\n      *(vectors + key_idx) = (bucket->vectors + key_pos * dim);\n      ScoreFunctor::update(bucket, key_pos, scores, key_idx, 
insert_score,\n                           (occupy_result != OccupyResult::DUPLICATE));\n      bucket->digests(key_pos)[0] = get_digest<K>(insert_key);\n      (bucket->keys(key_pos))\n          ->store(insert_key, cuda::std::memory_order_relaxed);\n    }\n  }\n}\n\n/* Write the N data from src to each address in *dst,\n   usually called by upsert kernel.\n\n   `src`: A continuous memory pointer with Vector\n          which can be HBM.\n   `dst`: A pointer of pointer to V which should be on HBM,\n          but each value (a pointer of V) could point to a\n          memory on HBM or HMEM.\n   `N`: Number of vectors that need to be written.\n*/\ntemplate <class K, class V, class S>\n__global__ void write_kernel(const V* __restrict src, V** __restrict dst,\n                             const int* __restrict src_offset, const size_t dim,\n                             const size_t N) {\n  size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x;\n\n  for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) {\n    int vec_index = int(t / dim);\n    int dim_index = t % dim;\n\n    if (dst[vec_index] != nullptr) {\n      if (src_offset != nullptr) {\n        dst[vec_index][dim_index] =\n            src[src_offset[vec_index] * dim + dim_index];\n      } else {\n        dst[vec_index][dim_index] = src[vec_index * dim + dim_index];\n      }\n    }\n  }\n}\n\n/* Write the N data from src to each address in *dst by using CPU threads,\n * usually called by upsert kernel.\n *\n * @note: In some machines with AMD CPUs, the `write_kernel` has low performance\n * thru PCI-E, so we try to use the `memcpy` on CPU threads for writing work to\n * reach better performance.\n */\ntemplate <class V>\nvoid write_by_cpu(V** __restrict dst, const V* __restrict src,\n                  const int* __restrict offset, size_t dim, int N,\n                  int n_worker = 16) {\n  std::vector<std::thread> thds;\n  if (n_worker < 1) n_worker = 1;\n\n  auto functor = [dim](V** __restrict dst, const V* 
__restrict src,\n                       const int* __restrict offset, int handled_size,\n                       int trunk_size) -> void {\n    for (int i = handled_size; i < handled_size + trunk_size; i++) {\n      if (dst[i] != nullptr) {\n        memcpy(dst[i], src + offset[i] * dim, sizeof(V) * dim);\n      }\n    }\n  };\n\n  int32_t trunk_size_floor = N / n_worker;\n  int32_t trunk_size_remain = N % n_worker;\n  int32_t n_worker_used = trunk_size_floor == 0 ? trunk_size_remain : n_worker;\n\n  size_t handled_size = 0;\n  for (int i = 0; i < n_worker_used; i++) {\n    int32_t cur_trunk_size = trunk_size_floor;\n    if (trunk_size_remain != 0) {\n      cur_trunk_size += 1;\n      trunk_size_remain--;\n    }\n    thds.push_back(\n        std::thread(functor, dst, src, offset, handled_size, cur_trunk_size));\n    handled_size += cur_trunk_size;\n  }\n\n  for (int i = 0; i < n_worker_used; i++) {\n    thds[i].join();\n  }\n}\n\n}  // namespace merlin\n}  // namespace nv"
  },
  {
    "path": "include/merlin/core_kernels/upsert_and_evict.cuh",
    "content": "/*\n * Copyright (c) 2023, NVIDIA CORPORATION.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n#pragma once\n\n#include \"kernel_utils.cuh\"\n\nnamespace nv {\nnamespace merlin {\n\n// Use 1 thread to deal with a KV-pair, including copying value.\ntemplate <typename K = uint64_t, typename V = byte4, typename S = uint64_t,\n          typename VecV = byte16, uint32_t BLOCK_SIZE = 128, int Strategy = -1>\n__global__ void tlp_v1_upsert_and_evict_kernel_unique(\n    Bucket<K, V, S>* __restrict__ buckets, int32_t* __restrict__ buckets_size,\n    const uint64_t buckets_num, uint32_t bucket_capacity, const uint32_t dim,\n    const K* __restrict__ keys, const VecV* __restrict__ values,\n    const S* __restrict__ scores, K* __restrict__ evicted_keys,\n    VecV* __restrict__ evicted_values, S* __restrict__ evicted_scores,\n    uint64_t n, uint64_t* __restrict__ evicted_counter, const S global_epoch) {\n  using BUCKET = Bucket<K, V, S>;\n  using CopyValue = CopyValueMultipleGroup<VecV, 1>;\n  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;\n\n  // bucket_capacity is a multiple of 4.\n  constexpr uint32_t STRIDE_S = 4;\n  constexpr uint32_t Load_LEN_S = sizeof(byte16) / sizeof(S);\n  __shared__ __align__(sizeof(byte16))\n      S sm_bucket_scores[BLOCK_SIZE][2 * STRIDE_S];\n\n  uint32_t tx = threadIdx.x;\n  uint32_t kv_idx = blockIdx.x * blockDim.x + tx;\n  K key{static_cast<K>(EMPTY_KEY)};\n  S score{static_cast<S>(EMPTY_SCORE)};\n 
 OccupyResult occupy_result{OccupyResult::INITIAL};\n  VecD_Comp target_digests{0};\n  VecV* bucket_values_ptr{nullptr};\n  K* bucket_keys_ptr{nullptr};\n  int32_t* bucket_size_ptr{nullptr};\n  uint32_t key_pos = {0};\n  uint32_t evict_idx{0};\n  uint32_t bucket_size{0};\n  if (kv_idx < n) {\n    key = keys[kv_idx];\n    score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);\n    if (!IS_RESERVED_KEY<K>(key)) {\n      const K hashed_key = Murmur3HashDevice(key);\n      target_digests = digests_from_hashed<K>(hashed_key);\n      uint64_t global_idx =\n          static_cast<uint64_t>(hashed_key % (buckets_num * bucket_capacity));\n      key_pos = get_start_position(global_idx, bucket_capacity);\n      uint64_t bkt_idx = global_idx / bucket_capacity;\n      bucket_size_ptr = buckets_size + bkt_idx;\n      BUCKET* bucket = buckets + bkt_idx;\n      bucket_size = *bucket_size_ptr;\n      bucket_keys_ptr = reinterpret_cast<K*>(bucket->keys(0));\n      bucket_values_ptr = reinterpret_cast<VecV*>(bucket->vectors);\n    } else {\n      return;\n    }\n  } else {\n    return;\n  }\n\n  // Load `STRIDE` digests every time.\n  constexpr uint32_t STRIDE = sizeof(VecD_Load) / sizeof(D);\n  // One more loop to handle empty keys.\n  for (int offset = 0; offset < bucket_capacity + STRIDE; offset += STRIDE) {\n    if (occupy_result != OccupyResult::INITIAL) break;\n\n    uint32_t pos_cur = align_to<STRIDE>(key_pos);\n    pos_cur = (pos_cur + offset) & (bucket_capacity - 1);\n\n    D* digests_ptr = BUCKET::digests(bucket_keys_ptr, bucket_capacity, pos_cur);\n    VecD_Load digests_vec = *(reinterpret_cast<VecD_Load*>(digests_ptr));\n    VecD_Comp digests_arr[4] = {digests_vec.x, digests_vec.y, digests_vec.z,\n                                digests_vec.w};\n\n    for (int i = 0; i < 4; i++) {\n      VecD_Comp probe_digests = digests_arr[i];\n      uint32_t possible_pos = 0;\n      bool result = false;\n      // Perform a vectorized comparison by byte,\n      // and 
if they are equal, set the corresponding byte in the result to\n      // 0xff.\n      int cmp_result = __vcmpeq4(probe_digests, target_digests);\n      cmp_result &= 0x01010101;\n      do {\n        if (cmp_result == 0) break;\n        // CUDA uses little endian,\n        // and the lowest byte in register stores in the lowest address.\n        uint32_t index = (__ffs(cmp_result) - 1) >> 3;\n        cmp_result &= (cmp_result - 1);\n        possible_pos = pos_cur + i * 4 + index;\n        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);\n        K expected_key = key;\n        // Modifications to the bucket will not happen before this instruction.\n        result = current_key->compare_exchange_strong(\n            expected_key, static_cast<K>(LOCKED_KEY),\n            cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);\n      } while (!result);\n      if (result) {\n        occupy_result = OccupyResult::DUPLICATE;\n        key_pos = possible_pos;\n        ScoreFunctor::update_with_digest(\n            bucket_keys_ptr, key_pos, scores, kv_idx, score, bucket_capacity,\n            get_digest<K>(key), (occupy_result != OccupyResult::DUPLICATE));\n        break;\n      } else if (bucket_size == bucket_capacity) {\n        continue;\n      }\n      VecD_Comp empty_digests_ = empty_digests<K>();\n      cmp_result = __vcmpeq4(probe_digests, empty_digests_);\n      cmp_result &= 0x01010101;\n      do {\n        if (cmp_result == 0) break;\n        uint32_t index = (__ffs(cmp_result) - 1) >> 3;\n        cmp_result &= (cmp_result - 1);\n        possible_pos = pos_cur + i * 4 + index;\n        if (offset == 0 && possible_pos < key_pos) continue;\n        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);\n        K expected_key = static_cast<K>(EMPTY_KEY);\n        result = current_key->compare_exchange_strong(\n            expected_key, static_cast<K>(LOCKED_KEY),\n            cuda::std::memory_order_acquire, 
cuda::std::memory_order_relaxed);\n      } while (!result);\n      if (result) {\n        occupy_result = OccupyResult::OCCUPIED_EMPTY;\n        key_pos = possible_pos;\n        ScoreFunctor::update_with_digest(\n            bucket_keys_ptr, key_pos, scores, kv_idx, score, bucket_capacity,\n            get_digest<K>(key), (occupy_result != OccupyResult::DUPLICATE));\n        atomicAdd(bucket_size_ptr, 1);\n        break;\n      }\n    }\n  }\n  while (occupy_result == OccupyResult::INITIAL) {\n    S* bucket_scores_ptr = BUCKET::scores(bucket_keys_ptr, bucket_capacity, 0);\n    S min_score = MAX_SCORE;\n    int min_pos = -1;\n#pragma unroll\n    for (int j = 0; j < STRIDE_S; j += Load_LEN_S) {\n      __pipeline_memcpy_async(sm_bucket_scores[tx] + j, bucket_scores_ptr + j,\n                              sizeof(S) * Load_LEN_S);\n    }\n    __pipeline_commit();\n    for (int i = 0; i < bucket_capacity; i += STRIDE_S) {\n      if (i < bucket_capacity - STRIDE_S) {\n#pragma unroll\n        for (int j = 0; j < STRIDE_S; j += Load_LEN_S) {\n          __pipeline_memcpy_async(\n              sm_bucket_scores[tx] + diff_buf(i / STRIDE_S) * STRIDE_S + j,\n              bucket_scores_ptr + i + STRIDE_S + j, sizeof(S) * Load_LEN_S);\n        }\n      }\n      __pipeline_commit();\n      __pipeline_wait_prior(1);\n      S temp_scores[Load_LEN_S];\n      S* src = sm_bucket_scores[tx] + same_buf(i / STRIDE_S) * STRIDE_S;\n#pragma unroll\n      for (int k = 0; k < STRIDE_S; k += Load_LEN_S) {\n        *reinterpret_cast<byte16*>(temp_scores) =\n            *reinterpret_cast<byte16*>(src + k);\n#pragma unroll\n        for (int j = 0; j < Load_LEN_S; j += 1) {\n          S temp_score = temp_scores[j];\n          if (temp_score < min_score) {\n            min_score = temp_score;\n            min_pos = i + k + j;\n          }\n        }\n      }\n    }\n    score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);\n    if (score < min_score) {\n      occupy_result = 
OccupyResult::REFUSED;\n      evict_idx = atomicAdd(evicted_counter, 1);\n      evict_key_score<K, S>(evicted_keys, evicted_scores, evict_idx, key,\n                            score);\n      break;\n    }\n    auto min_score_key = BUCKET::keys(bucket_keys_ptr, min_pos);\n    auto expected_key = min_score_key->load(cuda::std::memory_order_relaxed);\n    if (expected_key != static_cast<K>(LOCKED_KEY) &&\n        expected_key != static_cast<K>(EMPTY_KEY)) {\n      bool result = min_score_key->compare_exchange_strong(\n          expected_key, static_cast<K>(LOCKED_KEY),\n          cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);\n      if (result) {\n        S* min_score_ptr =\n            BUCKET::scores(bucket_keys_ptr, bucket_capacity, min_pos);\n        auto verify_score_ptr =\n            reinterpret_cast<AtomicScore<S>*>(min_score_ptr);\n        auto verify_score =\n            verify_score_ptr->load(cuda::std::memory_order_relaxed);\n        if (verify_score <= min_score) {\n          key_pos = min_pos;\n          ScoreFunctor::update_with_digest(\n              bucket_keys_ptr, key_pos, scores, kv_idx, score, bucket_capacity,\n              get_digest<K>(key), (occupy_result != OccupyResult::DUPLICATE));\n\n          if (expected_key == static_cast<K>(RECLAIM_KEY)) {\n            occupy_result = OccupyResult::OCCUPIED_RECLAIMED;\n            atomicAdd(bucket_size_ptr, 1);\n          } else {\n            occupy_result = OccupyResult::EVICT;\n            evict_idx = atomicAdd(evicted_counter, 1);\n            evict_key_score<K, S>(evicted_keys, evicted_scores, evict_idx,\n                                  expected_key, min_score);\n          }\n        } else {\n          min_score_key->store(expected_key, cuda::std::memory_order_release);\n        }\n      }\n    }\n  }\n  VecV* bucket_value_ptr = bucket_values_ptr + key_pos * dim;\n  const VecV* param_value_ptr = values + kv_idx * dim;\n  VecV* evicted_value_ptr = evicted_values + evict_idx * 
dim;\n\n  if (occupy_result != OccupyResult::REFUSED) {\n    if (occupy_result == OccupyResult::EVICT) {\n      CopyValue::ldg_stg(0, evicted_value_ptr, bucket_value_ptr, dim);\n    }\n    CopyValue::ldg_stg(0, bucket_value_ptr, param_value_ptr, dim);\n    auto key_address = BUCKET::keys(bucket_keys_ptr, key_pos);\n    // memory_order_release:\n    // Modifications to the bucket will not after this instruction.\n    key_address->store(key, cuda::std::memory_order_release);\n  } else {\n    CopyValue::ldg_stg(0, evicted_value_ptr, param_value_ptr, dim);\n  }\n}\n\n// Use 1 thread to deal with a KV-pair, but use a thread group to copy value.\ntemplate <typename K = uint64_t, typename V = byte4, typename S = uint64_t,\n          typename VecV = byte16, uint32_t BLOCK_SIZE = 128,\n          uint32_t GROUP_SIZE = 16, int Strategy = -1>\n__global__ void tlp_v2_upsert_and_evict_kernel_unique(\n    Bucket<K, V, S>* __restrict__ buckets, int32_t* __restrict__ buckets_size,\n    const uint64_t buckets_num, uint32_t bucket_capacity, const uint32_t dim,\n    const K* __restrict__ keys, const VecV* __restrict__ values,\n    const S* __restrict__ scores, K* __restrict__ evicted_keys,\n    VecV* __restrict__ evicted_values, S* __restrict__ evicted_scores,\n    uint64_t n, uint64_t* __restrict__ evicted_counter, const S global_epoch) {\n  using BUCKET = Bucket<K, V, S>;\n  using CopyValue = CopyValueMultipleGroup<VecV, GROUP_SIZE>;\n  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;\n\n  // bucket_capacity is a multiple of 4.\n  constexpr uint32_t STRIDE_S = 4;\n  constexpr uint32_t Load_LEN_S = sizeof(byte16) / sizeof(S);\n  __shared__ __align__(sizeof(byte16))\n      S sm_bucket_scores[BLOCK_SIZE][2 * STRIDE_S];\n\n  auto g = cg::tiled_partition<GROUP_SIZE>(cg::this_thread_block());\n\n  uint32_t tx = threadIdx.x;\n  uint32_t kv_idx = blockIdx.x * blockDim.x + tx;\n  K key{static_cast<K>(EMPTY_KEY)};\n  S score{static_cast<S>(EMPTY_SCORE)};\n  OccupyResult 
occupy_result{OccupyResult::INITIAL};\n  VecD_Comp target_digests{0};\n  VecV* bucket_values_ptr{nullptr};\n  K* bucket_keys_ptr{nullptr};\n  int32_t* bucket_size_ptr{nullptr};\n  uint32_t key_pos = {0};\n  uint32_t evict_idx{0};\n  uint32_t bucket_size{0};\n  if (kv_idx < n) {\n    key = keys[kv_idx];\n    score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);\n    if (!IS_RESERVED_KEY<K>(key)) {\n      const K hashed_key = Murmur3HashDevice(key);\n      target_digests = digests_from_hashed<K>(hashed_key);\n      uint64_t global_idx =\n          static_cast<uint64_t>(hashed_key % (buckets_num * bucket_capacity));\n      key_pos = get_start_position(global_idx, bucket_capacity);\n      uint64_t bkt_idx = global_idx / bucket_capacity;\n      bucket_size_ptr = buckets_size + bkt_idx;\n      BUCKET* bucket = buckets + bkt_idx;\n      bucket_size = *bucket_size_ptr;\n      bucket_keys_ptr = reinterpret_cast<K*>(bucket->keys(0));\n      bucket_values_ptr = reinterpret_cast<VecV*>(bucket->vectors);\n    } else {\n      occupy_result = OccupyResult::ILLEGAL;\n    }\n  } else {\n    occupy_result = OccupyResult::ILLEGAL;\n  }\n\n  // Load `STRIDE` digests every time.\n  constexpr uint32_t STRIDE = sizeof(VecD_Load) / sizeof(D);\n  // One more loop to handle empty keys.\n  for (int offset = 0; offset < bucket_capacity + STRIDE; offset += STRIDE) {\n    if (occupy_result != OccupyResult::INITIAL) break;\n\n    uint32_t pos_cur = align_to<STRIDE>(key_pos);\n    pos_cur = (pos_cur + offset) & (bucket_capacity - 1);\n\n    D* digests_ptr = BUCKET::digests(bucket_keys_ptr, bucket_capacity, pos_cur);\n    VecD_Load digests_vec = *(reinterpret_cast<VecD_Load*>(digests_ptr));\n    VecD_Comp digests_arr[4] = {digests_vec.x, digests_vec.y, digests_vec.z,\n                                digests_vec.w};\n\n    for (int i = 0; i < 4; i++) {\n      VecD_Comp probe_digests = digests_arr[i];\n      uint32_t possible_pos = 0;\n      bool result = false;\n      // Perform 
a vectorized comparison by byte,\n      // and if they are equal, set the corresponding byte in the result to\n      // 0xff.\n      int cmp_result = __vcmpeq4(probe_digests, target_digests);\n      cmp_result &= 0x01010101;\n      do {\n        if (cmp_result == 0) break;\n        // CUDA uses little endian,\n        // and the lowest byte in register stores in the lowest address.\n        uint32_t index = (__ffs(cmp_result) - 1) >> 3;\n        cmp_result &= (cmp_result - 1);\n        possible_pos = pos_cur + i * 4 + index;\n        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);\n        K expected_key = key;\n        // Modifications to the bucket will not before this instruction.\n        result = current_key->compare_exchange_strong(\n            expected_key, static_cast<K>(LOCKED_KEY),\n            cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);\n      } while (!result);\n      if (result) {\n        occupy_result = OccupyResult::DUPLICATE;\n        key_pos = possible_pos;\n        ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores,\n                                         kv_idx, score, bucket_capacity,\n                                         get_digest<K>(key), false);\n        break;\n      } else if (bucket_size == bucket_capacity) {\n        continue;\n      }\n      VecD_Comp empty_digests_ = empty_digests<K>();\n      cmp_result = __vcmpeq4(probe_digests, empty_digests_);\n      cmp_result &= 0x01010101;\n      do {\n        if (cmp_result == 0) break;\n        uint32_t index = (__ffs(cmp_result) - 1) >> 3;\n        cmp_result &= (cmp_result - 1);\n        possible_pos = pos_cur + i * 4 + index;\n        if (offset == 0 && possible_pos < key_pos) continue;\n        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);\n        K expected_key = static_cast<K>(EMPTY_KEY);\n        result = current_key->compare_exchange_strong(\n            expected_key, static_cast<K>(LOCKED_KEY),\n          
  cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);\n      } while (!result);\n      if (result) {\n        occupy_result = OccupyResult::OCCUPIED_EMPTY;\n        key_pos = possible_pos;\n        ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores,\n                                         kv_idx, score, bucket_capacity,\n                                         get_digest<K>(key), true);\n        atomicAdd(bucket_size_ptr, 1);\n        break;\n      }\n    }\n  }\n  while (occupy_result == OccupyResult::INITIAL) {\n    S* bucket_scores_ptr = BUCKET::scores(bucket_keys_ptr, bucket_capacity, 0);\n    S min_score = static_cast<S>(MAX_SCORE);\n    int min_pos = -1;\n#pragma unroll\n    for (int j = 0; j < STRIDE_S; j += Load_LEN_S) {\n      __pipeline_memcpy_async(sm_bucket_scores[tx] + j, bucket_scores_ptr + j,\n                              sizeof(S) * Load_LEN_S);\n    }\n    __pipeline_commit();\n    for (int i = 0; i < bucket_capacity; i += STRIDE_S) {\n      if (i < bucket_capacity - STRIDE_S) {\n#pragma unroll\n        for (int j = 0; j < STRIDE_S; j += Load_LEN_S) {\n          __pipeline_memcpy_async(\n              sm_bucket_scores[tx] + diff_buf(i / STRIDE_S) * STRIDE_S + j,\n              bucket_scores_ptr + i + STRIDE_S + j, sizeof(S) * Load_LEN_S);\n        }\n      }\n      __pipeline_commit();\n      __pipeline_wait_prior(1);\n      S temp_scores[Load_LEN_S];\n      S* src = sm_bucket_scores[tx] + same_buf(i / STRIDE_S) * STRIDE_S;\n#pragma unroll\n      for (int k = 0; k < STRIDE_S; k += Load_LEN_S) {\n        *reinterpret_cast<byte16*>(temp_scores) =\n            *reinterpret_cast<byte16*>(src + k);\n#pragma unroll\n        for (int j = 0; j < Load_LEN_S; j += 1) {\n          S temp_score = temp_scores[j];\n          if (temp_score < min_score) {\n            auto verify_key_ptr = BUCKET::keys(bucket_keys_ptr, i + k + j);\n            auto verify_key =\n                
verify_key_ptr->load(cuda::std::memory_order_relaxed);\n            if (verify_key != static_cast<K>(LOCKED_KEY) &&\n                verify_key != static_cast<K>(EMPTY_KEY)) {\n              min_score = temp_score;\n              min_pos = i + k + j;\n            }\n          }\n        }\n      }\n    }\n    score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);\n    if (score < min_score) {\n      occupy_result = OccupyResult::REFUSED;\n      evict_idx = atomicAdd(evicted_counter, 1);\n      evict_key_score<K, S>(evicted_keys, evicted_scores, evict_idx, key,\n                            score);\n      break;\n    }\n    auto min_score_key = BUCKET::keys(bucket_keys_ptr, min_pos);\n    auto expected_key = min_score_key->load(cuda::std::memory_order_relaxed);\n    if (expected_key != static_cast<K>(LOCKED_KEY) &&\n        expected_key != static_cast<K>(EMPTY_KEY)) {\n      bool result = min_score_key->compare_exchange_strong(\n          expected_key, static_cast<K>(LOCKED_KEY),\n          cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);\n      if (result) {\n        S* min_score_ptr =\n            BUCKET::scores(bucket_keys_ptr, bucket_capacity, min_pos);\n        auto verify_score_ptr =\n            reinterpret_cast<AtomicScore<S>*>(min_score_ptr);\n        auto verify_score =\n            verify_score_ptr->load(cuda::std::memory_order_relaxed);\n        if (verify_score <= min_score) {\n          key_pos = min_pos;\n          ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores,\n                                           kv_idx, score, bucket_capacity,\n                                           get_digest<K>(key), true);\n          if (expected_key == static_cast<K>(RECLAIM_KEY)) {\n            occupy_result = OccupyResult::OCCUPIED_RECLAIMED;\n            atomicAdd(bucket_size_ptr, 1);\n          } else {\n            occupy_result = OccupyResult::EVICT;\n            evict_idx = atomicAdd(evicted_counter, 1);\n  
          evict_key_score<K, S>(evicted_keys, evicted_scores, evict_idx,\n                                  expected_key, min_score);\n          }\n        } else {\n          min_score_key->store(expected_key, cuda::std::memory_order_release);\n        }\n      }\n    }\n  }\n  VecV* bucket_value_ptr{nullptr};\n  if (occupy_result != OccupyResult::ILLEGAL) {\n    bucket_value_ptr = bucket_values_ptr + key_pos * dim;\n  }\n  uint32_t rank = g.thread_rank();\n  uint32_t groupID = threadIdx.x / GROUP_SIZE;\n\n  // Shared memory reuse:\n  // __shared__ S sm_bucket_scores[BLOCK_SIZE][2 * STRIDE_S];\n  // __shared__ VecV sm_values_buffer[GROUP_NUM][2][GROUP_BUF];\n  // assert(GROUP_BUF >= 2 * dim);\n  constexpr uint32_t GROUP_BUFs =\n      GROUP_SIZE * 2 * STRIDE_S * sizeof(S) / sizeof(VecV);\n  constexpr uint32_t GROUP_BUF = GROUP_BUFs / 2;\n  auto sm_values_buffer =\n      reinterpret_cast<VecV*>(&(sm_bucket_scores[0][0])) + groupID * GROUP_BUFs;\n\n  auto occupy_result_next = g.shfl(occupy_result, 0);\n  if (occupy_result_next != OccupyResult::ILLEGAL) {\n    auto kv_idx_next = g.shfl(kv_idx, 0);\n    const VecV* src = values + kv_idx_next * dim;\n    VecV* dst = sm_values_buffer;\n    CopyValue::ldg_sts(rank, dst, src, dim);\n\n    if (occupy_result_next == OccupyResult::EVICT) {\n      const VecV* src = g.shfl(bucket_value_ptr, 0);\n      dst = dst + dim;\n      CopyValue::ldg_sts(rank, dst, src, dim);\n    }\n  }\n  __pipeline_commit();\n\n  for (int i = 0; i < GROUP_SIZE; i++) {\n    if (i + 1 < GROUP_SIZE) {\n      auto occupy_result_next = g.shfl(occupy_result, i + 1);\n      if (occupy_result_next != OccupyResult::ILLEGAL) {\n        auto kv_idx_next = g.shfl(kv_idx, i + 1);\n        const VecV* src = values + kv_idx_next * dim;\n        VecV* dst = sm_values_buffer + diff_buf(i) * GROUP_BUF;\n        CopyValue::ldg_sts(rank, dst, src, dim);\n\n        if (occupy_result_next == OccupyResult::EVICT) {\n          const VecV* src = g.shfl(bucket_value_ptr, i + 
1);\n          dst = dst + dim;\n          CopyValue::ldg_sts(rank, dst, src, dim);\n        }\n      }\n    }\n    __pipeline_commit();\n    auto occupy_result_cur = g.shfl(occupy_result, i);\n    if (occupy_result_cur != OccupyResult::ILLEGAL) {\n      auto evict_idx_cur = g.shfl(evict_idx, i);\n\n      VecV* src = sm_values_buffer + same_buf(i) * GROUP_BUF;\n      if (occupy_result_cur != OccupyResult::REFUSED) {\n        VecV* dst = g.shfl(bucket_value_ptr, i);\n        __pipeline_wait_prior(1);\n        CopyValue::lds_stg(rank, dst, src, dim);\n        if (rank == i) {\n          auto key_address = BUCKET::keys(bucket_keys_ptr, key_pos);\n          // memory_order_release:\n          // Modifications to the bucket will not after this instruction.\n          key_address->store(key, cuda::std::memory_order_release);\n        }\n        if (occupy_result_cur == OccupyResult::EVICT) {\n          src = src + dim;\n          VecV* dst = evicted_values + evict_idx_cur * dim;\n          CopyValue::lds_stg(rank, dst, src, dim);\n        }\n      } else {\n        VecV* dst = evicted_values + evict_idx_cur * dim;\n        __pipeline_wait_prior(1);\n        CopyValue::lds_stg(rank, dst, src, dim);\n      }\n    }\n  }\n}\n\ntemplate <typename K, typename V, typename S, typename VecV,\n          uint32_t BLOCK_SIZE, uint32_t GROUP_SIZE, uint32_t BUCKET_SIZE,\n          uint32_t GROUP_NUM = BLOCK_SIZE / GROUP_SIZE,\n          uint32_t offset_param_scores = 0,\n          uint32_t offset_bucket_values_ptr =\n              offset_param_scores + sizeof(S) * BLOCK_SIZE,\n          uint32_t offset_buckets_size_ptr =\n              offset_bucket_values_ptr + sizeof(VecV*) * BLOCK_SIZE,\n          uint32_t offset_bucket_digests =\n              offset_buckets_size_ptr + sizeof(int*) * BLOCK_SIZE,\n          uint32_t offset_bucket_scores =\n              offset_bucket_digests + sizeof(D) * GROUP_NUM * 2 * BUCKET_SIZE,\n          uint32_t offset_values_buffer =\n              
offset_bucket_scores + sizeof(S) * GROUP_NUM * 2 * BUCKET_SIZE>\nstruct SharedMemoryManager_Pipeline_UpsertAndEvict {\n  // __shared__ S sm_param_scores[BLOCK_SIZE];\n  // __shared__ VecV* sm_bucket_values_ptr[BLOCK_SIZE];\n  // __shared__ int* sm_buckets_size_ptr[BLOCK_SIZE];\n  // __shared__ D sm_bucket_digests[GROUP_NUM][2][BUCKET_SIZE];\n  // __shared__ S sm_bucket_scores[GROUP_NUM][2][BUCKET_SIZE];\n  // __shared__ VecV sm_values_buffer[GROUP_NUM][2][dim * 2];\n\n  static inline uint32_t total_size(uint32_t dim) {\n    return BLOCK_SIZE * (sizeof(S) + sizeof(VecV*) + sizeof(int*)) +\n           GROUP_NUM * 2 *\n               (BUCKET_SIZE * (sizeof(D) + sizeof(S)) + 2 * dim * sizeof(VecV));\n  }\n  static __forceinline__ __device__ S* param_scores(byte* smem) {\n    return reinterpret_cast<S*>(smem + offset_param_scores);\n  }\n  static __forceinline__ __device__ VecV** bucket_values_ptr(byte* smem) {\n    return reinterpret_cast<VecV**>(smem + offset_bucket_values_ptr);\n  }\n  static __forceinline__ __device__ int** buckets_size_ptr(byte* smem) {\n    return reinterpret_cast<int**>(smem + offset_buckets_size_ptr);\n  }\n  static __forceinline__ __device__ D* bucket_digests(byte* smem,\n                                                      uint32_t groupID,\n                                                      uint32_t buf) {\n    return reinterpret_cast<D*>(smem + offset_bucket_digests) +\n           BUCKET_SIZE * (groupID * 2 + buf);\n  }\n  static __forceinline__ __device__ S* bucket_scores(byte* smem,\n                                                     uint32_t groupID,\n                                                     uint32_t buf) {\n    return reinterpret_cast<S*>(smem + offset_bucket_scores) +\n           BUCKET_SIZE * (groupID * 2 + buf);\n  }\n  static __forceinline__ __device__ VecV* values_buffer(byte* smem,\n                                                        uint32_t groupID,\n                                                        
uint32_t buf,\n                                                        uint32_t dim) {\n    return reinterpret_cast<VecV*>(smem + offset_values_buffer) +\n           2 * dim * (groupID * 2 + buf);\n  }\n};\n\ntemplate <typename K = uint64_t, typename V = byte4, typename S = uint64_t,\n          typename VecV = byte16, uint32_t BLOCK_SIZE = 128, int Strategy = -1>\n__global__ void pipeline_upsert_and_evict_kernel_unique(\n    Bucket<K, V, S>* __restrict__ buckets, int32_t* __restrict__ buckets_size,\n    const uint64_t buckets_num, const uint32_t dim, const K* __restrict__ keys,\n    const VecV* __restrict__ values, const S* __restrict__ scores,\n    K* __restrict__ evicted_keys, VecV* __restrict__ evicted_values,\n    S* __restrict__ evicted_scores, uint64_t n,\n    uint64_t* __restrict__ evicted_counter, const S global_epoch) {\n  // Here, GROUP_SIZE * Comp_LEN = BUCKET_SIZE.\n  constexpr uint32_t BUCKET_SIZE = 128;\n  constexpr uint32_t GROUP_SIZE = 32;\n  constexpr uint32_t Comp_LEN = sizeof(VecD_Comp) / sizeof(D);\n  constexpr uint32_t Load_LEN = sizeof(VecD_Load) / sizeof(D);\n  constexpr uint32_t Load_LEN_S = sizeof(byte16) / sizeof(S);\n\n  using BUCKET = Bucket<K, V, S>;\n  using CopyValue = CopyValueMultipleGroup<VecV, GROUP_SIZE>;\n  using SMM =\n      SharedMemoryManager_Pipeline_UpsertAndEvict<K, V, S, VecV, BLOCK_SIZE,\n                                                  GROUP_SIZE, BUCKET_SIZE>;\n  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;\n\n  extern __shared__ __align__(sizeof(byte16)) byte smem[];\n\n  // Initialization.\n  auto g = cg::tiled_partition<GROUP_SIZE>(cg::this_thread_block());\n  uint32_t tx = threadIdx.x;\n  uint32_t kv_idx = blockIdx.x * blockDim.x + tx;\n  K key{static_cast<K>(EMPTY_KEY)};\n  VecD_Comp target_digests;\n  K* bucket_keys_ptr{nullptr};\n  OccupyResult occupy_result{OccupyResult::INITIAL};\n  uint32_t key_pos = 0;\n  uint32_t evict_idx = 0;\n  if (kv_idx < n) {\n    key = keys[kv_idx];\n    if (scores != 
nullptr) {\n      S* sm_param_scores = SMM::param_scores(smem);\n      __pipeline_memcpy_async(sm_param_scores + tx, scores + kv_idx, sizeof(S));\n    }\n    if (!IS_RESERVED_KEY<K>(key)) {\n      const K hashed_key = Murmur3HashDevice(key);\n      target_digests = digests_from_hashed<K>(hashed_key);\n      uint64_t global_idx =\n          static_cast<uint64_t>(hashed_key % (buckets_num * BUCKET_SIZE));\n      uint64_t bkt_idx = global_idx / BUCKET_SIZE;\n      key_pos = get_start_position(global_idx, BUCKET_SIZE);\n      int** sm_buckets_size_ptr = SMM::buckets_size_ptr(smem);\n      sm_buckets_size_ptr[tx] = buckets_size + bkt_idx;\n      BUCKET* bucket = buckets + bkt_idx;\n      bucket_keys_ptr = reinterpret_cast<K*>(bucket->keys(0));\n      VecV** sm_bucket_values_ptr = SMM::bucket_values_ptr(smem);\n      __pipeline_memcpy_async(sm_bucket_values_ptr + tx, &(bucket->vectors),\n                              sizeof(VecV*));\n    } else {\n      occupy_result = OccupyResult::ILLEGAL;\n    }\n  } else {\n    occupy_result = OccupyResult::ILLEGAL;\n  }\n\n  uint32_t rank = g.thread_rank();\n  uint32_t groupID = threadIdx.x / GROUP_SIZE;\n\n  // Pipeline loading.\n  auto occupy_result_next = g.shfl(occupy_result, 0);\n  auto keys_ptr_next = g.shfl(bucket_keys_ptr, 0);\n  if (occupy_result_next == OccupyResult::INITIAL) {\n    D* sm_bucket_digests = SMM::bucket_digests(smem, groupID, 0);\n    D* dst = sm_bucket_digests + rank * Load_LEN;\n    D* src = BUCKET::digests(keys_ptr_next, BUCKET_SIZE, rank * Load_LEN);\n    if (rank * Load_LEN < BUCKET_SIZE) {\n      __pipeline_memcpy_async(dst, src, sizeof(VecD_Load));\n    }\n  }\n  __pipeline_commit();\n  // Padding, meet the param of the first `__pipeline_wait_prior`\n  // in the first loop.\n  __pipeline_commit();\n  __pipeline_commit();\n  for (int32_t i = 0; i < GROUP_SIZE; i++) {\n    // Step1: load digests from global memory to shared memory.\n    if (i + 1 < GROUP_SIZE) {\n      auto occupy_result_next = 
g.shfl(occupy_result, i + 1);\n      auto keys_ptr_next = g.shfl(bucket_keys_ptr, i + 1);\n      if (occupy_result_next == OccupyResult::INITIAL) {\n        D* sm_bucket_digests = SMM::bucket_digests(smem, groupID, diff_buf(i));\n        D* dst = sm_bucket_digests + rank * Load_LEN;\n        D* src = BUCKET::digests(keys_ptr_next, BUCKET_SIZE, rank * Load_LEN);\n        if (rank * Load_LEN < BUCKET_SIZE) {\n          __pipeline_memcpy_async(dst, src, sizeof(VecD_Load));\n        }\n      }\n    }\n    __pipeline_commit();\n    // Step2: to lock the target_key or empty_key by querying digests.\n    auto occupy_result_cur = g.shfl(occupy_result, i);\n    if (occupy_result_cur == OccupyResult::INITIAL) {\n      uint32_t tx_cur = groupID * GROUP_SIZE + i;\n      int** sm_buckets_size_ptr = SMM::buckets_size_ptr(smem);\n      auto bucket_size_ptr = sm_buckets_size_ptr[tx_cur];\n      K key_cur = g.shfl(key, i);\n      auto target_digests_cur = g.shfl(target_digests, i);\n      auto start_pos_cur = g.shfl(key_pos, i);\n      auto keys_ptr_cur = g.shfl(bucket_keys_ptr, i);\n      auto bucket_size_cur = bucket_size_ptr[0];\n      __pipeline_wait_prior(3);\n      D* src = SMM::bucket_digests(smem, groupID, same_buf(i));\n      uint32_t start_offset = start_pos_cur / Comp_LEN;\n      uint32_t probe_offset =\n          Comp_LEN * ((start_offset + rank) & (GROUP_SIZE - 1));\n      VecD_Comp probe_digests =\n          *reinterpret_cast<VecD_Comp*>(src + probe_offset);\n      uint32_t cmp_result = __vcmpeq4(probe_digests, target_digests_cur);\n      cmp_result &= 0x01010101;\n      uint32_t possible_pos = 0;\n      bool result = false;\n      do {\n        if (cmp_result == 0) break;\n        int32_t index = (__ffs(cmp_result) - 1) >> 3;\n        cmp_result &= (cmp_result - 1);\n        possible_pos = probe_offset + index;\n        auto current_key = BUCKET::keys(keys_ptr_cur, possible_pos);\n        K expected_key = key_cur;\n        result = 
current_key->compare_exchange_strong(\n            expected_key, static_cast<K>(LOCKED_KEY),\n            cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);\n      } while (!result);\n      uint32_t found_vote = g.ballot(result);\n      if (found_vote) {\n        int32_t src_lane = __ffs(found_vote) - 1;\n        possible_pos = g.shfl(possible_pos, src_lane);\n        if (rank == i) {\n          occupy_result = OccupyResult::DUPLICATE;\n          S* sm_param_scores = SMM::param_scores(smem);\n          key_pos = possible_pos;\n          S score = ScoreFunctor::desired_when_missed(sm_param_scores, tx,\n                                                      global_epoch);\n          ScoreFunctor::update_with_digest(\n              bucket_keys_ptr, key_pos, sm_param_scores, tx, score, BUCKET_SIZE,\n              get_digest<K>(key), false);\n        }\n      } else if (bucket_size_cur < BUCKET_SIZE) {\n        VecD_Comp empty_digests_ = empty_digests<K>();\n        cmp_result = __vcmpeq4(probe_digests, empty_digests_);\n        cmp_result &= 0x01010101;\n        for (int32_t offset = 0; offset < GROUP_SIZE; offset += 1) {\n          if (rank == offset) {\n            do {\n              if (cmp_result == 0) break;\n              int32_t index = (__ffs(cmp_result) - 1) >> 3;\n              cmp_result &= (cmp_result - 1);\n              possible_pos = probe_offset + index;\n              if (offset == 0 && possible_pos < start_pos_cur) continue;\n              auto current_key = BUCKET::keys(keys_ptr_cur, possible_pos);\n              K expected_key = static_cast<K>(EMPTY_KEY);\n              result = current_key->compare_exchange_strong(\n                  expected_key, static_cast<K>(LOCKED_KEY),\n                  cuda::std::memory_order_acquire,\n                  cuda::std::memory_order_relaxed);\n            } while (!result);\n          }\n          uint32_t found_vote = g.ballot(result);\n          if (found_vote) {\n            int32_t src_lane = 
__ffs(found_vote) - 1;\n            possible_pos = g.shfl(possible_pos, src_lane);\n            if (rank == i) {\n              occupy_result = OccupyResult::OCCUPIED_EMPTY;\n              S* sm_param_scores = SMM::param_scores(smem);\n              S score = ScoreFunctor::desired_when_missed(sm_param_scores, tx,\n                                                          global_epoch);\n              int** sm_buckets_size_ptr = SMM::buckets_size_ptr(smem);\n              int* bucket_size_ptr = sm_buckets_size_ptr[tx];\n              key_pos = possible_pos;\n              ScoreFunctor::update_with_digest(\n                  bucket_keys_ptr, key_pos, sm_param_scores, tx, score,\n                  BUCKET_SIZE, get_digest<K>(key), true);\n              atomicAdd(bucket_size_ptr, 1);\n            }\n            break;\n          }\n        }\n      }\n      occupy_result_cur = g.shfl(occupy_result, i);\n      if (occupy_result_cur == OccupyResult::INITIAL) {\n        S* sm_bucket_scores = SMM::bucket_scores(smem, groupID, same_buf(i));\n        S* dst = sm_bucket_scores + rank * Load_LEN_S;\n        S* src = BUCKET::scores(keys_ptr_cur, BUCKET_SIZE, rank * Load_LEN_S);\n#pragma unroll\n        for (int32_t k = 0; k < BUCKET_SIZE; k += GROUP_SIZE * Load_LEN_S) {\n          __pipeline_memcpy_async(dst + k, src + k, sizeof(S) * Load_LEN_S);\n        }\n      }\n    }\n    __pipeline_commit();\n    // Step 3: reduce to get the key with the minimum score.\n    if (i > 0) {\n      occupy_result_cur = g.shfl(occupy_result, i - 1);\n      uint32_t tx_cur = groupID * GROUP_SIZE + i - 1;\n      S* sm_param_scores = SMM::param_scores(smem);\n      S score_cur = ScoreFunctor::desired_when_missed(sm_param_scores, tx_cur,\n                                                      global_epoch);\n      int** sm_buckets_size_ptr = SMM::buckets_size_ptr(smem);\n      auto bucket_size_ptr = sm_buckets_size_ptr[tx_cur];\n      __pipeline_wait_prior(3);\n      S* src = SMM::bucket_scores(smem, 
groupID, diff_buf(i));\n      while (occupy_result_cur == OccupyResult::INITIAL) {\n        int min_pos_local = -1;\n        S min_score_local = MAX_SCORE;\n#pragma unroll\n        for (int j = 0; j < BUCKET_SIZE; j += GROUP_SIZE * Load_LEN_S) {\n          S temp_scores[Load_LEN_S];\n          *reinterpret_cast<byte16*>(temp_scores) =\n              *reinterpret_cast<byte16*>(src + rank * Load_LEN_S + j);\n#pragma unroll\n          for (int k = 0; k < Load_LEN_S; k++) {\n            S temp_score = temp_scores[k];\n            if (temp_score < min_score_local) {\n              min_score_local = temp_score;\n              min_pos_local = rank * Load_LEN_S + j + k;\n            }\n          }\n        }\n        const S min_score_global =\n            cg::reduce(g, min_score_local, cg::less<S>());\n        if (score_cur < min_score_global) {\n          if (rank == i - 1) {\n            occupy_result = OccupyResult::REFUSED;\n            evict_idx = atomicAdd(evicted_counter, 1);\n            evict_key_score<K, S>(evicted_keys, evicted_scores, evict_idx, key,\n                                  score_cur);\n          }\n          occupy_result_cur = g.shfl(occupy_result, i - 1);\n          break;\n        }\n        uint32_t vote = g.ballot(min_score_local <= min_score_global);\n        if (vote) {\n          int src_lane = __ffs(vote) - 1;\n          int min_pos_global = g.shfl(min_pos_local, src_lane);\n          if (rank == i - 1) {\n            src[min_pos_global] = static_cast<S>(MAX_SCORE);  // Mark visited.\n            auto min_score_key = BUCKET::keys(bucket_keys_ptr, min_pos_global);\n            auto expected_key =\n                min_score_key->load(cuda::std::memory_order_relaxed);\n            if (expected_key != static_cast<K>(LOCKED_KEY) &&\n                expected_key != static_cast<K>(EMPTY_KEY)) {\n              bool result = min_score_key->compare_exchange_strong(\n                  expected_key, static_cast<K>(LOCKED_KEY),\n                  
cuda::std::memory_order_acquire,\n                  cuda::std::memory_order_relaxed);\n              if (result) {\n                S* score_ptr = BUCKET::scores(bucket_keys_ptr, BUCKET_SIZE,\n                                              min_pos_global);\n                auto verify_score_ptr =\n                    reinterpret_cast<AtomicScore<S>*>(score_ptr);\n                auto verify_score =\n                    verify_score_ptr->load(cuda::std::memory_order_relaxed);\n                if (verify_score <= min_score_global) {\n                  if (expected_key == static_cast<K>(RECLAIM_KEY)) {\n                    occupy_result = OccupyResult::OCCUPIED_RECLAIMED;\n                    atomicAdd(bucket_size_ptr, 1);\n                  } else {\n                    occupy_result = OccupyResult::EVICT;\n                    evict_idx = atomicAdd(evicted_counter, 1);\n                    evict_key_score<K, S>(evicted_keys, evicted_scores,\n                                          evict_idx, expected_key,\n                                          min_score_global);\n                  }\n                  key_pos = min_pos_global;\n                  ScoreFunctor::update_with_digest(\n                      bucket_keys_ptr, key_pos, sm_param_scores, tx_cur,\n                      score_cur, BUCKET_SIZE, get_digest<K>(key), true);\n\n                } else {\n                  min_score_key->store(expected_key,\n                                       cuda::std::memory_order_release);\n                }\n              }\n            }\n          }\n          occupy_result_cur = g.shfl(occupy_result, i - 1);\n        }\n      }\n      // Prefetch values to shared memory.\n      if (occupy_result_cur != OccupyResult::ILLEGAL) {\n        auto kv_idx_cur = g.shfl(kv_idx, i - 1);\n        const VecV* src = values + kv_idx_cur * dim;\n        VecV* dst = SMM::values_buffer(smem, groupID, diff_buf(i), dim);\n        CopyValue::ldg_sts(rank, dst, src, dim);\n\n        if 
(occupy_result_cur == OccupyResult::EVICT) {\n          VecV** sm_bucket_values_ptr = SMM::bucket_values_ptr(smem);\n          auto bucket_values_ptr =\n              sm_bucket_values_ptr[groupID * GROUP_SIZE + i - 1];\n          auto key_pos_cur = g.shfl(key_pos, i - 1);\n          const VecV* src = bucket_values_ptr + key_pos_cur * dim;\n          dst = dst + dim;\n          CopyValue::ldg_sts(rank, dst, src, dim);\n        }\n      }\n    }\n    __pipeline_commit();\n\n    // Step 4: write values to bucket and evicted buffer.\n    if (i > 1) {\n      occupy_result_cur = g.shfl(occupy_result, i - 2);\n      if (occupy_result_cur != OccupyResult::ILLEGAL) {\n        VecV** sm_bucket_values_ptr = SMM::bucket_values_ptr(smem);\n        auto bucket_values_ptr =\n            sm_bucket_values_ptr[groupID * GROUP_SIZE + i - 2];\n        auto key_pos_cur = g.shfl(key_pos, i - 2);\n        auto evict_idx_cur = g.shfl(evict_idx, i - 2);\n\n        VecV* src = SMM::values_buffer(smem, groupID, same_buf(i), dim);\n        if (occupy_result_cur == OccupyResult::REFUSED) {\n          VecV* dst = evicted_values + evict_idx_cur * dim;\n          __pipeline_wait_prior(3);\n          CopyValue::lds_stg(rank, dst, src, dim);\n        } else {\n          VecV* dst = bucket_values_ptr + key_pos_cur * dim;\n          __pipeline_wait_prior(3);\n          CopyValue::lds_stg(rank, dst, src, dim);\n          if (rank == i - 2) {\n            auto key_address = BUCKET::keys(bucket_keys_ptr, key_pos);\n            key_address->store(key, cuda::std::memory_order_release);\n          }\n          if (occupy_result_cur == OccupyResult::EVICT) {\n            src = src + dim;\n            VecV* dst = evicted_values + evict_idx_cur * dim;\n            __pipeline_wait_prior(3);\n            CopyValue::lds_stg(rank, dst, src, dim);\n          }\n        }\n      }\n    }\n  }\n  auto occupy_result_cur = g.shfl(occupy_result, GROUP_SIZE - 1);\n  uint32_t tx_cur = groupID * GROUP_SIZE + GROUP_SIZE - 
1;\n  S* sm_param_scores = SMM::param_scores(smem);\n  S score_cur =\n      ScoreFunctor::desired_when_missed(sm_param_scores, tx_cur, global_epoch);\n  int** sm_buckets_size_ptr = SMM::buckets_size_ptr(smem);\n  auto bucket_size_ptr = sm_buckets_size_ptr[tx_cur];\n  __pipeline_wait_prior(1);\n  S* src = SMM::bucket_scores(smem, groupID, diff_buf(GROUP_SIZE));\n  while (occupy_result_cur == OccupyResult::INITIAL) {\n    int min_pos_local = -1;\n    S min_score_local = MAX_SCORE;\n#pragma unroll\n    for (int j = 0; j < BUCKET_SIZE; j += GROUP_SIZE * Load_LEN_S) {\n      S temp_scores[Load_LEN_S];\n      *reinterpret_cast<byte16*>(temp_scores) =\n          *reinterpret_cast<byte16*>(src + rank * Load_LEN_S + j);\n#pragma unroll\n      for (int k = 0; k < Load_LEN_S; k++) {\n        S temp_score = temp_scores[k];\n        if (temp_score < min_score_local) {\n          min_score_local = temp_score;\n          min_pos_local = rank * Load_LEN_S + j + k;\n        }\n      }\n    }\n    const S min_score_global = cg::reduce(g, min_score_local, cg::less<S>());\n    if (score_cur < min_score_global) {\n      if (rank == GROUP_SIZE - 1) {\n        occupy_result = OccupyResult::REFUSED;\n        evict_idx = atomicAdd(evicted_counter, 1);\n        evict_key_score<K, S>(evicted_keys, evicted_scores, evict_idx, key,\n                              score_cur);\n      }\n      occupy_result_cur = g.shfl(occupy_result, GROUP_SIZE - 1);\n      break;\n    }\n    uint32_t vote = g.ballot(min_score_local <= min_score_global);\n    if (vote) {\n      int src_lane = __ffs(vote) - 1;\n      int min_pos_global = g.shfl(min_pos_local, src_lane);\n      if (rank == GROUP_SIZE - 1) {\n        src[min_pos_global] = MAX_SCORE;  // Mark visited.\n        auto min_score_key = BUCKET::keys(bucket_keys_ptr, min_pos_global);\n        auto expected_key =\n            min_score_key->load(cuda::std::memory_order_acquire);\n        if (expected_key != static_cast<K>(LOCKED_KEY) &&\n            
expected_key != static_cast<K>(EMPTY_KEY)) {\n          auto min_score_ptr =\n              BUCKET::scores(bucket_keys_ptr, BUCKET_SIZE, min_pos_global);\n          bool result = min_score_key->compare_exchange_strong(\n              expected_key, static_cast<K>(LOCKED_KEY),\n              cuda::std::memory_order_acquire, cuda::std::memory_order_acquire);\n          if (result) {\n            S* score_ptr =\n                BUCKET::scores(bucket_keys_ptr, BUCKET_SIZE, min_pos_global);\n            auto verify_score_ptr =\n                reinterpret_cast<AtomicScore<S>*>(score_ptr);\n            auto verify_score =\n                verify_score_ptr->load(cuda::std::memory_order_relaxed);\n            if (verify_score <= min_score_global) {\n              if (expected_key == static_cast<K>(RECLAIM_KEY)) {\n                atomicAdd(bucket_size_ptr, 1);\n                occupy_result = OccupyResult::OCCUPIED_RECLAIMED;\n              } else {\n                occupy_result = OccupyResult::EVICT;\n                evict_idx = atomicAdd(evicted_counter, 1);\n                evict_key_score<K, S>(evicted_keys, evicted_scores, evict_idx,\n                                      expected_key, min_score_global);\n              }\n              key_pos = min_pos_global;\n              ScoreFunctor::update_with_digest(\n                  bucket_keys_ptr, key_pos, sm_param_scores, tx_cur, score_cur,\n                  BUCKET_SIZE, get_digest<K>(key), true);\n            } else {\n              min_score_key->store(expected_key,\n                                   cuda::std::memory_order_release);\n            }\n          }\n        }\n      }\n      occupy_result_cur = g.shfl(occupy_result, GROUP_SIZE - 1);\n    }\n  }\n  if (occupy_result_cur != OccupyResult::ILLEGAL) {\n    auto kv_idx_cur = g.shfl(kv_idx, GROUP_SIZE - 1);\n    VecV** sm_bucket_values_ptr = SMM::bucket_values_ptr(smem);\n    auto bucket_values_ptr =\n        sm_bucket_values_ptr[groupID * GROUP_SIZE + 
GROUP_SIZE - 1];\n    auto key_pos_cur = g.shfl(key_pos, GROUP_SIZE - 1);\n\n    const VecV* src = values + kv_idx_cur * dim;\n    VecV* dst = SMM::values_buffer(smem, groupID, diff_buf(GROUP_SIZE), dim);\n    CopyValue::ldg_sts(rank, dst, src, dim);\n\n    if (occupy_result_cur == OccupyResult::EVICT) {\n      const VecV* src = bucket_values_ptr + key_pos_cur * dim;\n      dst = dst + dim;\n      CopyValue::ldg_sts(rank, dst, src, dim);\n    }\n  }\n  __pipeline_commit();\n\n  occupy_result_cur = g.shfl(occupy_result, GROUP_SIZE - 2);\n  if (occupy_result_cur != OccupyResult::ILLEGAL) {\n    VecV** sm_bucket_values_ptr = SMM::bucket_values_ptr(smem);\n    auto bucket_values_ptr =\n        sm_bucket_values_ptr[groupID * GROUP_SIZE + GROUP_SIZE - 2];\n    auto key_pos_cur = g.shfl(key_pos, GROUP_SIZE - 2);\n    auto evict_idx_cur = g.shfl(evict_idx, GROUP_SIZE - 2);\n\n    VecV* src = SMM::values_buffer(smem, groupID, same_buf(GROUP_SIZE), dim);\n    if (occupy_result_cur == OccupyResult::REFUSED) {\n      VecV* dst = evicted_values + evict_idx_cur * dim;\n      __pipeline_wait_prior(1);\n      CopyValue::lds_stg(rank, dst, src, dim);\n    } else {\n      VecV* dst = bucket_values_ptr + key_pos_cur * dim;\n      __pipeline_wait_prior(1);\n      CopyValue::lds_stg(rank, dst, src, dim);\n      if (rank == GROUP_SIZE - 2) {\n        auto key_address = BUCKET::keys(bucket_keys_ptr, key_pos);\n        key_address->store(key, cuda::std::memory_order_release);\n      }\n      if (occupy_result_cur == OccupyResult::EVICT) {\n        src = src + dim;\n        VecV* dst = evicted_values + evict_idx_cur * dim;\n        __pipeline_wait_prior(1);\n        CopyValue::lds_stg(rank, dst, src, dim);\n      }\n    }\n  }\n\n  occupy_result_cur = g.shfl(occupy_result, GROUP_SIZE - 1);\n  if (occupy_result_cur != OccupyResult::ILLEGAL) {\n    VecV** sm_bucket_values_ptr = SMM::bucket_values_ptr(smem);\n    auto bucket_values_ptr =\n        sm_bucket_values_ptr[groupID * GROUP_SIZE + 
GROUP_SIZE - 1];\n    auto evict_idx_cur = g.shfl(evict_idx, GROUP_SIZE - 1);\n    auto key_pos_cur = g.shfl(key_pos, GROUP_SIZE - 1);\n\n    VecV* src =\n        SMM::values_buffer(smem, groupID, same_buf(GROUP_SIZE + 1), dim);\n    if (occupy_result_cur == OccupyResult::REFUSED) {\n      VecV* dst = evicted_values + evict_idx_cur * dim;\n      __pipeline_wait_prior(0);\n      CopyValue::lds_stg(rank, dst, src, dim);\n    } else {\n      VecV* dst = bucket_values_ptr + key_pos_cur * dim;\n      __pipeline_wait_prior(0);\n      CopyValue::lds_stg(rank, dst, src, dim);\n      if (rank == GROUP_SIZE - 1) {\n        auto key_address = BUCKET::keys(bucket_keys_ptr, key_pos);\n        key_address->store(key, cuda::std::memory_order_release);\n      }\n      if (occupy_result_cur == OccupyResult::EVICT) {\n        src = src + dim;\n        VecV* dst = evicted_values + evict_idx_cur * dim;\n        __pipeline_wait_prior(0);\n        CopyValue::lds_stg(rank, dst, src, dim);\n      }\n    }\n  }\n}\n\ntemplate <typename K = uint64_t, typename V = float, typename S = uint64_t>\nstruct Params_UpsertAndEvict {\n  Params_UpsertAndEvict(\n      float load_factor_, Bucket<K, V, S>* __restrict__ buckets_,\n      int* buckets_size_, size_t buckets_num_, uint32_t bucket_capacity_,\n      uint32_t dim_, const K* __restrict__ keys_, const V* __restrict__ values_,\n      const S* __restrict__ scores_, K* __restrict__ evicted_keys_,\n      V* __restrict__ evicted_values_, S* __restrict__ evicted_scores_,\n      size_t n_, size_t* evicted_counter_, const S global_epoch_)\n      : load_factor(load_factor_),\n        buckets(buckets_),\n        buckets_size(buckets_size_),\n        buckets_num(buckets_num_),\n        bucket_capacity(bucket_capacity_),\n        dim(dim_),\n        keys(keys_),\n        values(values_),\n        scores(scores_),\n        evicted_keys(evicted_keys_),\n        evicted_values(evicted_values_),\n        evicted_scores(evicted_scores_),\n        n(n_),\n        
evicted_counter(evicted_counter_),\n        global_epoch(global_epoch_) {}\n  float load_factor;\n  Bucket<K, V, S>* __restrict__ buckets;\n  int* buckets_size;\n  size_t buckets_num;\n  uint32_t bucket_capacity;\n  uint32_t dim;\n  const K* __restrict__ keys;\n  const V* __restrict__ values;\n  const S* __restrict__ scores;\n  K* __restrict__ evicted_keys;\n  V* __restrict__ evicted_values;\n  S* __restrict__ evicted_scores;\n  uint64_t n;\n  uint64_t* evicted_counter;\n  const S global_epoch;\n};\n\n// Use 1 thread to deal with a KV-pair, but use a threads group to copy value.\ntemplate <typename K = uint64_t, typename V = byte4, typename S = uint64_t,\n          typename VecV = byte16, uint32_t BLOCK_SIZE = 128,\n          uint32_t GROUP_SIZE = 32, int Strategy = -1>\n__global__ void insert_and_evict_kernel_with_filter(\n    Bucket<K, V, S>* __restrict__ buckets, int32_t* __restrict__ buckets_size,\n    const uint64_t buckets_num, uint32_t bucket_capacity, const uint32_t dim,\n    const K* __restrict__ keys, const VecV* __restrict__ values,\n    const S* __restrict__ scores, K* __restrict__ evicted_keys,\n    VecV* __restrict__ evicted_values, S* __restrict__ evicted_scores,\n    uint64_t n, uint64_t* __restrict__ evicted_counter, const S global_epoch) {\n  using BUCKET = Bucket<K, V, S>;\n  using CopyValue = CopyValueMultipleGroup<VecV, GROUP_SIZE>;\n  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;\n\n  // bucket_capacity is a multiple of 4.\n  constexpr uint32_t STRIDE_S = 4;\n  constexpr uint32_t Load_LEN_S = sizeof(byte16) / sizeof(S);\n  __shared__ __align__(sizeof(byte16))\n      S sm_bucket_scores[BLOCK_SIZE][2 * STRIDE_S];\n\n  auto g = cg::tiled_partition<GROUP_SIZE>(cg::this_thread_block());\n\n  uint32_t tx = threadIdx.x;\n  uint32_t kv_idx = blockIdx.x * blockDim.x + tx;\n  K key{static_cast<K>(EMPTY_KEY)};\n  S score{static_cast<S>(EMPTY_SCORE)};\n  OccupyResult occupy_result{OccupyResult::INITIAL};\n  VecD_Comp target_digests{0};\n  VecV* 
bucket_values_ptr{nullptr};\n  K* bucket_keys_ptr{nullptr};\n  int32_t* bucket_size_ptr{nullptr};\n  uint32_t key_pos = {0};\n  uint32_t evict_idx{0};\n  uint32_t bucket_size{0};\n  if (kv_idx < n) {\n    key = keys[kv_idx];\n    score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);\n    if (!IS_RESERVED_KEY<K>(key)) {\n      const K hashed_key = Murmur3HashDevice(key);\n      target_digests = digests_from_hashed<K>(hashed_key);\n      uint64_t global_idx =\n          static_cast<uint64_t>(hashed_key % (buckets_num * bucket_capacity));\n      key_pos = get_start_position(global_idx, bucket_capacity);\n      uint64_t bkt_idx = global_idx / bucket_capacity;\n      bucket_size_ptr = buckets_size + bkt_idx;\n      BUCKET* bucket = buckets + bkt_idx;\n      bucket_size = *bucket_size_ptr;\n      bucket_keys_ptr = reinterpret_cast<K*>(bucket->keys(0));\n      bucket_values_ptr = reinterpret_cast<VecV*>(bucket->vectors);\n    } else {\n      occupy_result = OccupyResult::ILLEGAL;\n    }\n  } else {\n    occupy_result = OccupyResult::ILLEGAL;\n  }\n\n  // Load `STRIDE` digests every time.\n  constexpr uint32_t STRIDE = sizeof(VecD_Load) / sizeof(D);\n  // One more loop to handle empty keys.\n  for (int offset = 0; offset < bucket_capacity + STRIDE; offset += STRIDE) {\n    if (occupy_result != OccupyResult::INITIAL) break;\n\n    uint32_t pos_cur = align_to<STRIDE>(key_pos);\n    pos_cur = (pos_cur + offset) & (bucket_capacity - 1);\n\n    D* digests_ptr = BUCKET::digests(bucket_keys_ptr, bucket_capacity, pos_cur);\n    VecD_Load digests_vec = *(reinterpret_cast<VecD_Load*>(digests_ptr));\n    VecD_Comp digests_arr[4] = {digests_vec.x, digests_vec.y, digests_vec.z,\n                                digests_vec.w};\n\n    for (int i = 0; i < 4; i++) {\n      VecD_Comp probe_digests = digests_arr[i];\n      uint32_t possible_pos = 0;\n      bool result = false;\n      // Perform a vectorized comparison by byte,\n      // and if they are equal, set the 
corresponding byte in the result to\n      // 0xff.\n      int cmp_result = __vcmpeq4(probe_digests, target_digests);\n      cmp_result &= 0x01010101;\n      do {\n        if (cmp_result == 0) break;\n        // CUDA uses little endian,\n        // and the lowest byte in register stores in the lowest address.\n        uint32_t index = (__ffs(cmp_result) - 1) >> 3;\n        cmp_result &= (cmp_result - 1);\n        possible_pos = pos_cur + i * 4 + index;\n        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);\n        K expected_key = key;\n        // Modifications to the bucket will not be reordered before this instruction.\n        result = current_key->compare_exchange_strong(\n            expected_key, static_cast<K>(LOCKED_KEY),\n            cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);\n      } while (!result);\n      if (result) {\n        occupy_result = OccupyResult::DUPLICATE;\n        key_pos = possible_pos;\n        ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores,\n                                         kv_idx, score, bucket_capacity,\n                                         get_digest<K>(key), false);\n        break;\n      } else if (bucket_size == bucket_capacity) {\n        continue;\n      }\n      VecD_Comp empty_digests_ = empty_digests<K>();\n      cmp_result = __vcmpeq4(probe_digests, empty_digests_);\n      cmp_result &= 0x01010101;\n      do {\n        if (cmp_result == 0) break;\n        uint32_t index = (__ffs(cmp_result) - 1) >> 3;\n        cmp_result &= (cmp_result - 1);\n        possible_pos = pos_cur + i * 4 + index;\n        if (offset == 0 && possible_pos < key_pos) continue;\n        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);\n        K expected_key = static_cast<K>(EMPTY_KEY);\n        result = current_key->compare_exchange_strong(\n            expected_key, static_cast<K>(LOCKED_KEY),\n            cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);\n    
  } while (!result);\n      if (result) {\n        occupy_result = OccupyResult::OCCUPIED_EMPTY;\n        key_pos = possible_pos;\n        ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores,\n                                         kv_idx, score, bucket_capacity,\n                                         get_digest<K>(key), true);\n        atomicAdd(bucket_size_ptr, 1);\n        break;\n      }\n    }\n  }\n  while (occupy_result == OccupyResult::INITIAL) {\n    S* bucket_scores_ptr = BUCKET::scores(bucket_keys_ptr, bucket_capacity, 0);\n    S min_score = static_cast<S>(MAX_SCORE);\n    int min_pos = -1;\n#pragma unroll\n    for (int j = 0; j < STRIDE_S; j += Load_LEN_S) {\n      __pipeline_memcpy_async(sm_bucket_scores[tx] + j, bucket_scores_ptr + j,\n                              sizeof(S) * Load_LEN_S);\n    }\n    __pipeline_commit();\n    for (int i = 0; i < bucket_capacity; i += STRIDE_S) {\n      if (i < bucket_capacity - STRIDE_S) {\n#pragma unroll\n        for (int j = 0; j < STRIDE_S; j += Load_LEN_S) {\n          __pipeline_memcpy_async(\n              sm_bucket_scores[tx] + diff_buf(i / STRIDE_S) * STRIDE_S + j,\n              bucket_scores_ptr + i + STRIDE_S + j, sizeof(S) * Load_LEN_S);\n        }\n      }\n      __pipeline_commit();\n      __pipeline_wait_prior(1);\n      S temp_scores[Load_LEN_S];\n      S* src = sm_bucket_scores[tx] + same_buf(i / STRIDE_S) * STRIDE_S;\n#pragma unroll\n      for (int k = 0; k < STRIDE_S; k += Load_LEN_S) {\n        *reinterpret_cast<byte16*>(temp_scores) =\n            *reinterpret_cast<byte16*>(src + k);\n#pragma unroll\n        for (int j = 0; j < Load_LEN_S; j += 1) {\n          S temp_score = temp_scores[j];\n          if (temp_score < min_score) {\n            auto verify_key_ptr = BUCKET::keys(bucket_keys_ptr, i + k + j);\n            auto verify_key =\n                verify_key_ptr->load(cuda::std::memory_order_relaxed);\n            if (verify_key != static_cast<K>(LOCKED_KEY) &&\n          
      verify_key != static_cast<K>(EMPTY_KEY)) {\n              min_score = temp_score;\n              min_pos = i + k + j;\n            }\n          }\n        }\n      }\n    }\n    score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);\n    if (score < min_score) {\n      occupy_result = OccupyResult::REFUSED;\n      evict_idx = atomicAdd(evicted_counter, 1);\n      evict_key_score<K, S>(evicted_keys, evicted_scores, evict_idx, key,\n                            score);\n      break;\n    }\n    auto min_score_key = BUCKET::keys(bucket_keys_ptr, min_pos);\n    auto expected_key = min_score_key->load(cuda::std::memory_order_relaxed);\n    if (expected_key != static_cast<K>(LOCKED_KEY) &&\n        expected_key != static_cast<K>(EMPTY_KEY)) {\n      bool result = min_score_key->compare_exchange_strong(\n          expected_key, static_cast<K>(LOCKED_KEY),\n          cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);\n      if (result) {\n        S* min_score_ptr =\n            BUCKET::scores(bucket_keys_ptr, bucket_capacity, min_pos);\n        auto verify_score_ptr =\n            reinterpret_cast<AtomicScore<S>*>(min_score_ptr);\n        auto verify_score =\n            verify_score_ptr->load(cuda::std::memory_order_relaxed);\n        if (verify_score <= min_score) {\n          key_pos = min_pos;\n          ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores,\n                                           kv_idx, score, bucket_capacity,\n                                           get_digest<K>(key), true);\n          if (expected_key == static_cast<K>(RECLAIM_KEY)) {\n            occupy_result = OccupyResult::OCCUPIED_RECLAIMED;\n            atomicAdd(bucket_size_ptr, 1);\n          } else {\n            occupy_result = OccupyResult::EVICT;\n            evict_idx = atomicAdd(evicted_counter, 1);\n            evict_key_score<K, S>(evicted_keys, evicted_scores, evict_idx,\n                                  expected_key, 
min_score);\n          }\n        } else {\n          min_score_key->store(expected_key, cuda::std::memory_order_release);\n        }\n      }\n    }\n  }\n  VecV* bucket_value_ptr{nullptr};\n  if (occupy_result != OccupyResult::ILLEGAL) {\n    bucket_value_ptr = bucket_values_ptr + key_pos * dim;\n  }\n  uint32_t rank = g.thread_rank();\n\n  for (int i = 0; i < GROUP_SIZE; i++) {\n    auto occupy_result_cur = g.shfl(occupy_result, i);\n    if (occupy_result_cur == OccupyResult::ILLEGAL) {\n      continue;\n    }\n    auto kv_idx_cur = kv_idx / GROUP_SIZE * GROUP_SIZE + i;\n    VecV const* input_buffer = values + kv_idx_cur * dim;\n    auto evict_idx_cur = g.shfl(evict_idx, i);\n    VecV* evict_buffer = evicted_values + evict_idx_cur * dim;\n    VecV* table_buffer = g.shfl(bucket_value_ptr, i);\n    if (occupy_result_cur == OccupyResult::EVICT) {\n      for (int j = rank; j < dim; j += GROUP_SIZE) {\n        evict_buffer[j] = table_buffer[j];\n      }\n    }\n    if (occupy_result_cur == OccupyResult::REFUSED) {\n      for (int j = rank; j < dim; j += GROUP_SIZE) {\n        evict_buffer[j] = input_buffer[j];\n      }\n    } else {\n      for (int j = rank; j < dim; j += GROUP_SIZE) {\n        table_buffer[j] = input_buffer[j];\n      }\n      if (rank == i) {\n        auto key_address = BUCKET::keys(bucket_keys_ptr, key_pos);\n        // memory_order_release:\n        // Modifications to the bucket will not be reordered after this instruction.\n        key_address->store(key, cuda::std::memory_order_release);\n      }\n    }\n  }\n}\n\ntemplate <typename K, typename V, typename S, typename VecV, int Strategy>\nstruct LaunchInsertAndEvictKernel {\n  using Params = Params_UpsertAndEvict<K, V, S>;\n  inline static void launch(Params& params, cudaStream_t& stream) {\n    constexpr int BLOCK_SIZE = 128;\n    params.dim = params.dim * sizeof(V) / sizeof(VecV);\n    constexpr int GROUP_SIZE = 32;\n    insert_and_evict_kernel_with_filter<K, V, S, VecV, BLOCK_SIZE, GROUP_SIZE,\n         
                               Strategy>\n        <<<(params.n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(\n            params.buckets, params.buckets_size, params.buckets_num,\n            params.bucket_capacity, params.dim, params.keys,\n            reinterpret_cast<const VecV*>(params.values), params.scores,\n            params.evicted_keys, reinterpret_cast<VecV*>(params.evicted_values),\n            params.evicted_scores, params.n, params.evicted_counter,\n            params.global_epoch);\n  }\n};\n\ntemplate <typename K, typename V, typename S, int Strategy>\nstruct InsertAndEvictKernelLauncher {\n  using Params = Params_UpsertAndEvict<K, V, S>;\n  static void launch_kernel(Params& params, cudaStream_t& stream) {\n    const uint32_t total_value_size =\n        static_cast<uint32_t>(params.dim * sizeof(V));\n    if (total_value_size % sizeof(byte16) == 0) {\n      using VecV = byte16;\n      LaunchInsertAndEvictKernel<K, V, S, VecV, Strategy>::launch(params,\n                                                                  stream);\n    } else if (total_value_size % sizeof(byte8) == 0) {\n      using VecV = byte8;\n      LaunchInsertAndEvictKernel<K, V, S, VecV, Strategy>::launch(params,\n                                                                  stream);\n    } else {\n      using VecV = V;\n      LaunchInsertAndEvictKernel<K, V, S, VecV, Strategy>::launch(params,\n                                                                  stream);\n    }\n  }  // End function\n};\n\ntemplate <typename K, typename V, typename S, typename VecV, int Strategy>\nstruct Launch_TLPv1_UpsertAndEvict {\n  using Params = Params_UpsertAndEvict<K, V, S>;\n  inline static void launch_kernel(Params& params, cudaStream_t& stream) {\n    constexpr int BLOCK_SIZE = 128;\n    params.dim = params.dim * sizeof(V) / sizeof(VecV);\n    tlp_v1_upsert_and_evict_kernel_unique<K, V, S, VecV, BLOCK_SIZE, Strategy>\n        <<<(params.n + BLOCK_SIZE - 1) / BLOCK_SIZE, 
BLOCK_SIZE, 0, stream>>>(\n            params.buckets, params.buckets_size, params.buckets_num,\n            params.bucket_capacity, params.dim, params.keys,\n            reinterpret_cast<const VecV*>(params.values), params.scores,\n            params.evicted_keys, reinterpret_cast<VecV*>(params.evicted_values),\n            params.evicted_scores, params.n, params.evicted_counter,\n            params.global_epoch);\n  }\n};\n\ntemplate <typename K, typename V, typename S, typename VecV, int Strategy>\nstruct Launch_TLPv2_UpsertAndEvict {\n  using Params = Params_UpsertAndEvict<K, V, S>;\n  inline static void launch_kernel(Params& params, cudaStream_t& stream) {\n    constexpr int BLOCK_SIZE = 128;\n    params.dim = params.dim * sizeof(V) / sizeof(VecV);\n    if (params.dim <= 8) {\n      constexpr int GROUP_SIZE = 8;\n      tlp_v2_upsert_and_evict_kernel_unique<K, V, S, VecV, BLOCK_SIZE,\n                                            GROUP_SIZE, Strategy>\n          <<<(params.n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(\n              params.buckets, params.buckets_size, params.buckets_num,\n              params.bucket_capacity, params.dim, params.keys,\n              reinterpret_cast<const VecV*>(params.values), params.scores,\n              params.evicted_keys,\n              reinterpret_cast<VecV*>(params.evicted_values),\n              params.evicted_scores, params.n, params.evicted_counter,\n              params.global_epoch);\n    } else {\n      constexpr int GROUP_SIZE = 16;\n      tlp_v2_upsert_and_evict_kernel_unique<K, V, S, VecV, BLOCK_SIZE,\n                                            GROUP_SIZE, Strategy>\n          <<<(params.n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(\n              params.buckets, params.buckets_size, params.buckets_num,\n              params.bucket_capacity, params.dim, params.keys,\n              reinterpret_cast<const VecV*>(params.values), params.scores,\n              params.evicted_keys,\n        
      reinterpret_cast<VecV*>(params.evicted_values),\n              params.evicted_scores, params.n, params.evicted_counter,\n              params.global_epoch);\n    }\n  }\n};\n\ntemplate <typename K, typename V, typename S, typename VecV, int Strategy>\nstruct Launch_Pipeline_UpsertAndEvict {\n  using Params = Params_UpsertAndEvict<K, V, S>;\n  inline static void launch_kernel(Params& params, cudaStream_t& stream) {\n    constexpr int BLOCK_SIZE = 128;\n    constexpr uint32_t GROUP_SIZE = 32;\n    constexpr uint32_t BUCKET_SIZE = 128;\n    using SMM =\n        SharedMemoryManager_Pipeline_UpsertAndEvict<K, V, S, VecV, BLOCK_SIZE,\n                                                    GROUP_SIZE, BUCKET_SIZE>;\n\n    params.dim = params.dim * sizeof(V) / sizeof(VecV);\n    uint32_t shared_mem = SMM::total_size(params.dim);\n    shared_mem =\n        (shared_mem + sizeof(byte16) - 1) / sizeof(byte16) * sizeof(byte16);\n    pipeline_upsert_and_evict_kernel_unique<K, V, S, VecV, BLOCK_SIZE, Strategy>\n        <<<(params.n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, shared_mem,\n           stream>>>(params.buckets, params.buckets_size, params.buckets_num,\n                     params.dim, params.keys,\n                     reinterpret_cast<const VecV*>(params.values),\n                     params.scores, params.evicted_keys,\n                     reinterpret_cast<VecV*>(params.evicted_values),\n                     params.evicted_scores, params.n, params.evicted_counter,\n                     params.global_epoch);\n  }\n};\n\ntemplate <typename ArchTag>\nstruct ValueConfig_UpsertAndEvict;\n\n/// TODO: support more arch\ntemplate <>\nstruct ValueConfig_UpsertAndEvict<Sm80> {\n  // Value size greater than it will bring poor performance for TLPv1.\n  static constexpr uint32_t size_tlp_v1 = 16 * sizeof(byte4);\n  // Value size greater than it will bring wrong result for TLPv2.\n  static constexpr uint32_t size_tlp_v2 = 64 * sizeof(byte4);\n  // Value size greater than it 
will reduce the occupancy for Pipeline.\n  // When the value is very high, the kernel will fail to launch.\n  static constexpr uint32_t size_pipeline = 128 * sizeof(byte4);\n};\n\ntemplate <>\nstruct ValueConfig_UpsertAndEvict<Sm70> {\n  // Value size greater than it will bring poor performance for TLPv1.\n  static constexpr uint32_t size_tlp_v1 = 16 * sizeof(byte4);\n  // Value size greater than it will bring wrong result for TLPv2.\n  static constexpr uint32_t size_tlp_v2 = 32 * sizeof(byte4);\n  // Value size greater than it will reduce the occupancy for Pipeline.\n  // When the value is very high, the kernel will fail to launch.\n  static constexpr uint32_t size_pipeline = 64 * sizeof(byte4);\n};\n\ntemplate <typename K, typename V, typename S, int Strategy, typename ArchTag>\nstruct KernelSelector_UpsertAndEvict {\n  using ValueConfig = ValueConfig_UpsertAndEvict<ArchTag>;\n  using Params = Params_UpsertAndEvict<K, V, S>;\n\n  static bool callable(bool unique_key, uint32_t bucket_size, uint32_t dim) {\n    constexpr uint32_t MinBucketCap = sizeof(VecD_Load) / sizeof(D);\n    if (!unique_key || bucket_size < MinBucketCap) return false;\n    uint32_t value_size = dim * sizeof(V);\n#if defined(CUDART_VERSION) && (CUDART_VERSION >= 11030)\n    if (value_size <= ValueConfig::size_tlp_v2) return true;\n#else\n    if (value_size <= ValueConfig::size_tlp_v1) return true;\n#endif\n    if (bucket_size == 128 && value_size <= ValueConfig::size_pipeline) {\n      return true;\n    }\n    return false;\n  }\n\n  static void select_kernel(Params& params, cudaStream_t& stream) {\n    const uint32_t total_value_size =\n        static_cast<uint32_t>(params.dim * sizeof(V));\n\n    auto launch_TLPv1 = [&]() {\n      if (total_value_size % sizeof(byte16) == 0) {\n        using VecV = byte16;\n        Launch_TLPv1_UpsertAndEvict<K, V, S, VecV, Strategy>::launch_kernel(\n            params, stream);\n      } else if (total_value_size % sizeof(byte8) == 0) {\n        using VecV = 
byte8;\n        Launch_TLPv1_UpsertAndEvict<K, V, S, VecV, Strategy>::launch_kernel(\n            params, stream);\n      } else if (total_value_size % sizeof(byte4) == 0) {\n        using VecV = byte4;\n        Launch_TLPv1_UpsertAndEvict<K, V, S, VecV, Strategy>::launch_kernel(\n            params, stream);\n      } else if (total_value_size % sizeof(byte2) == 0) {\n        using VecV = byte2;\n        Launch_TLPv1_UpsertAndEvict<K, V, S, VecV, Strategy>::launch_kernel(\n            params, stream);\n      } else {\n        using VecV = byte;\n        Launch_TLPv1_UpsertAndEvict<K, V, S, VecV, Strategy>::launch_kernel(\n            params, stream);\n      }\n    };\n\n#if defined(CUDART_VERSION) && (CUDART_VERSION >= 11030)\n    auto launch_TLPv2 = [&]() {\n      if (total_value_size % sizeof(byte16) == 0) {\n        using VecV = byte16;\n        Launch_TLPv2_UpsertAndEvict<K, V, S, VecV, Strategy>::launch_kernel(\n            params, stream);\n      } else if (total_value_size % sizeof(byte8) == 0) {\n        using VecV = byte8;\n        Launch_TLPv2_UpsertAndEvict<K, V, S, VecV, Strategy>::launch_kernel(\n            params, stream);\n      } else if (total_value_size % sizeof(byte4) == 0) {\n        using VecV = byte4;\n        Launch_TLPv2_UpsertAndEvict<K, V, S, VecV, Strategy>::launch_kernel(\n            params, stream);\n      } else if (total_value_size % sizeof(byte2) == 0) {\n        using VecV = byte2;\n        Launch_TLPv2_UpsertAndEvict<K, V, S, VecV, Strategy>::launch_kernel(\n            params, stream);\n      } else {\n        using VecV = byte;\n        Launch_TLPv2_UpsertAndEvict<K, V, S, VecV, Strategy>::launch_kernel(\n            params, stream);\n      }\n    };\n#endif\n\n    auto launch_Pipeline = [&]() {\n      if (total_value_size % sizeof(byte16) == 0) {\n        using VecV = byte16;\n        Launch_Pipeline_UpsertAndEvict<K, V, S, VecV, Strategy>::launch_kernel(\n            params, stream);\n      } else if (total_value_size % 
sizeof(byte8) == 0) {\n        using VecV = byte8;\n        Launch_Pipeline_UpsertAndEvict<K, V, S, VecV, Strategy>::launch_kernel(\n            params, stream);\n      } else if (total_value_size % sizeof(byte4) == 0) {\n        using VecV = byte4;\n        Launch_Pipeline_UpsertAndEvict<K, V, S, VecV, Strategy>::launch_kernel(\n            params, stream);\n      } else if (total_value_size % sizeof(byte2) == 0) {\n        using VecV = byte2;\n        Launch_Pipeline_UpsertAndEvict<K, V, S, VecV, Strategy>::launch_kernel(\n            params, stream);\n      } else {\n        using VecV = byte;\n        Launch_Pipeline_UpsertAndEvict<K, V, S, VecV, Strategy>::launch_kernel(\n            params, stream);\n      }\n    };\n\n    // This part is according to the test on A100.\n    if (params.bucket_capacity != 128) {\n      if (total_value_size <= ValueConfig::size_tlp_v1) {\n        launch_TLPv1();\n      } else {\n#if defined(CUDART_VERSION) && (CUDART_VERSION >= 11030)\n        launch_TLPv2();\n#else\n        launch_TLPv1();\n#endif\n      }\n    } else {\n      if (total_value_size <= ValueConfig::size_tlp_v1) {\n        if (params.load_factor <= 0.90f) {\n          launch_TLPv1();\n        } else {\n          launch_Pipeline();\n        }\n      } else if (total_value_size <= ValueConfig::size_tlp_v2) {\n        if (params.load_factor <= 0.85f) {\n#if defined(CUDART_VERSION) && (CUDART_VERSION >= 11030)\n          launch_TLPv2();\n#else\n          launch_Pipeline();\n#endif\n        } else {\n          launch_Pipeline();\n        }\n      } else {\n        launch_Pipeline();\n      }\n    }\n  }  // End function\n};\n\ntemplate <class K, class V, class S, int Strategy, uint32_t TILE_SIZE = 4>\n__global__ void upsert_and_evict_kernel_with_io_core(\n    const Table<K, V, S>* __restrict table, Bucket<K, V, S>* buckets,\n    const size_t bucket_max_size, const size_t buckets_num, const size_t dim,\n    const K* __restrict keys, const V* __restrict values,\n    
const S* __restrict scores, K* __restrict evicted_keys,\n    V* __restrict evicted_values, S* __restrict evicted_scores,\n    const S global_epoch, size_t N) {\n  auto g = cg::tiled_partition<TILE_SIZE>(cg::this_thread_block());\n  int* buckets_size = table->buckets_size;\n\n  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;\n\n  for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N;\n       t += blockDim.x * gridDim.x) {\n    int key_pos = -1;\n    const size_t key_idx = t / TILE_SIZE;\n\n    const K insert_key = keys[key_idx];\n\n    if (IS_RESERVED_KEY<K>(insert_key)) continue;\n\n    const S insert_score =\n        ScoreFunctor::desired_when_missed(scores, key_idx, global_epoch);\n    const V* insert_value = values + key_idx * dim;\n\n    size_t bkt_idx = 0;\n    size_t start_idx = 0;\n    int src_lane = -1;\n    K evicted_key;\n\n    Bucket<K, V, S>* bucket = get_key_position<K>(\n        buckets, insert_key, bkt_idx, start_idx, buckets_num, bucket_max_size);\n\n    OccupyResult occupy_result{OccupyResult::INITIAL};\n    const int bucket_size = buckets_size[bkt_idx];\n    do {\n      if (bucket_size < bucket_max_size) {\n        occupy_result = find_and_lock_when_vacant<K, V, S, TILE_SIZE>(\n            g, bucket, insert_key, insert_score, evicted_key, start_idx,\n            key_pos, src_lane, bucket_max_size);\n      } else {\n        start_idx = (start_idx / TILE_SIZE) * TILE_SIZE;\n        occupy_result = find_and_lock_when_full<K, V, S, TILE_SIZE,\n                                                ScoreFunctor::LOCK_MEM_ORDER,\n                                                ScoreFunctor::UNLOCK_MEM_ORDER>(\n            g, bucket, insert_key, insert_score, evicted_key, start_idx,\n            key_pos, src_lane, bucket_max_size);\n      }\n      occupy_result = g.shfl(occupy_result, src_lane);\n    } while (occupy_result == OccupyResult::CONTINUE);\n\n    if (occupy_result == OccupyResult::REFUSED) {\n      if (g.thread_rank() == 0) {\n        
evicted_keys[key_idx] = insert_key;\n        evicted_scores[key_idx] = insert_score;\n      }\n      copy_vector<V, TILE_SIZE>(g, insert_value, evicted_values + key_idx * dim,\n                                dim);\n      continue;\n    }\n\n    if ((occupy_result == OccupyResult::OCCUPIED_EMPTY ||\n         occupy_result == OccupyResult::OCCUPIED_RECLAIMED) &&\n        g.thread_rank() == src_lane) {\n      atomicAdd(&(buckets_size[bkt_idx]), 1);\n    }\n\n    if (occupy_result == OccupyResult::EVICT) {\n      if (g.thread_rank() == src_lane) {\n        evicted_keys[key_idx] = evicted_key;\n        if (scores != nullptr) {\n          evicted_scores[key_idx] = scores[key_idx];\n        }\n      }\n      copy_vector<V, TILE_SIZE>(g, bucket->vectors + key_pos * dim,\n                                evicted_values + key_idx * dim, dim);\n    }\n\n    copy_vector<V, TILE_SIZE>(g, insert_value, bucket->vectors + key_pos * dim,\n                              dim);\n    if (g.thread_rank() == src_lane) {\n      ScoreFunctor::update(bucket, key_pos, scores, key_idx, insert_score,\n                           (occupy_result != OccupyResult::DUPLICATE));\n      bucket->digests(key_pos)[0] = get_digest<K>(insert_key);\n      (bucket->keys(key_pos))\n          ->store(insert_key, ScoreFunctor::UNLOCK_MEM_ORDER);\n    }\n  }\n}\n\ntemplate <typename K, typename V, typename S, int Strategy>\nstruct SelectUpsertAndEvictKernelWithIO {\n  static void execute_kernel(\n      const float& load_factor, const int& block_size,\n      const size_t bucket_max_size, const size_t buckets_num, const size_t dim,\n      cudaStream_t& stream, const size_t& n,\n      const Table<K, V, S>* __restrict table, Bucket<K, V, S>* buckets,\n      const K* __restrict keys, const V* __restrict values,\n      const S* __restrict scores, K* __restrict evicted_keys,\n      V* __restrict evicted_values, S* __restrict evicted_scores,\n      const S global_epoch) {\n    if (load_factor <= 0.5) {\n      const 
unsigned int tile_size = 4;\n      const size_t N = n * tile_size;\n      const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);\n      upsert_and_evict_kernel_with_io_core<K, V, S, Strategy, tile_size>\n          <<<grid_size, block_size, 0, stream>>>(\n              table, buckets, bucket_max_size, buckets_num, dim, keys, values,\n              scores, evicted_keys, evicted_values, evicted_scores,\n              global_epoch, N);\n\n    } else if (load_factor <= 0.875) {\n      const unsigned int tile_size = 8;\n      const size_t N = n * tile_size;\n      const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);\n\n      upsert_and_evict_kernel_with_io_core<K, V, S, Strategy, tile_size>\n          <<<grid_size, block_size, 0, stream>>>(\n              table, buckets, bucket_max_size, buckets_num, dim, keys, values,\n              scores, evicted_keys, evicted_values, evicted_scores,\n              global_epoch, N);\n\n    } else {\n      const unsigned int tile_size = 32;\n      const size_t N = n * tile_size;\n      const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);\n      upsert_and_evict_kernel_with_io_core<K, V, S, Strategy, tile_size>\n          <<<grid_size, block_size, 0, stream>>>(\n              table, buckets, bucket_max_size, buckets_num, dim, keys, values,\n              scores, evicted_keys, evicted_values, evicted_scores,\n              global_epoch, N);\n    }\n    return;\n  }\n};\n\n}  // namespace merlin\n}  // namespace nv"
  },
  {
    "path": "include/merlin/core_kernels.cuh",
    "content": "/*\n * Copyright (c) 2022, NVIDIA CORPORATION.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http:///www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n#pragma once\n\n#include <cstdint>\n#include <cub/cub.cuh>\n#include <cuda/std/functional>\n#include \"allocator.cuh\"\n#include \"core_kernels/kernel_utils.cuh\"\n#include \"core_kernels/accum_or_assign.cuh\"\n#include \"core_kernels/contains.cuh\"\n#include \"core_kernels/find_or_insert.cuh\"\n#include \"core_kernels/find_ptr_or_insert.cuh\"\n#include \"core_kernels/lookup.cuh\"\n#include \"core_kernels/lookup_ptr.cuh\"\n#include \"core_kernels/update.cuh\"\n#include \"core_kernels/update_score.cuh\"\n#include \"core_kernels/update_values.cuh\"\n#include \"core_kernels/upsert.cuh\"\n#include \"core_kernels/upsert_and_evict.cuh\"\n// Dual-bucket headers depend on types from lookup.cuh and upsert.cuh\n// (FoundFunctorV1, LookupValueBufConfig, Params_Upsert,\n// SharedMemoryManager_Pipeline_Upsert), so they must come after.\n#include \"core_kernels/dual_bucket_utils.cuh\"\n#include \"core_kernels/dual_bucket_upsert.cuh\"\n#include \"core_kernels/dual_bucket_lookup.cuh\"\n\nnamespace nv {\nnamespace merlin {\n\ntemplate <class S>\n__global__ void create_locks(S* __restrict mutex, const size_t start,\n                             const size_t end) {\n  size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x;\n  if (start + tid < end) {\n    new (mutex + start + tid) S();\n  }\n}\n\ntemplate <class S>\n__global__ void release_locks(S* 
__restrict mutex, const size_t start,\n                              const size_t end) {\n  size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x;\n  if (start + tid < end) {\n    (mutex + start + tid)->~S();\n  }\n}\n\ntemplate <class K, class V, class S>\n__global__ void create_atomic_keys(Bucket<K, V, S>* __restrict buckets,\n                                   const size_t start, const size_t end,\n                                   const size_t bucket_max_size,\n                                   const bool dual_bucket_mode = false) {\n  size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x;\n  if (start + tid < end) {\n    const D empty_d =\n        dual_bucket_mode ? dual_bucket_empty_digest<K>() : empty_digest<K>();\n    for (size_t i = 0; i < bucket_max_size; i++)\n      buckets[start + tid].digests(i)[0] = empty_d;\n    for (size_t i = 0; i < bucket_max_size; i++)\n      new (buckets[start + tid].keys(i))\n          AtomicKey<K>{static_cast<K>(EMPTY_KEY)};\n  }\n}\n\ntemplate <class K, class V, class S>\n__global__ void create_atomic_scores(Bucket<K, V, S>* __restrict buckets,\n                                     const size_t start, const size_t end,\n                                     const size_t bucket_max_size) {\n  size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x;\n  if (start + tid < end) {\n    for (size_t i = 0; i < bucket_max_size; i++) {\n      new (buckets[start + tid].scores(i))\n          AtomicScore<S>{static_cast<S>(EMPTY_SCORE)};\n    }\n  }\n}\n\ntemplate <class K, class V, class S>\n__global__ void allocate_bucket_vectors(Bucket<K, V, S>* __restrict buckets,\n                                        const size_t index, V* address) {\n  buckets[index].vectors = address;\n}\n\ntemplate <class K, class V, class S>\n__global__ void allocate_bucket_others(Bucket<K, V, S>* __restrict buckets,\n                                       size_t total_size_per_bucket,\n                                       size_t num_of_buckets,\n                 
                      const int start_index, uint8_t* address,\n                                       const uint32_t reserve_size,\n                                       const size_t bucket_max_size) {\n  for (size_t step = 0; step < num_of_buckets; step++) {\n    size_t index = start_index + step;\n    buckets[index].digests_ = address;\n    buckets[index].keys_ =\n        reinterpret_cast<AtomicKey<K>*>(buckets[index].digests_ + reserve_size);\n    buckets[index].scores_ = reinterpret_cast<AtomicScore<S>*>(\n        buckets[index].keys_ + bucket_max_size);\n    address += total_size_per_bucket;\n  }\n}\n\ntemplate <class K, class V, class S>\n__global__ void get_bucket_others_address(Bucket<K, V, S>* __restrict buckets,\n                                          const int index, uint8_t** address) {\n  *address = buckets[index].digests_;\n}\n\ntemplate <class P>\nvoid realloc(P* ptr, size_t old_size, size_t new_size,\n             BaseAllocator* allocator) {\n  // Truncate old_size to limit dowstream copy ops.\n  old_size = std::min(old_size, new_size);\n\n  // Alloc new buffer and copy at old data.\n  char* new_ptr;\n  allocator->alloc(MemoryType::Device, (void**)&new_ptr, new_size);\n  if (*ptr != nullptr) {\n    CUDA_CHECK(cudaMemcpy(new_ptr, *ptr, old_size, cudaMemcpyDefault));\n    allocator->free(MemoryType::Device, *ptr);\n  }\n\n  // Zero-fill remainder.\n  CUDA_CHECK(cudaMemset(new_ptr + old_size, 0, new_size - old_size));\n\n  // Switch to new pointer.\n  *ptr = reinterpret_cast<P>(new_ptr);\n  return;\n}\n\ntemplate <class P>\nvoid realloc_host(P* ptr, size_t old_size, size_t new_size,\n                  BaseAllocator* allocator) {\n  // Truncate old_size to limit dowstream copy ops.\n  old_size = std::min(old_size, new_size);\n\n  // Alloc new buffer and copy at old data.\n  char* new_ptr = nullptr;\n  allocator->alloc(MemoryType::Host, (void**)&new_ptr, new_size);\n\n  if (*ptr != nullptr) {\n    std::memcpy(new_ptr, *ptr, old_size);\n    
allocator->free(MemoryType::Host, *ptr);\n  }\n\n  // Zero-fill remainder.\n  std::memset(new_ptr + old_size, 0, new_size - old_size);\n\n  // Switch to new pointer.\n  *ptr = reinterpret_cast<P>(new_ptr);\n  return;\n}\n\n/* Initialize the buckets with index from start to end. */\ntemplate <class K, class V, class S>\nvoid initialize_buckets(Table<K, V, S>** table, BaseAllocator* allocator,\n                        const size_t start, const size_t end) {\n  /* As testing results show us, when the number of buckets is greater than\n   * the 4 million the performance will drop significantly, we believe the\n   * to many pinned memory allocation causes this issue, so we change the\n   * strategy to allocate some memory slices whose size is not greater than\n   * 64GB, and put the buckets pointer point to the slices.\n   */\n  MERLIN_CHECK(start < end,\n               \"initialize_buckets, start should be less than end!\");\n  size_t buckets_num = end - start;\n  const size_t total_size_of_vectors =\n      buckets_num * (*table)->bucket_max_size * sizeof(V) * (*table)->dim;\n  const size_t num_of_memory_slices =\n      1 + (total_size_of_vectors - 1) / (*table)->bytes_per_slice;\n  size_t num_of_buckets_in_one_slice =\n      (*table)->bytes_per_slice /\n      ((*table)->bucket_max_size * sizeof(V) * (*table)->dim);\n  size_t num_of_allocated_buckets = 0;\n\n  realloc_host<V**>(\n      &((*table)->slices), (*table)->num_of_memory_slices * sizeof(V*),\n      ((*table)->num_of_memory_slices + num_of_memory_slices) * sizeof(V*),\n      allocator);\n\n  bool mixed_hbm = false;\n  for (size_t i = (*table)->num_of_memory_slices;\n       i < (*table)->num_of_memory_slices + num_of_memory_slices; i++) {\n    if (i == (*table)->num_of_memory_slices + num_of_memory_slices - 1) {\n      num_of_buckets_in_one_slice = buckets_num - num_of_allocated_buckets;\n    }\n    size_t slice_real_size = num_of_buckets_in_one_slice *\n                             (*table)->bucket_max_size * 
sizeof(V) *\n                             (*table)->dim;\n    if ((*table)->remaining_hbm_for_vectors >= slice_real_size) {\n      if (!(*table)->is_pure_hbm) {\n        mixed_hbm = true;\n      }\n      allocator->alloc(MemoryType::Device, (void**)&((*table)->slices[i]),\n                       slice_real_size);\n      (*table)->remaining_hbm_for_vectors -= slice_real_size;\n    } else {\n      (*table)->is_pure_hbm = false;\n      allocator->alloc(MemoryType::Pinned, (void**)&((*table)->slices[i]),\n                       slice_real_size, cudaHostAllocMapped);\n    }\n    for (int j = 0; j < num_of_buckets_in_one_slice; j++) {\n      if ((*table)->is_pure_hbm || mixed_hbm) {\n        size_t index = start + num_of_allocated_buckets + j;\n        V* address =\n            (*table)->slices[i] + j * (*table)->bucket_max_size * (*table)->dim;\n        allocate_bucket_vectors<K, V, S>\n            <<<1, 1>>>((*table)->buckets, index, address);\n        CUDA_CHECK(cudaDeviceSynchronize());\n      } else {\n        V* h_ptr =\n            (*table)->slices[i] + j * (*table)->bucket_max_size * (*table)->dim;\n        V* address = nullptr;\n        CUDA_CHECK(cudaHostGetDevicePointer(&address, h_ptr, 0));\n        size_t index = start + num_of_allocated_buckets + j;\n        allocate_bucket_vectors<K, V, S>\n            <<<1, 1>>>((*table)->buckets, index, address);\n      }\n    }\n    CUDA_CHECK(cudaDeviceSynchronize());\n    num_of_allocated_buckets += num_of_buckets_in_one_slice;\n  }\n\n  (*table)->num_of_memory_slices += num_of_memory_slices;\n  uint32_t bucket_max_size = static_cast<uint32_t>((*table)->bucket_max_size);\n  size_t bucket_memory_size =\n      bucket_max_size * (sizeof(AtomicKey<K>) + sizeof(AtomicScore<S>));\n  // Align to the cache line size.\n  constexpr uint32_t CACHE_LINE_SIZE = 128U / sizeof(uint8_t);\n  uint32_t reserve_size =\n      bucket_max_size < CACHE_LINE_SIZE ? 
CACHE_LINE_SIZE : bucket_max_size;\n  bucket_memory_size += reserve_size * sizeof(uint8_t);\n\n  MERLIN_CHECK(start % (*table)->num_of_buckets_per_alloc == 0,\n               \"initialize_buckets, start must be times of \"\n               \"num_of_buckets_per_alloc!\");\n  /* NOTICE: Only the buckets which index is the times of\n   * `num_of_buckets_per_alloc` will allocate a real address, that provides the\n   * callers a method to avoid memory fragmentation.\n   */\n  for (int i = start; i < end; i += (*table)->num_of_buckets_per_alloc) {\n    uint8_t* address = nullptr;\n    size_t num_of_buckets =\n        std::min(end - i, (*table)->num_of_buckets_per_alloc);\n    allocator->alloc(MemoryType::Device, (void**)&(address),\n                     bucket_memory_size * num_of_buckets);\n    allocate_bucket_others<K, V, S>\n        <<<1, 1>>>((*table)->buckets, bucket_memory_size, num_of_buckets, i,\n                   address, reserve_size, bucket_max_size);\n  }\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  {\n    const size_t block_size = 512;\n    const size_t N = end - start + 1;\n    const int grid_size = SAFE_GET_GRID_SIZE(N, block_size);\n    create_locks<Mutex><<<grid_size, block_size>>>((*table)->locks, start, end);\n  }\n\n  {\n    const size_t block_size = 512;\n    const size_t N = end - start + 1;\n    const int grid_size = SAFE_GET_GRID_SIZE(N, block_size);\n    create_atomic_keys<K, V, S><<<grid_size, block_size>>>(\n        (*table)->buckets, start, end, (*table)->bucket_max_size,\n        (*table)->dual_bucket_mode);\n  }\n\n  {\n    const size_t block_size = 512;\n    const size_t N = end - start + 1;\n    const int grid_size = SAFE_GET_GRID_SIZE(N, block_size);\n    create_atomic_scores<K, V, S><<<grid_size, block_size>>>(\n        (*table)->buckets, start, end, (*table)->bucket_max_size);\n  }\n  CUDA_CHECK(cudaDeviceSynchronize());\n  CudaCheckError();\n}\n\ntemplate <class K, class V, class S>\nsize_t get_slice_size(Table<K, V, S>** table) {\n  
const size_t min_slice_size =\n      (*table)->bucket_max_size * sizeof(V) * (*table)->dim;\n  const size_t max_table_size = (*table)->max_size * sizeof(V) * (*table)->dim;\n  size_t slice_size = 0;\n\n  if (max_table_size >= GB(128)) {\n    slice_size = GB(16);\n  } else if (max_table_size >= GB(16)) {\n    slice_size = GB(2);\n  } else if (max_table_size >= GB(2)) {\n    slice_size = MB(128);\n  } else if (max_table_size >= MB(128)) {\n    slice_size = MB(16);\n  } else if (max_table_size >= MB(16)) {\n    slice_size = MB(1);\n  } else {\n    slice_size = min_slice_size;\n  }\n\n  return std::max(min_slice_size, slice_size);\n}\n\n/* Initialize a Table struct.\n\n   K: The key type\n   V: The value type which should be static array type and C++ class\n      with customized construct is not supported.\n   S: The score type, the score will be used to store the timestamp\n      or occurrence frequency or any thing for eviction.\n   DIM: Vector dimension.\n*/\ntemplate <class K, class V, class S>\nvoid create_table(Table<K, V, S>** table, BaseAllocator* allocator,\n                  const size_t dim, const size_t init_size = 134217728,\n                  const size_t max_size = std::numeric_limits<size_t>::max(),\n                  const size_t max_hbm_for_vectors = 0,\n                  const size_t bucket_max_size = 128,\n                  const size_t num_of_buckets_per_alloc = 1,\n                  const size_t tile_size = 32, const bool primary = true,\n                  const bool dual_bucket_mode = false) {\n  allocator->alloc(MemoryType::Host, (void**)table, sizeof(Table<K, V, S>));\n  std::memset(*table, 0, sizeof(Table<K, V, S>));\n  (*table)->dual_bucket_mode = dual_bucket_mode;\n  (*table)->dim = dim;\n  (*table)->bucket_max_size = bucket_max_size;\n  (*table)->max_size = std::max(init_size, max_size);\n  (*table)->tile_size = tile_size;\n  (*table)->is_pure_hbm = true;\n  (*table)->bytes_per_slice = get_slice_size<K, V, S>(table);\n  
(*table)->num_of_buckets_per_alloc = num_of_buckets_per_alloc;\n\n  // The bucket number will be the minimum needed for saving memory if no\n  // rehash.\n  if ((init_size * 2) > (*table)->max_size) {\n    (*table)->buckets_num =\n        1 + (((*table)->max_size - 1) / (*table)->bucket_max_size);\n  } else {\n    (*table)->buckets_num = 1;\n    while ((*table)->buckets_num * (*table)->bucket_max_size < init_size) {\n      (*table)->buckets_num *= 2;\n    }\n  }\n\n  (*table)->capacity = (*table)->buckets_num * (*table)->bucket_max_size;\n  (*table)->max_hbm_for_vectors = max_hbm_for_vectors;\n  (*table)->remaining_hbm_for_vectors = max_hbm_for_vectors;\n  (*table)->primary = primary;\n\n  allocator->alloc(MemoryType::Device, (void**)&((*table)->locks),\n                   (*table)->buckets_num * sizeof(Mutex));\n  CUDA_CHECK(\n      cudaMemset((*table)->locks, 0, (*table)->buckets_num * sizeof(Mutex)));\n\n  allocator->alloc(MemoryType::Device, (void**)&((*table)->buckets_size),\n                   (*table)->buckets_num * sizeof(int));\n  CUDA_CHECK(cudaMemset((*table)->buckets_size, 0,\n                        (*table)->buckets_num * sizeof(int)));\n\n  allocator->alloc(MemoryType::Device, (void**)&((*table)->buckets),\n                   (*table)->buckets_num * sizeof(Bucket<K, V, S>));\n  CUDA_CHECK(cudaMemset((*table)->buckets, 0,\n                        (*table)->buckets_num * sizeof(Bucket<K, V, S>)));\n\n  initialize_buckets<K, V, S>(table, allocator, 0, (*table)->buckets_num);\n  CudaCheckError();\n}\n\n/* Double the capacity on storage, must be followed by calling the\n * rehash_kernel. 
*/\ntemplate <class K, class V, class S>\nvoid double_capacity(Table<K, V, S>** table, BaseAllocator* allocator) {\n  realloc<Mutex*>(&((*table)->locks), (*table)->buckets_num * sizeof(Mutex),\n                  (*table)->buckets_num * sizeof(Mutex) * 2, allocator);\n  realloc<int*>(&((*table)->buckets_size), (*table)->buckets_num * sizeof(int),\n                (*table)->buckets_num * sizeof(int) * 2, allocator);\n\n  realloc<Bucket<K, V, S>*>(\n      &((*table)->buckets), (*table)->buckets_num * sizeof(Bucket<K, V, S>),\n      (*table)->buckets_num * sizeof(Bucket<K, V, S>) * 2, allocator);\n\n  initialize_buckets<K, V, S>(table, allocator, (*table)->buckets_num,\n                              (*table)->buckets_num * 2);\n\n  (*table)->capacity *= 2;\n  (*table)->buckets_num *= 2;\n}\n\n/* free all of the resource of a Table. */\ntemplate <class K, class V, class S>\nvoid destroy_table(Table<K, V, S>** table, BaseAllocator* allocator) {\n  uint8_t** d_address = nullptr;\n  CUDA_CHECK(cudaMalloc((void**)&d_address, sizeof(uint8_t*)));\n  /* NOTICE: Only the buckets which index is the times of\n   * `num_of_buckets_per_alloc` will hold a real address, and need to be freed\n   */\n  for (int i = 0; i < (*table)->buckets_num;\n       i += (*table)->num_of_buckets_per_alloc) {\n    uint8_t* h_address;\n    get_bucket_others_address<K, V, S>\n        <<<1, 1>>>((*table)->buckets, i, d_address);\n    CUDA_CHECK(cudaMemcpy(&h_address, d_address, sizeof(uint8_t*),\n                          cudaMemcpyDeviceToHost));\n    allocator->free(MemoryType::Device, h_address);\n  }\n  CUDA_CHECK(cudaFree(d_address));\n\n  for (int i = 0; i < (*table)->num_of_memory_slices; i++) {\n    if (is_on_device((*table)->slices[i])) {\n      allocator->free(MemoryType::Device, (*table)->slices[i]);\n    } else {\n      allocator->free(MemoryType::Pinned, (*table)->slices[i]);\n    }\n  }\n  {\n    const size_t block_size = 512;\n    const size_t N = (*table)->buckets_num;\n    const int 
grid_size = SAFE_GET_GRID_SIZE(N, block_size);\n    release_locks<Mutex>\n        <<<grid_size, block_size>>>((*table)->locks, 0, (*table)->buckets_num);\n  }\n  allocator->free(MemoryType::Host, (*table)->slices);\n  allocator->free(MemoryType::Device, (*table)->buckets_size);\n  allocator->free(MemoryType::Device, (*table)->buckets);\n  allocator->free(MemoryType::Device, (*table)->locks);\n  allocator->free(MemoryType::Host, *table);\n  CUDA_CHECK(cudaDeviceSynchronize());\n  CudaCheckError();\n}\n\ntemplate <class K, class V, class S, uint32_t TILE_SIZE = 4>\n__forceinline__ __device__ void defragmentation_for_rehash(\n    Bucket<K, V, S>* __restrict bucket, uint32_t remove_pos,\n    const size_t bucket_max_size, const size_t buckets_num, const size_t dim) {\n  uint32_t key_idx;\n  size_t global_idx = 0;\n  size_t start_idx = 0;\n  K find_key;\n  K hashed_key;\n\n  uint32_t empty_pos = remove_pos;\n\n  int i = 1;\n  while (i < bucket_max_size) {\n    key_idx = (remove_pos + i) & (bucket_max_size - 1);\n    find_key = (bucket->keys(key_idx))->load(cuda::std::memory_order_relaxed);\n    if (find_key == static_cast<K>(EMPTY_KEY)) {\n      break;\n    }\n    hashed_key = Murmur3HashDevice(find_key);\n    global_idx = hashed_key % (buckets_num * bucket_max_size);\n    start_idx = get_start_position(global_idx, bucket_max_size);\n\n    if ((start_idx <= empty_pos && empty_pos < key_idx) ||\n        (key_idx < start_idx && start_idx <= empty_pos) ||\n        (empty_pos <= key_idx && key_idx < start_idx)) {\n      const K key =\n          (*(bucket->keys(key_idx))).load(cuda::std::memory_order_relaxed);\n      bucket->digests(empty_pos)[0] = get_digest<K>(key);\n      (*(bucket->keys(empty_pos))).store(key, cuda::std::memory_order_relaxed);\n      const S score =\n          (*(bucket->scores(key_idx))).load(cuda::std::memory_order_relaxed);\n      (*(bucket->scores(empty_pos)))\n          .store(score, cuda::std::memory_order_relaxed);\n      for (int j = 0; j < dim; 
j++) {\n        bucket->vectors[empty_pos * dim + j] =\n            bucket->vectors[key_idx * dim + j];\n      }\n      bucket->digests(key_idx)[0] = empty_digest<K>();\n      (*(bucket->keys(key_idx)))\n          .store(static_cast<K>(EMPTY_KEY), cuda::std::memory_order_relaxed);\n      empty_pos = key_idx;\n      remove_pos = key_idx;\n      i = 1;\n    } else {\n      i++;\n    }\n  }\n}\n\ntemplate <class K, class V, class S, uint32_t TILE_SIZE = 4>\n__forceinline__ __device__ void move_key_to_new_bucket(\n    cg::thread_block_tile<TILE_SIZE> g, int rank, const K& key, const S& score,\n    const V* __restrict vector, Bucket<K, V, S>* __restrict new_bucket,\n    const size_t new_bkt_idx, const size_t new_start_idx,\n    int* __restrict buckets_size, const size_t bucket_max_size,\n    const size_t buckets_num, const size_t dim) {\n  uint32_t key_pos;\n  unsigned empty_vote;\n  int src_lane;\n\n  for (uint32_t tile_offset = 0; tile_offset < bucket_max_size;\n       tile_offset += TILE_SIZE) {\n    size_t key_offset =\n        (new_start_idx + tile_offset + rank) & (bucket_max_size - 1);\n    const K current_key =\n        (*(new_bucket->keys(key_offset))).load(cuda::std::memory_order_relaxed);\n    empty_vote = g.ballot(current_key == static_cast<K>(EMPTY_KEY));\n    if (empty_vote) {\n      src_lane = __ffs(empty_vote) - 1;\n      key_pos =\n          (new_start_idx + tile_offset + src_lane) & (bucket_max_size - 1);\n      if (rank == src_lane) {\n        new_bucket->digests(key_pos)[0] = get_digest<K>(key);\n        new_bucket->keys(key_pos)->store(key, cuda::std::memory_order_relaxed);\n        new_bucket->scores(key_pos)->store(score,\n                                           cuda::std::memory_order_relaxed);\n        atomicAdd(&(buckets_size[new_bkt_idx]), 1);\n      }\n      copy_vector<V, TILE_SIZE>(g, vector, new_bucket->vectors + key_pos * dim,\n                                dim);\n      break;\n    }\n  }\n}\n\ntemplate <class K, class V, class S, 
uint32_t TILE_SIZE = 4>\n__global__ void rehash_kernel_for_fast_mode(\n    const Table<K, V, S>* __restrict table, Bucket<K, V, S>* buckets,\n    size_t N) {\n  int* __restrict buckets_size = table->buckets_size;\n  const size_t bucket_max_size = table->bucket_max_size;\n  const size_t buckets_num = table->buckets_num;\n  const size_t dim = table->dim;\n\n  auto g = cg::tiled_partition<TILE_SIZE>(cg::this_thread_block());\n  int rank = g.thread_rank();\n  size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x;\n  size_t global_idx;\n  uint32_t start_idx = 0;\n  K target_key = 0;\n  S target_score = 0;\n\n  for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) {\n    uint32_t bkt_idx = t / TILE_SIZE;\n    Bucket<K, V, S>* bucket = (buckets + bkt_idx);\n\n    lock<Mutex, TILE_SIZE>(g, table->locks[bkt_idx]);\n    uint32_t key_idx = 0;\n    while (key_idx < bucket_max_size) {\n      key_idx = g.shfl(key_idx, 0);\n      target_key =\n          (bucket->keys(key_idx))->load(cuda::std::memory_order_relaxed);\n      target_score =\n          bucket->scores(key_idx)->load(cuda::std::memory_order_relaxed);\n      if (target_key != static_cast<K>(EMPTY_KEY) &&\n          target_key != static_cast<K>(RECLAIM_KEY)) {\n        const K hashed_key = Murmur3HashDevice(target_key);\n        global_idx = hashed_key % (buckets_num * bucket_max_size);\n        uint32_t new_bkt_idx = global_idx / bucket_max_size;\n        if (new_bkt_idx != bkt_idx) {\n          start_idx = get_start_position(global_idx, bucket_max_size);\n          move_key_to_new_bucket<K, V, S, TILE_SIZE>(\n              g, rank, target_key, target_score,\n              (bucket->vectors + key_idx * dim), buckets + new_bkt_idx,\n              new_bkt_idx, start_idx, buckets_size, bucket_max_size,\n              buckets_num, table->dim);\n          if (rank == 0) {\n            bucket->digests(key_idx)[0] = empty_digest<K>();\n            (bucket->keys(key_idx))\n                ->store(static_cast<K>(EMPTY_KEY),\n   
                     cuda::std::memory_order_relaxed);\n            atomicSub(&(buckets_size[bkt_idx]), 1);\n            defragmentation_for_rehash<K, V, S, TILE_SIZE>(\n                bucket, key_idx, bucket_max_size, buckets_num / 2, dim);\n            key_idx = 0;\n          }\n        } else {\n          key_idx++;\n        }\n      } else {\n        key_idx++;\n      }\n    }\n    unlock<Mutex, TILE_SIZE>(g, table->locks[bkt_idx]);\n  }\n}\n\n/* Read the N data from src to each address in *dst,\n   usually called by upsert kernel.\n\n   `src`: A pointer of pointer of V which should be on HBM,\n          but each value (a pointer of V) could point to a\n          memory on HBM or HMEM.\n   `dst`: A continue memory pointer with Vector\n          which should be HBM.\n   `mask`: One for each `dst`. If true, reading from src,\n           or false reading from default_val.\n   `default_val`: Default value with shape (1, DIM) or (N, DIM)\n   `N`: The number of vectors needed to be read.\n   'full_size_default':\n      If true, the d_def_val will be treated as\n      a full size default value which shape must be (N, DIM).\n*/\ntemplate <class K, class V, class S>\n__global__ void read_kernel(const V* const* __restrict src, V* __restrict dst,\n                            const bool* mask, const int* __restrict dst_offset,\n                            const size_t dim, size_t N) {\n  size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x;\n\n  for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) {\n    int vec_index = int(t / dim);\n    int dim_index = t % dim;\n    int real_dst_offset =\n        dst_offset != nullptr ? 
dst_offset[vec_index] : vec_index;\n\n    /// Copy selected values and fill in default value for all others.\n    if (mask[real_dst_offset] && src[vec_index] != nullptr) {\n      dst[real_dst_offset * dim + dim_index] = src[vec_index][dim_index];\n    }\n  }\n}\n\n/* Read the N data from src to each address in *dst,\n *  usually called by upsert kernel.\n *\n *  `src`: A pointer of pointer of V which should be on HBM,\n *         but each value (a pointer of V) could point to a\n *         memory on HBM or HMEM.\n *  `dst`: A continue memory pointer with Vector\n *         which should be HBM.\n *  `N`: Number of vectors needed to be read.\n */\ntemplate <class K, class V, class S>\n__global__ void read_kernel(const V* const* __restrict src, V* __restrict dst,\n                            const int* __restrict dst_offset, const size_t dim,\n                            const size_t N) {\n  size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x;\n\n  for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) {\n    int vec_index = int(t / dim);\n    int real_dst_offset =\n        dst_offset != nullptr ? dst_offset[vec_index] : vec_index;\n    int dim_index = t % dim;\n    if (src[vec_index] != nullptr) {\n      dst[real_dst_offset * dim + dim_index] = src[vec_index][dim_index];\n    }\n  }\n}\n\n/* Clear all key-value in the table. */\ntemplate <class K, class V, class S>\n__global__ void clear_kernel(Table<K, V, S>* __restrict table,\n                             Bucket<K, V, S>* buckets, size_t N) {\n  size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x;\n  const size_t bucket_max_size = table->bucket_max_size;\n  const D empty_d = table->dual_bucket_mode ? 
dual_bucket_empty_digest<K>()\n                                            : empty_digest<K>();\n\n  for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) {\n    int key_idx = t % bucket_max_size;\n    int bkt_idx = t / bucket_max_size;\n    Bucket<K, V, S>* bucket = &(buckets[bkt_idx]);\n\n    bucket->digests(key_idx)[0] = empty_d;\n    (bucket->keys(key_idx))\n        ->store(static_cast<K>(EMPTY_KEY), cuda::std::memory_order_relaxed);\n    if (key_idx == 0) {\n      table->buckets_size[bkt_idx] = 0;\n    }\n  }\n}\n\n/* Remove specified keys. */\ntemplate <class K, class V, class S, uint32_t TILE_SIZE = 4>\n__global__ void remove_kernel(const Table<K, V, S>* __restrict table,\n                              const K* __restrict keys,\n                              Bucket<K, V, S>* __restrict buckets,\n                              int* __restrict buckets_size,\n                              const size_t bucket_max_size,\n                              const size_t buckets_num, size_t N) {\n  auto g = cg::tiled_partition<TILE_SIZE>(cg::this_thread_block());\n  int rank = g.thread_rank();\n\n  for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N;\n       t += blockDim.x * gridDim.x) {\n    int key_idx = t / TILE_SIZE;\n    K find_key = keys[key_idx];\n    if (IS_RESERVED_KEY<K>(find_key)) continue;\n\n    int key_pos = -1;\n\n    size_t bkt_idx = 0;\n    size_t start_idx = 0;\n    uint32_t tile_offset = 0;\n\n    Bucket<K, V, S>* bucket = get_key_position<K>(\n        buckets, find_key, bkt_idx, start_idx, buckets_num, bucket_max_size);\n\n    unsigned found_vote = 0;\n#pragma unroll\n    for (tile_offset = 0; tile_offset < bucket_max_size;\n         tile_offset += TILE_SIZE) {\n      key_pos = (start_idx + tile_offset + rank) & (bucket_max_size - 1);\n\n      const K current_key =\n          (bucket->keys(key_pos))->load(cuda::std::memory_order_relaxed);\n\n      found_vote = g.ballot(find_key == current_key);\n      if (found_vote) {\n        break;\n  
    }\n\n      if (g.any(current_key == static_cast<K>(EMPTY_KEY))) {\n        break;\n      }\n    }\n\n    if (found_vote) {\n      const int src_lane = __ffs(found_vote) - 1;\n\n      if (g.thread_rank() == src_lane) {\n        const int key_pos =\n            (start_idx + tile_offset + src_lane) & (bucket_max_size - 1);\n        bucket->digests(key_pos)[0] = reclaim_digest<K>();\n        (bucket->keys(key_pos))\n            ->store(static_cast<K>(RECLAIM_KEY),\n                    cuda::std::memory_order_relaxed);\n        (bucket->scores(key_pos))\n            ->store(static_cast<S>(EMPTY_SCORE),\n                    cuda::std::memory_order_relaxed);\n        atomicSub(&buckets_size[bkt_idx], 1);\n      }\n      break;\n    }\n  }\n}\n\n/* Remove specified keys which match the Predict. */\ntemplate <class K, class V, class S,\n          template <typename, typename> class PredFunctor,\n          uint32_t TILE_SIZE = 1>\n__global__ void remove_kernel(const Table<K, V, S>* __restrict table,\n                              const K pattern, const S threshold,\n                              size_t* __restrict count,\n                              Bucket<K, V, S>* __restrict buckets,\n                              int* __restrict buckets_size,\n                              const size_t bucket_max_size,\n                              const size_t buckets_num, size_t N) {\n  auto g = cg::tiled_partition<TILE_SIZE>(cg::this_thread_block());\n  PredFunctor<K, S> pred;\n\n  for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N;\n       t += blockDim.x * gridDim.x) {\n    uint32_t bkt_idx = t;\n    uint32_t key_pos = 0;\n\n    Bucket<K, V, S>* bucket = buckets + bkt_idx;\n\n    K current_key = 0;\n    S current_score = 0;\n    uint32_t key_offset = 0;\n    while (key_offset < bucket_max_size) {\n      current_key =\n          bucket->keys(key_offset)->load(cuda::std::memory_order_relaxed);\n      current_score =\n          
bucket->scores(key_offset)->load(cuda::std::memory_order_relaxed);\n      if (!IS_RESERVED_KEY<K>(current_key)) {\n        if (pred(current_key, current_score, pattern, threshold)) {\n          atomicAdd(count, 1);\n          key_pos = key_offset;\n          bucket->digests(key_pos)[0] = reclaim_digest<K>();\n          (bucket->keys(key_pos))\n              ->store(static_cast<K>(RECLAIM_KEY),\n                      cuda::std::memory_order_relaxed);\n          (bucket->scores(key_pos))\n              ->store(static_cast<S>(EMPTY_SCORE),\n                      cuda::std::memory_order_relaxed);\n          atomicSub(&buckets_size[bkt_idx], 1);\n        } else {\n          key_offset++;\n        }\n      } else {\n        key_offset++;\n      }\n    }\n  }\n}\n\ntemplate <typename K, typename V, typename S, typename PredFunctor,\n          uint32_t GroupSize = 32>\n__global__ void remove_kernel_v2(const uint64_t search_length,\n                                 const uint64_t offset, PredFunctor pred,\n                                 Bucket<K, V, S>* buckets,\n                                 int* __restrict buckets_size,\n                                 const uint64_t bucket_capacity,\n                                 const uint64_t dim, uint64_t* remove_counter) {\n  cg::thread_block_tile<GroupSize> g =\n      cg::tiled_partition<GroupSize>(cg::this_thread_block());\n\n  uint64_t tid = static_cast<uint64_t>(blockIdx.x) * blockDim.x + threadIdx.x;\n\n  for (uint64_t i = tid; i < search_length; i += gridDim.x * blockDim.x) {\n    uint64_t bkt_idx = (i + offset) / bucket_capacity;\n    uint64_t key_idx = (i + offset) % bucket_capacity;\n\n    // May be different for threads within the same group.\n    Bucket<K, V, S>* bucket = buckets + bkt_idx;\n\n    const K key = bucket->keys(key_idx)->load(cuda::std::memory_order_relaxed);\n    const S score =\n        bucket->scores(key_idx)->load(cuda::std::memory_order_relaxed);\n    const V* value = bucket->vectors + key_idx * 
dim;\n\n    bool match = pred.template operator()<GroupSize>(key, value, score, g);\n    if (IS_RESERVED_KEY<K>(key)) {\n      match = false;\n    }\n    uint32_t vote = g.ballot(match);\n    int group_cnt = __popc(vote);\n    if (g.thread_rank() == 0) {\n      atomicAdd(remove_counter, static_cast<uint64_t>(group_cnt));\n      if (bucket_capacity >= GroupSize) {\n        atomicSub(&buckets_size[bkt_idx], group_cnt);\n      }\n    }\n    // Only matched threads need to erase.\n    if (match) {\n      bucket->digests(key_idx)[0] = reclaim_digest<K>();\n      bucket->keys(key_idx)->store(static_cast<K>(RECLAIM_KEY),\n                                   cuda::std::memory_order_relaxed);\n      bucket->scores(key_idx)->store(static_cast<S>(EMPTY_SCORE),\n                                     cuda::std::memory_order_relaxed);\n      if (bucket_capacity < GroupSize) {\n        atomicSub(&buckets_size[bkt_idx], 1);\n      }\n    }\n  }\n}\n\n/* Dump with score. */\ntemplate <class K, class V, class S>\ninline std::tuple<size_t, size_t> dump_kernel_shared_memory_size(\n    const size_t available_shared_memory) {\n  const size_t block_size{std::min(\n      available_shared_memory / 2 / sizeof(KVM<K, V, S>), UINT64_C(1024))};\n  MERLIN_CHECK(\n      block_size > 0,\n      \"[HierarchicalKV] block_size <= 0, the K-V-S size may be too large!\");\n\n  return std::make_tuple(block_size * sizeof(KVM<K, V, S>), block_size);\n}\n\ntemplate <class K, class V, class S>\n__global__ void dump_kernel(const Table<K, V, S>* __restrict table,\n                            Bucket<K, V, S>* buckets, K* d_key,\n                            V* __restrict d_val, S* __restrict d_score,\n                            const size_t offset, const size_t search_length,\n                            size_t* d_dump_counter) {\n  extern __shared__ unsigned char s[];\n  KVM<K, V, S>* const block_tuples{reinterpret_cast<KVM<K, V, S>*>(s)};\n\n  const size_t bucket_max_size{table->bucket_max_size};\n  const 
size_t dim{table->dim};\n\n  __shared__ size_t block_acc;\n  __shared__ size_t global_acc;\n\n  const size_t tid{blockIdx.x * blockDim.x + threadIdx.x};\n\n  if (threadIdx.x == 0) {\n    block_acc = 0;\n  }\n  __syncthreads();\n\n  if (tid < search_length) {\n    Bucket<K, V, S>* const bucket{&buckets[(tid + offset) / bucket_max_size]};\n\n    const int key_idx{static_cast<int>((tid + offset) % bucket_max_size)};\n    const K key{(bucket->keys(key_idx))->load(cuda::std::memory_order_relaxed)};\n\n    if (!IS_RESERVED_KEY<K>(key)) {\n      size_t local_index{atomicAdd(&block_acc, 1)};\n      block_tuples[local_index] = {\n          key, &bucket->vectors[key_idx * dim],\n          bucket->scores(key_idx)->load(cuda::std::memory_order_relaxed)};\n    }\n  }\n  __syncthreads();\n\n  if (threadIdx.x == 0) {\n    global_acc = atomicAdd(d_dump_counter, block_acc);\n  }\n  __syncthreads();\n\n  if (threadIdx.x < block_acc) {\n    const KVM<K, V, S>& tuple{block_tuples[threadIdx.x]};\n\n    const size_t j{global_acc + threadIdx.x};\n    d_key[j] = tuple.key;\n    for (int i{0}; i < dim; ++i) {\n      d_val[j * dim + i] = tuple.value[i];\n    }\n    if (d_score != nullptr) {\n      d_score[j] = tuple.score;\n    }\n  }\n}\n\n/* Dump with score. 
*/\ntemplate <class K, class V, class S,\n          template <typename, typename> class PredFunctor>\n__global__ void dump_kernel(const Table<K, V, S>* __restrict table,\n                            Bucket<K, V, S>* buckets, const K pattern,\n                            const S threshold, K* d_key, V* __restrict d_val,\n                            S* __restrict d_score, const size_t offset,\n                            const size_t search_length,\n                            size_t* d_dump_counter) {\n  extern __shared__ unsigned char s[];\n  const size_t bucket_max_size = table->bucket_max_size;\n  const size_t dim = table->dim;\n  K* smem = (K*)s;\n  K* block_result_key = smem;\n  V* block_result_val = (V*)&(smem[blockDim.x]);\n  S* block_result_score = (S*)&(block_result_val[blockDim.x * dim]);\n  __shared__ size_t block_acc;\n  __shared__ size_t global_acc;\n  PredFunctor<K, S> pred;\n\n  const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n\n  if (threadIdx.x == 0) {\n    block_acc = 0;\n  }\n  __syncthreads();\n\n  if (tid < search_length) {\n    int bkt_idx = (tid + offset) / bucket_max_size;\n    int key_idx = (tid + offset) % bucket_max_size;\n    Bucket<K, V, S>* bucket = &(buckets[bkt_idx]);\n\n    const K key =\n        (bucket->keys(key_idx))->load(cuda::std::memory_order_relaxed);\n    S score = bucket->scores(key_idx)->load(cuda::std::memory_order_relaxed);\n\n    if (!IS_RESERVED_KEY<K>(key) && pred(key, score, pattern, threshold)) {\n      size_t local_index = atomicAdd(&block_acc, 1);\n      block_result_key[local_index] = key;\n      for (int i = 0; i < dim; i++) {\n        atomicExch(&(block_result_val[local_index * dim + i]),\n                   bucket->vectors[key_idx * dim + i]);\n      }\n      if (d_score != nullptr) {\n        block_result_score[local_index] = score;\n      }\n    }\n  }\n  __syncthreads();\n\n  if (threadIdx.x == 0) {\n    global_acc = atomicAdd(d_dump_counter, block_acc);\n  }\n  __syncthreads();\n\n  if 
(threadIdx.x < block_acc) {\n    d_key[global_acc + threadIdx.x] = block_result_key[threadIdx.x];\n    for (int i = 0; i < dim; i++) {\n      d_val[(global_acc + threadIdx.x) * dim + i] =\n          block_result_val[threadIdx.x * dim + i];\n    }\n    if (d_score != nullptr) {\n      d_score[global_acc + threadIdx.x] = block_result_score[threadIdx.x];\n    }\n  }\n}\n\ntemplate <class K, class V, class S, class VecV,\n          template <typename, typename> class PredFunctor, int TILE_SIZE>\n__global__ void dump_kernel_v2(const Table<K, V, S>* __restrict table,\n                               Bucket<K, V, S>* buckets, const K pattern,\n                               const S threshold, K* d_key, V* __restrict d_val,\n                               S* __restrict d_score, const size_t offset,\n                               const size_t search_length,\n                               size_t* d_dump_counter) {\n  const size_t bucket_max_size = table->bucket_max_size;\n  int vec_dim = table->dim * sizeof(V) / sizeof(VecV);\n  auto g = cg::tiled_partition<TILE_SIZE>(cg::this_thread_block());\n\n  PredFunctor<K, S> pred;\n  size_t tid = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;\n\n  for (size_t i = tid; i < search_length; i += gridDim.x * blockDim.x) {\n    size_t bkt_idx = (i + offset) / bucket_max_size;\n    size_t key_idx = (i + offset) % bucket_max_size;\n    size_t leading_key_idx = key_idx / TILE_SIZE * TILE_SIZE;\n    Bucket<K, V, S>* bucket = &(buckets[bkt_idx]);\n\n    const K key =\n        (bucket->keys(key_idx))->load(cuda::std::memory_order_relaxed);\n    S score = bucket->scores(key_idx)->load(cuda::std::memory_order_relaxed);\n\n    bool match =\n        (!IS_RESERVED_KEY<K>(key)) && pred(key, score, pattern, threshold);\n    unsigned int vote = g.ballot(match);\n    int tile_cnt = __popc(vote);\n    size_t tile_offset = 0;\n    if (g.thread_rank() == 0) {\n      tile_offset = atomicAdd(d_dump_counter, static_cast<size_t>(tile_cnt));\n    
}\n    tile_offset = g.shfl(tile_offset, 0);\n    int bias_g = tile_cnt - __popc(vote >> (key_idx % TILE_SIZE));\n\n    if (match) {\n      d_key[tile_offset + bias_g] = key;\n      if (d_score) {\n        d_score[tile_offset + bias_g] = score;\n      }\n    }\n\n#pragma unroll\n    for (int r = 0; r < TILE_SIZE; r++) {\n      unsigned int biased_vote = vote >> r;\n      bool cur_match = biased_vote & 1;\n      if (cur_match) {\n        int bias = tile_cnt - __popc(biased_vote);\n        size_t cur_idx = leading_key_idx + r;\n\n        VecV* d_val_vec = reinterpret_cast<VecV*>(d_val);\n        VecV* vec = reinterpret_cast<VecV*>(bucket->vectors);\n        for (int j = g.thread_rank(); j < vec_dim; j += TILE_SIZE) {\n          d_val_vec[(tile_offset + bias) * vec_dim + j] =\n              vec[cur_idx * vec_dim + j];\n        }\n      }\n    }\n  }\n}\n\ntemplate <typename K, typename V, typename S, typename PredFunctor,\n          uint32_t GroupSize = 32>\n__global__ void dump_kernel(const uint64_t search_length, const uint64_t offset,\n                            PredFunctor pred, Bucket<K, V, S>* buckets,\n                            const uint64_t bucket_capacity, const uint64_t dim,\n                            K* __restrict__ out_keys, V* __restrict__ out_vals,\n                            S* __restrict__ out_scores,\n                            uint64_t* dump_counter) {\n  cg::thread_block_tile<GroupSize> g =\n      cg::tiled_partition<GroupSize>(cg::this_thread_block());\n\n  uint64_t tid = static_cast<uint64_t>(blockIdx.x) * blockDim.x + threadIdx.x;\n\n  for (uint64_t i = tid; i < search_length; i += gridDim.x * blockDim.x) {\n    uint64_t bkt_idx = (i + offset) / bucket_capacity;\n    uint64_t key_idx = (i + offset) % bucket_capacity;\n\n    // May be different for threads within the same group.\n    Bucket<K, V, S>* bucket = buckets + bkt_idx;\n\n    const K key = bucket->keys(key_idx)->load(cuda::std::memory_order_relaxed);\n    const S score =\n        
bucket->scores(key_idx)->load(cuda::std::memory_order_relaxed);\n    const V* value = bucket->vectors + key_idx * dim;\n\n    bool match = pred.template operator()<GroupSize>(key, value, score, g);\n    uint32_t vote = g.ballot(match);\n    int group_cnt = __popc(vote);\n    uint64_t group_offset = 0;\n    if (g.thread_rank() == 0) {\n      group_offset = atomicAdd(dump_counter, static_cast<uint64_t>(group_cnt));\n    }\n    group_offset = g.shfl(group_offset, 0);\n    // Each thread gets the count of previous matches ranks.\n    // Using `g.thread_rank()` instead of `key_idx % GroupSize` to handle case:\n    // bucket_capacity < GroupSize.\n    int previous_cnt = group_cnt - __popc(vote >> g.thread_rank());\n    // Only matched threads need to output.\n    if (match) {\n      out_keys[group_offset + previous_cnt] = key;\n      if (out_scores) {\n        out_scores[group_offset + previous_cnt] = score;\n      }\n    }\n\n    for (int r = 0; r < GroupSize; r++) {\n      uint32_t biased_vote = vote >> r;\n      bool cur_match = biased_vote & 1;\n      if (cur_match) {\n        int bias = group_cnt - __popc(biased_vote);\n\n        /// TODO:timing them\n        //----------------------- Solution 1\n        // uint64_t cur_bkt_idx = g.shfl(bkt_idx, r);\n        // uint64_t cur_key_idx = g.shfl(key_idx, r);\n        // auto cur_bucket = buckets + cur_bkt_idx;\n        //----------------------- Solution 2\n        uint64_t cur_idx = (i / GroupSize) * GroupSize + r + offset;\n        uint64_t cur_bkt_idx = cur_idx / bucket_capacity;\n        uint64_t cur_key_idx = cur_idx % bucket_capacity;\n        Bucket<K, V, S>* cur_bucket = buckets + cur_bkt_idx;\n\n        for (int j = g.thread_rank(); j < dim; j += GroupSize) {\n          out_vals[(group_offset + bias) * dim + j] =\n              cur_bucket->vectors[cur_key_idx * dim + j];\n        }\n      }\n    }\n  }\n}\n\ntemplate <class K, class V, class S,\n          template <typename, typename> class 
PredFunctor>\n__global__ void size_if_kernel(const Table<K, V, S>* __restrict table,\n                               Bucket<K, V, S>* buckets, const K pattern,\n                               const S threshold, size_t* d_counter) {\n  extern __shared__ unsigned char s[];\n\n  const size_t bucket_max_size{table->bucket_max_size};\n\n  size_t local_acc = 0;\n  __shared__ size_t block_acc;\n  PredFunctor<K, S> pred;\n\n  const size_t tid{blockIdx.x * blockDim.x + threadIdx.x};\n\n  if (threadIdx.x == 0) {\n    block_acc = 0;\n  }\n  __syncthreads();\n\n  for (size_t i = tid; i < table->capacity; i += blockDim.x * gridDim.x) {\n    Bucket<K, V, S>* const bucket{&buckets[i / bucket_max_size]};\n\n    const int key_idx{static_cast<int>(i % bucket_max_size)};\n    const K key{(bucket->keys(key_idx))->load(cuda::std::memory_order_relaxed)};\n    S score = bucket->scores(key_idx)->load(cuda::std::memory_order_relaxed);\n\n    if ((!IS_RESERVED_KEY(key)) && pred(key, score, pattern, threshold)) {\n      ++local_acc;\n    }\n  }\n  atomicAdd(&block_acc, local_acc);\n  __syncthreads();\n\n  if (threadIdx.x == 0) {\n    atomicAdd(d_counter, block_acc);\n  }\n}\n\ntemplate <typename K, typename V, typename S, typename ExecutionFunc,\n          uint32_t GroupSize = 32>\n__global__ void traverse_kernel(const uint64_t search_length,\n                                const uint64_t offset, ExecutionFunc f,\n                                Bucket<K, V, S>* buckets,\n                                const uint64_t bucket_capacity,\n                                const uint64_t dim) {\n  cg::thread_block_tile<GroupSize> g =\n      cg::tiled_partition<GroupSize>(cg::this_thread_block());\n\n  uint64_t tid = static_cast<uint64_t>(blockIdx.x) * blockDim.x + threadIdx.x;\n\n  for (uint64_t i = tid; i < search_length; i += gridDim.x * blockDim.x) {\n    uint64_t bkt_idx = (i + offset) / bucket_capacity;\n    uint64_t key_idx = (i + offset) % bucket_capacity;\n\n    // May be different for 
threads within the same group.\n    Bucket<K, V, S>* bucket = buckets + bkt_idx;\n\n    const K key = bucket->keys(key_idx)->load(cuda::std::memory_order_relaxed);\n    S* score = reinterpret_cast<S*>(bucket->scores(key_idx));\n    V* value = bucket->vectors + key_idx * dim;\n\n    f.template operator()<GroupSize>(key, value, score, g);\n  }\n}\n\ntemplate <typename K>\n__global__ void unlock_keys_kernel(uint64_t n, K** __restrict__ locked_key_ptrs,\n                                   const K* __restrict__ keys,\n                                   bool* __restrict__ succeededs) {\n  int kv_idx = blockIdx.x * blockDim.x + threadIdx.x;\n  if (kv_idx < n) {\n    K* locked_key_ptr = locked_key_ptrs[kv_idx];\n    bool flag = true;\n    if (locked_key_ptr != nullptr) {\n      K locked_key = *locked_key_ptr;\n      K expected_key = static_cast<K>(LOCKED_KEY);\n      K key = keys[kv_idx];\n      if (locked_key == expected_key) {\n        *locked_key_ptr = key;\n      } else {\n        flag = false;\n      }\n    } else {\n      flag = false;\n    }\n    if (succeededs != nullptr) {\n      succeededs[kv_idx] = flag;\n    }\n  }\n}\n\ntemplate <typename K, typename V, typename S, typename Tidx, int TILE_SIZE = 8>\n__global__ void compact_key_value_score_kernel(\n    const bool* masks, size_t n, const Tidx* offsets,\n    K* __restrict const src_keys, V* __restrict const src_values,\n    S* __restrict const src_scores, K* __restrict dst_keys,\n    V* __restrict dst_values, S* __restrict dst_scores, const size_t dim) {\n  int tid = blockIdx.x * blockDim.x + threadIdx.x;\n  auto g = cg::tiled_partition<TILE_SIZE>(cg::this_thread_block());\n  int rank = g.thread_rank();\n\n  bool is_existed = false;\n  if (tid < n) {\n    if (masks[tid]) {\n      is_existed = true;\n    }\n  }\n  unsigned int vote = g.ballot(is_existed);\n  unsigned int r_vote = __brev(vote) >> (32 - TILE_SIZE);\n  K empty_key = (K)EMPTY_KEY;\n  Tidx bias;\n  if (is_existed) {\n    r_vote = r_vote >> (TILE_SIZE - 
rank - 1);\n    int prefix_n = __popc(r_vote) - 1;\n    bias = offsets[tid / TILE_SIZE] + static_cast<Tidx>(prefix_n);\n    dst_keys[bias] = src_keys[tid];\n    if (src_scores and dst_scores) dst_scores[bias] = src_scores[tid];\n  }\n\n  int group_offset = (tid / TILE_SIZE) * TILE_SIZE;\n  for (int i = 0; i < TILE_SIZE; i++) {\n    if (group_offset + i >= n) return;\n    auto cur_existed = g.shfl(is_existed, i);\n    if (cur_existed) {\n      auto cur_bias = g.shfl(bias, i);\n      for (size_t j = rank; j < dim; j += TILE_SIZE) {\n        dst_values[dim * cur_bias + j] =\n            src_values[dim * (group_offset + i) + j];\n      }\n    }\n  }\n}\n\ntemplate <typename K, typename V, typename S, int Strategy = -1>\n__global__ void lock_kernel_with_filter(\n    Bucket<K, V, S>* __restrict__ buckets, uint64_t const buckets_num,\n    uint32_t bucket_capacity, uint32_t const dim, K const* __restrict__ keys,\n    K** __restrict locked_keys_ptr, bool* __restrict succeed,\n    S const* __restrict__ scores, const S global_epoch, uint64_t n) {\n  using BUCKET = Bucket<K, V, S>;\n  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;\n  // Load `STRIDE` digests every time.\n  constexpr uint32_t STRIDE = sizeof(VecD_Load) / sizeof(D);\n\n  uint32_t tx = threadIdx.x;\n  uint32_t kv_idx = blockIdx.x * blockDim.x + tx;\n  K key{static_cast<K>(EMPTY_KEY)};\n  S score{static_cast<S>(EMPTY_SCORE)};\n  OccupyResult occupy_result{OccupyResult::INITIAL};\n  VecD_Comp target_digests{0};\n  K* bucket_keys_ptr{nullptr};\n  uint32_t key_pos = {0};\n  if (kv_idx < n) {\n    key = keys[kv_idx];\n    score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);\n    if (!IS_RESERVED_KEY<K>(key)) {\n      const K hashed_key = Murmur3HashDevice(key);\n      target_digests = digests_from_hashed<K>(hashed_key);\n      uint64_t global_idx =\n          static_cast<uint64_t>(hashed_key % (buckets_num * bucket_capacity));\n      key_pos = get_start_position(global_idx, 
bucket_capacity);\n      uint64_t bkt_idx = global_idx / bucket_capacity;\n      BUCKET* bucket = buckets + bkt_idx;\n      bucket_keys_ptr = reinterpret_cast<K*>(bucket->keys(0));\n    } else {\n      occupy_result = OccupyResult::ILLEGAL;\n      goto WRITE_BACK;\n    }\n  } else {\n    return;\n  }\n\n  // One more loop to handle empty keys.\n  for (int offset = 0; offset < bucket_capacity + STRIDE; offset += STRIDE) {\n    uint32_t pos_cur = align_to<STRIDE>(key_pos);\n    pos_cur = (pos_cur + offset) & (bucket_capacity - 1);\n\n    D* digests_ptr = BUCKET::digests(bucket_keys_ptr, bucket_capacity, pos_cur);\n    VecD_Load digests_vec = *(reinterpret_cast<VecD_Load*>(digests_ptr));\n    VecD_Comp digests_arr[4] = {digests_vec.x, digests_vec.y, digests_vec.z,\n                                digests_vec.w};\n\n    for (int i = 0; i < 4; i++) {\n      VecD_Comp probe_digests = digests_arr[i];\n      uint32_t possible_pos = 0;\n      // Perform a vectorized comparison by byte,\n      // and if they are equal, set the corresponding byte in the result to\n      // 0xff.\n      int cmp_result = __vcmpeq4(probe_digests, target_digests);\n      cmp_result &= 0x01010101;\n      do {\n        if (cmp_result == 0) break;\n        // CUDA uses little endian,\n        // and the lowest byte in register stores in the lowest address.\n        uint32_t index = (__ffs(cmp_result) - 1) >> 3;\n        cmp_result &= (cmp_result - 1);\n        possible_pos = pos_cur + i * 4 + index;\n        auto current_key = bucket_keys_ptr[possible_pos];\n        if (current_key == key) {\n          key_pos = possible_pos;\n          occupy_result = OccupyResult::DUPLICATE;\n          goto WRITE_BACK;\n        }\n      } while (true);\n      VecD_Comp empty_digests_ = empty_digests<K>();\n      cmp_result = __vcmpeq4(probe_digests, empty_digests_);\n      cmp_result &= 0x01010101;\n      do {\n        if (cmp_result == 0) break;\n        uint32_t index = (__ffs(cmp_result) - 1) >> 3;\n        
cmp_result &= (cmp_result - 1);\n        possible_pos = pos_cur + i * 4 + index;\n        if (offset == 0 && possible_pos < key_pos) continue;\n        auto current_key = bucket_keys_ptr[possible_pos];\n        if (current_key == static_cast<K>(EMPTY_KEY)) {\n          occupy_result = OccupyResult::OCCUPIED_EMPTY;\n          goto WRITE_BACK;\n        }\n      } while (true);\n    }\n  }\n\nWRITE_BACK:\n  bool found_ = occupy_result == OccupyResult::DUPLICATE;\n  if (found_) {\n    auto current_key = BUCKET::keys(bucket_keys_ptr, key_pos);\n    K expected_key = key;\n    // Modifications to the bucket will not happen before this instruction.\n    bool result = current_key->compare_exchange_strong(\n        expected_key, static_cast<K>(LOCKED_KEY),\n        cuda::std::memory_order_relaxed, cuda::std::memory_order_relaxed);\n    if (not result) {\n      found_ = false;\n    } else {\n      ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores, kv_idx,\n                                       score, bucket_capacity,\n                                       get_digest<K>(key), false);\n    }\n  }\n  if (found_) {\n    locked_keys_ptr[kv_idx] = bucket_keys_ptr + key_pos;\n  } else {\n    locked_keys_ptr[kv_idx] = nullptr;\n  }\n  if (succeed) {\n    succeed[kv_idx] = found_;\n  }\n}\n\ntemplate <typename KeyT, typename ValueT>\nstruct SortPairOp {\n  SortPairOp() : d_temp_storage(nullptr), temp_storage_bytes(0) {}\n\n  size_t get_storage_bytes(int batch, cudaStream_t stream) {\n    num_items = batch;\n    cub::DeviceRadixSort::SortPairs<KeyT, ValueT>(\n        d_temp_storage, temp_storage_bytes, nullptr, nullptr, nullptr, nullptr,\n        num_items, 0, sizeof(KeyT) * 8, stream);\n\n    return temp_storage_bytes;\n  }\n\n  void set_storage(void* storage) { d_temp_storage = storage; }\n\n  void sort(int batch, KeyT const* d_keys_in, KeyT* d_keys_out,\n            ValueT const* d_values_in, ValueT* d_values_out,\n            cudaStream_t stream) {\n    if (batch != 
num_items) {\n      throw std::runtime_error(\"Number of items is not matched when sort.\");\n    }\n    cub::DeviceRadixSort::SortPairs(\n        d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in,\n        d_values_out, num_items, 0, sizeof(KeyT) * 8, stream);\n  }\n\n  void* d_temp_storage{nullptr};\n  size_t temp_storage_bytes{0};\n  int num_items{0};\n};\n\ntemplate <typename InputT, typename OutputT>\nstruct SumOp {\n  using InputIteratorT = InputT const*;\n  using OutputIteratorT = OutputT*;\n  SumOp() : d_temp_storage(nullptr), temp_storage_bytes(0) {}\n\n  size_t get_storage_bytes(int batch, cudaStream_t stream) {\n    num_items = batch;\n    cub::DeviceReduce::Reduce<InputIteratorT, OutputIteratorT>(\n        d_temp_storage, temp_storage_bytes, nullptr, nullptr, num_items,\n        cuda::std::plus<>(), 0, stream);\n    return temp_storage_bytes;\n  }\n\n  void set_storage(void* storage) { d_temp_storage = storage; }\n\n  void sum(int batch, InputIteratorT d_in, OutputIteratorT d_out,\n           cudaStream_t stream) {\n    if (batch != num_items) {\n      throw std::runtime_error(\"Number of items is not matched when sum.\");\n    }\n    cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out,\n                              num_items, cuda::std::plus<>(), 0, stream);\n  }\n\n  void* d_temp_storage{nullptr};\n  size_t temp_storage_bytes{0};\n  int num_items{0};\n};\n\n}  // namespace merlin\n}  // namespace nv\n"
  },
  {
    "path": "include/merlin/debug.hpp",
    "content": "/*\n * Copyright (c) 2022, NVIDIA CORPORATION.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n#pragma once\n\n#include <cuda_runtime_api.h>\n#include <sstream>\n#include <stdexcept>\n#include <string>\n\nnamespace nv {\nnamespace merlin {\n\nclass CudaException : public std::runtime_error {\n public:\n  CudaException(const std::string& what) : runtime_error(what) {}\n};\n\ninline void cuda_check_(cudaError_t val, const char* file, int line) {\n  if (val != cudaSuccess) {\n    std::ostringstream os;\n    os << file << ':' << line << \": CUDA error \" << cudaGetErrorName(val)\n       << \" (#\" << val << \"): \" << cudaGetErrorString(val);\n    throw CudaException(os.str());\n  }\n}\n\n#ifdef CUDA_CHECK\n#error Unexpected redfinition of CUDA_CHECK! 
Something is wrong.\n#endif\n\n#define CUDA_CHECK(val)                                 \\\n  do {                                                  \\\n    nv::merlin::cuda_check_((val), __FILE__, __LINE__); \\\n  } while (0)\n\nclass MerlinException : public std::runtime_error {\n public:\n  MerlinException(const std::string& what) : runtime_error(what) {}\n};\n\ntemplate <class Msg>\ninline void merlin_check_(bool cond, const Msg& msg, const char* file,\n                          int line) {\n  if (!cond) {\n    std::ostringstream os;\n    os << file << ':' << line << \": HierarchicalKV error \" << msg;\n    throw MerlinException(os.str());\n  }\n}\n\n#ifdef MERLIN_CHECK\n#error Unexpected redfinition of MERLIN_CHECK! Something is wrong.\n#endif\n\n#define MERLIN_CHECK(cond, msg)                                   \\\n  do {                                                            \\\n    nv::merlin::merlin_check_((cond), (msg), __FILE__, __LINE__); \\\n  } while (0)\n\n}  // namespace merlin\n}  // namespace nv\n"
  },
  {
    "path": "include/merlin/flexible_buffer.cuh",
    "content": "/*\n * Copyright (c) 2022, NVIDIA CORPORATION.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n#pragma once\n\n#include <iostream>\n#include \"utils.cuh\"\n\nusing std::cerr;\nusing std::endl;\n\nnamespace nv {\nnamespace merlin {\n\ntemplate <class T>\nclass FlexPinnedBuffer {\n public:\n  FlexPinnedBuffer(const size_t size = 1) : ptr_(nullptr) {\n    if (!ptr_) {\n      size_ = size;\n      CUDA_CHECK(cudaMallocHost(&ptr_, sizeof(T) * size_));\n    }\n  }\n  ~FlexPinnedBuffer() {\n    try {\n      if (ptr_) CUDA_CHECK(cudaFreeHost(ptr_));\n    } catch (const nv::merlin::CudaException& e) {\n      cerr << \"[HierarchicalKV] Failed to free FlexPinnedBuffer!\" << endl;\n    }\n  }\n\n  __inline__ T* alloc_or_reuse(const size_t size = 0) {\n    if (size > size_) {\n      CUDA_CHECK(cudaFreeHost(ptr_));\n      size_ = size;\n      CUDA_CHECK(cudaMallocHost(&ptr_, sizeof(T) * size_));\n    }\n    return ptr_;\n  }\n\n private:\n  T* ptr_;\n  size_t size_;\n};\n\n}  // namespace merlin\n}  // namespace nv"
  },
  {
    "path": "include/merlin/group_lock.cuh",
    "content": "/*\n * Copyright (c) 2023, NVIDIA CORPORATION.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http:///www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n#pragma once\n#include <atomic>\n#include <cassert>\n#include <mutex>\n#include <system_error>\n#include <thread>\n#include \"core_kernels/group_lock_kernels.cuh\"\n#include \"utils.cuh\"\n\nnamespace nv {\nnamespace merlin {\n\n/*\n * Thread-local cached CUDA stream for lock acquisition kernels.\n * Eliminates per-call cudaStreamCreate/Destroy overhead which causes\n * CUDA driver contention when multiple threads acquire locks concurrently.\n */\ninline cudaStream_t get_lock_stream() {\n  thread_local cudaStream_t stream = [] {\n    cudaStream_t s;\n    CUDA_CHECK(cudaStreamCreate(&s));\n    return s;\n  }();\n  return stream;\n}\n\n/*\n * Implementing a triple-group, mutex and relative lock guard for better E2E\n * performance:\n * - There are three roles: `inserter`, `updater`, and `reader`.\n * - Allow only one inserter to be executed concurrently.  (like\n * `insert_or_assign` 'insert_and_evict`, `find_or_insert` etc.).\n * - Allow multiple updaters to be executed concurrently. (like `assign`, etc.)\n * The CUDA kernels guarantee the data consistency in this situation.\n * - Allow multiple readers to be executed concurrently. 
(like `find`, `size`,\n * etc.)\n * - Inserters, readers and updaters are not allowed to run concurrently with\n * each other.\n * - The `update_read_lock` is exclusive and used for special APIs (like\n * `reserve`, `erase`, `clear`, etc.)\n */\nclass group_shared_mutex {\n public:\n  group_shared_mutex(const group_shared_mutex&) = delete;\n  group_shared_mutex& operator=(const group_shared_mutex&) = delete;\n\n  group_shared_mutex() noexcept\n      : h_update_count_(0), h_read_count_(0), h_unique_flag_(false) {\n    CUDA_CHECK(\n        cudaMalloc(&d_update_count_,\n                   sizeof(cuda::atomic<int, cuda::thread_scope_device>)));\n    CUDA_CHECK(cudaMalloc(\n        &d_read_count_, sizeof(cuda::atomic<int, cuda::thread_scope_device>)));\n    CUDA_CHECK(\n        cudaMalloc(&d_unique_flag_,\n                   sizeof(cuda::atomic<bool, cuda::thread_scope_device>)));\n    group_lock::init_kernel<<<1, 1, 0>>>(d_update_count_, d_read_count_,\n                                         d_unique_flag_);\n    CUDA_CHECK(cudaDeviceSynchronize());\n  }\n\n  ~group_shared_mutex() noexcept {\n    CUDA_CHECK(cudaDeviceSynchronize());\n    CUDA_CHECK(cudaFree(d_update_count_));\n    CUDA_CHECK(cudaFree(d_read_count_));\n    CUDA_CHECK(cudaFree(d_unique_flag_));\n  }\n\n  void lock_read() {\n    for (;;) {\n      while (h_update_count_.load(std::memory_order_acquire)) {\n      }\n      h_read_count_.fetch_add(1, std::memory_order_acq_rel);\n      if (h_update_count_.load(std::memory_order_acquire) == 0) {\n        {\n          cudaStream_t stream = get_lock_stream();\n          group_lock::lock_read_kernel<<<1, 1, 0, stream>>>(d_update_count_,\n                                                            d_read_count_);\n          CUDA_CHECK(cudaStreamSynchronize(stream));\n        }\n        break;\n      }\n      h_read_count_.fetch_sub(1, std::memory_order_acq_rel);\n    }\n  }\n\n  void unlock_read(cudaStream_t stream) {\n    { group_lock::unlock_read_kernel<<<1, 1, 0, stream>>>(d_read_count_); }\n  
  h_read_count_.fetch_sub(1, std::memory_order_release);\n  }\n\n  void lock_update() {\n    for (;;) {\n      while (h_read_count_.load(std::memory_order_acquire)) {\n      }\n      h_update_count_.fetch_add(1, std::memory_order_acq_rel);\n      if (h_read_count_.load(std::memory_order_acquire) == 0) {\n        {\n          cudaStream_t stream = get_lock_stream();\n          group_lock::lock_update_kernel<<<1, 1, 0, stream>>>(d_update_count_,\n                                                              d_read_count_);\n          CUDA_CHECK(cudaStreamSynchronize(stream));\n        }\n        break;\n      }\n      h_update_count_.fetch_sub(1, std::memory_order_acq_rel);\n    }\n  }\n\n  void unlock_update(cudaStream_t stream) {\n    { group_lock::unlock_update_kernel<<<1, 1, 0, stream>>>(d_update_count_); }\n    h_update_count_.fetch_sub(1, std::memory_order_release);\n  }\n\n  void lock_update_read() {\n    /* Lock unique flag */\n    bool expected = false;\n    while (!h_unique_flag_.compare_exchange_weak(expected, true,\n                                                 std::memory_order_acq_rel)) {\n      expected = false;\n    }\n\n    /* Ban update */\n    for (;;) {\n      while (h_update_count_.load(std::memory_order_acquire)) {\n      }\n      h_read_count_.fetch_add(1, std::memory_order_acq_rel);\n      if (h_update_count_.load(std::memory_order_acquire) == 0) {\n        break;\n      }\n      h_read_count_.fetch_sub(1, std::memory_order_acq_rel);\n    }\n\n    /* Ban read */\n    for (;;) {\n      while (h_read_count_.load(std::memory_order_acquire) > 1) {\n      }\n      h_update_count_.fetch_add(1, std::memory_order_acq_rel);\n      if (h_read_count_.load(std::memory_order_acquire) == 1) {\n        break;\n      }\n      h_update_count_.fetch_sub(1, std::memory_order_acq_rel);\n    }\n\n    {\n      cudaStream_t stream = get_lock_stream();\n      group_lock::lock_update_read_kernel<<<1, 1, 0, stream>>>(\n          d_update_count_, d_read_count_, 
d_unique_flag_);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n    }\n  }\n\n  void unlock_update_read(cudaStream_t stream) {\n    {\n      group_lock::unlock_update_read_kernel<<<1, 1, 0, stream>>>(\n          d_update_count_, d_read_count_, d_unique_flag_);\n    }\n    h_read_count_.fetch_sub(1, std::memory_order_release);\n    h_update_count_.fetch_sub(1, std::memory_order_release);\n    h_unique_flag_.store(false, std::memory_order_release);\n  }\n\n  int update_count() noexcept {\n    int count = 0;\n    int* d_count;\n    cudaStream_t stream;\n    CUDA_CHECK(cudaStreamCreate(&stream));\n    CUDA_CHECK(cudaMalloc(&d_count, sizeof(int)));\n    group_lock::update_count_kernel<<<1, 1, 0, stream>>>(d_count,\n                                                         d_update_count_);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    CUDA_CHECK(cudaMemcpy(&count, d_count, sizeof(int), cudaMemcpyDefault));\n    CUDA_CHECK(cudaFree(d_count));\n    CUDA_CHECK(cudaStreamDestroy(stream));\n    return count;\n  }\n\n  int read_count() noexcept {\n    int count = 0;\n    int* d_count;\n    cudaStream_t stream;\n    CUDA_CHECK(cudaStreamCreate(&stream));\n    CUDA_CHECK(cudaMalloc(&d_count, sizeof(int)));\n    group_lock::read_count_kernel<<<1, 1, 0, stream>>>(d_count, d_read_count_);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    CUDA_CHECK(cudaMemcpy(&count, d_count, sizeof(int), cudaMemcpyDefault));\n    CUDA_CHECK(cudaFree(d_count));\n    CUDA_CHECK(cudaStreamDestroy(stream));\n    return count;\n  }\n\n private:\n  std::atomic<int> h_update_count_;\n  std::atomic<int> h_read_count_;\n  std::atomic<bool> h_unique_flag_;\n\n  cuda::atomic<int, cuda::thread_scope_device>* d_update_count_;\n  cuda::atomic<int, cuda::thread_scope_device>* d_read_count_;\n  cuda::atomic<bool, cuda::thread_scope_device>* d_unique_flag_;\n};\n\nclass read_shared_lock {\n public:\n  read_shared_lock(const read_shared_lock&) = delete;\n  read_shared_lock(read_shared_lock&&) = 
delete;\n\n  read_shared_lock& operator=(const read_shared_lock&) = delete;\n  read_shared_lock& operator=(read_shared_lock&&) = delete;\n\n  explicit read_shared_lock(group_shared_mutex& mutex, cudaStream_t stream = 0)\n      : mutex_(&mutex) {\n    mutex_->lock_read();\n    owns_ = true;\n    stream_ = stream;\n  }\n\n  explicit read_shared_lock(group_shared_mutex& mutex, std::defer_lock_t,\n                            cudaStream_t stream = 0)\n      : mutex_(&mutex), stream_(stream), owns_(false) {}\n\n  ~read_shared_lock() {\n    if (owns_) {\n      mutex_->unlock_read(stream_);\n    }\n  }\n\n  void lock() noexcept {\n    if (!owns_) {\n      mutex_->lock_read();\n      owns_ = true;\n    }\n  }\n\n  bool owns_lock() const noexcept { return owns_; }\n\n private:\n  group_shared_mutex* const mutex_;\n  bool owns_;\n  cudaStream_t stream_;\n};\n\nclass update_shared_lock {\n public:\n  update_shared_lock(const update_shared_lock&) = delete;\n  update_shared_lock(update_shared_lock&&) = delete;\n\n  update_shared_lock& operator=(const update_shared_lock&) = delete;\n  update_shared_lock& operator=(update_shared_lock&&) = delete;\n\n  explicit update_shared_lock(group_shared_mutex& mutex,\n                              cudaStream_t stream = 0)\n      : mutex_(&mutex) {\n    mutex_->lock_update();\n    owns_ = true;\n    stream_ = stream;\n  }\n\n  explicit update_shared_lock(group_shared_mutex& mutex, std::defer_lock_t,\n                              cudaStream_t stream = 0)\n      : mutex_(&mutex), stream_(stream), owns_(false) {}\n\n  ~update_shared_lock() {\n    if (owns_) {\n      mutex_->unlock_update(stream_);\n    }\n  }\n\n  void lock() noexcept {\n    if (!owns_) {\n      mutex_->lock_update();\n      owns_ = true;\n    }\n  }\n\n  bool owns_lock() const noexcept { return owns_; }\n\n private:\n  group_shared_mutex* const mutex_;\n  bool owns_;\n  cudaStream_t stream_;\n};\n\nclass update_read_lock {\n public:\n  update_read_lock(const update_read_lock&) 
= delete;\n  update_read_lock(update_read_lock&&) = delete;\n\n  update_read_lock& operator=(const update_read_lock&) = delete;\n  update_read_lock& operator=(update_read_lock&&) = delete;\n\n  explicit update_read_lock(group_shared_mutex& mutex, cudaStream_t stream = 0)\n      : mutex_(&mutex) {\n    mutex_->lock_update_read();\n    owns_ = true;\n    stream_ = stream;\n  }\n\n  explicit update_read_lock(group_shared_mutex& mutex, std::defer_lock_t,\n                            cudaStream_t stream = 0) noexcept\n      : mutex_(&mutex), stream_(stream), owns_(false) {}\n\n  ~update_read_lock() {\n    if (owns_) {\n      mutex_->unlock_update_read(stream_);\n    }\n  }\n\n  void lock() {\n    assert(!owns_ && \"[update_read_lock] trying to lock twice!\");\n    mutex_->lock_update_read();\n    owns_ = true;\n  }\n\n  bool owns_lock() const noexcept { return owns_; }\n\n private:\n  group_shared_mutex* const mutex_;\n  bool owns_;\n  cudaStream_t stream_;\n};\n\nusing insert_unique_lock = update_read_lock;\n\n}  // namespace merlin\n}  // namespace nv"
  },
  {
    "path": "include/merlin/memory_pool.cuh",
    "content": "/*\n * Copyright (c) 2022, NVIDIA CORPORATION.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n#pragma once\n\n#include <algorithm>\n#include <array>\n#include <functional>\n#include <iostream>\n#include <memory>\n#include <mutex>\n#include <thread>\n#include <vector>\n#include \"allocator.cuh\"\n#include \"debug.hpp\"\n\nnamespace nv {\nnamespace merlin {\n\n/**\n * Allocators are used by the memory pool (and maybe other classes) to create\n * RAII complient containers for buffers allocated in different memory areas.\n */\ntemplate <class T, class Allocator>\nstruct AllocatorBase {\n  using type = T;\n  using sync_unique_ptr = std::unique_ptr<type, std::function<void(type*)>>;\n  using async_unique_ptr = std::unique_ptr<type, std::function<void(type*)>>;\n  using shared_ptr = std::shared_ptr<type>;\n\n  inline static sync_unique_ptr make_unique(size_t n,\n                                            BaseAllocator* allocator) {\n    return {Allocator::alloc(n, allocator),\n            [allocator](type* p) { Allocator::free(p, allocator); }};\n  }\n\n  inline static async_unique_ptr make_unique(size_t n, BaseAllocator* allocator,\n                                             cudaStream_t stream) {\n    return {Allocator::alloc(n, allocator, stream),\n            [stream, allocator](type* p) { Allocator::free(p, allocator); }};\n  }\n\n  inline static shared_ptr make_shared(size_t n, BaseAllocator* allocator,\n                   
                    cudaStream_t stream = 0) {\n    return {Allocator::alloc(n, allocator, stream),\n            [stream, allocator](type* p) {\n              Allocator::free(p, allocator, stream);\n            }};\n  }\n};\n\n/**\n * Trivial fallback implementation using the standard C++ allocator. This mostly\n * exists to ensure interface correctness, and as an illustration of what a\n * proper allocator implementation should look like.\n */\ntemplate <class T>\nstruct StandardAllocator final : AllocatorBase<T, StandardAllocator<T>> {\n  using type = typename AllocatorBase<T, StandardAllocator<T>>::type;\n\n  static constexpr const char* name{\"StandardAllocator\"};\n\n  inline static type* alloc(size_t n, BaseAllocator* allocator,\n                            cudaStream_t stream = 0) {\n    type* ptr;\n    allocator->alloc(MemoryType::Host, (void**)&ptr, n * sizeof(T));\n    return ptr;\n  }\n\n  inline static void free(type* ptr, BaseAllocator* allocator,\n                          cudaStream_t stream = 0) {\n    allocator->free(MemoryType::Host, ptr);\n  }\n};\n\n/**\n * Claim/release buffers in pinned host memory.\n */\ntemplate <class T>\nstruct HostAllocator final : AllocatorBase<T, HostAllocator<T>> {\n  using type = typename AllocatorBase<T, HostAllocator<T>>::type;\n\n  static constexpr const char* name{\"HostAllocator\"};\n\n  inline static type* alloc(size_t n, BaseAllocator* allocator,\n                            cudaStream_t stream = 0) {\n    void* ptr;\n    allocator->alloc(MemoryType::Pinned, (void**)&ptr, n * sizeof(T));\n    return reinterpret_cast<type*>(ptr);\n  }\n\n  inline static void free(type* ptr, BaseAllocator* allocator,\n                          cudaStream_t stream = 0) {\n    allocator->free(MemoryType::Pinned, ptr);\n  }\n};\n\n/**\n * Claim/release buffers in the active CUDA device. 
Will not test if the correct\n * device was used, and throw if CUDA runtime API response is negative.\n */\ntemplate <class T>\nstruct DeviceAllocator final : AllocatorBase<T, DeviceAllocator<T>> {\n  using type = typename AllocatorBase<T, DeviceAllocator<T>>::type;\n\n  static constexpr const char* name{\"DeviceAllocator\"};\n\n  inline static type* alloc(size_t n, BaseAllocator* allocator,\n                            cudaStream_t stream = 0) {\n    void* ptr;\n\n    allocator->alloc_async(MemoryType::Device, (void**)&ptr, n * sizeof(T),\n                           stream);\n    return reinterpret_cast<type*>(ptr);\n  }\n\n  inline static void free(type* ptr, BaseAllocator* allocator,\n                          cudaStream_t stream = 0) {\n    allocator->free_async(MemoryType::Device, ptr, stream);\n  }\n};\n\n/**\n * Helper structure to configure a memory pool.\n */\nstruct MemoryPoolOptions {\n  size_t max_stock{4};     ///< Amount of buffers to keep in reserve.\n  size_t max_pending{16};  ///< Maximum amount of awaitable buffers. If this\n                           ///< limit is exceeded threads will start to block.\n};\n\n/**\n * Forward declares required to make templated ostream overload work.\n */\ntemplate <class Allocator>\nclass MemoryPool;\n\ntemplate <class Allocator>\nstd::ostream& operator<<(std::ostream&, const MemoryPool<Allocator>&);\n\n/**\n * CUDA deferred execution aware memory pool implementation. As for every memory\n * pool, the general idea is to have reusable buffers. 
All buffers have the same\n * size.\n *\n * General behavior:\n *\n * This memory pool implementation attempts to avoid blocking before the fact,\n * but also avoids relying on a background worker.\n *\n * Buffer borrow and return semantics tightly align with C++ RAII principles.\n * That is, if a workspace is requested, any borrowed buffers will be returned\n * automatically when leaving the scope.\n *\n * You can either borrow a single buffer, or a workspace (that is multiple\n * buffers). We support dynamic and static workspaces. Static workspaces have\n * the benefit that they will never require heap memory (no hidden allocations).\n *\n *\n * Buffer borrowing:\n *\n * If buffers are requested, we take them from the stock, if available. If the\n * stock is depleted, we check if any pending buffer has been used up by the GPU\n * and add them to the stock. If that was also not successful, we allocate a new\n * buffer. Buffers or workspaces (groups of buffers).\n *\n * When borrowing a buffer a streaming context can be specified. This context is\n * relevant for allocation and during returns. It is assumed that the stream you\n * provide as context will be the stream where you queue the workload. Not doing\n * so may lead to undefined behavior.\n *\n * Buffer return:\n *\n * If no context is provided, we cannot make any assumptions regarding the usage\n * on the device. So we synchronize the device first and then return the buffer\n * to the stock. If a streaming context was provided, we queue an event and add\n * the buffer to the `pending` pool. That means, the buffer has been\n * relinquished by the CPU, but may still be used by the GPU. If no pending\n * slot is available, we probe the currently pending buffers' events for\n * completion. Completed pending buffers are returned to the reserve. If so, we\n * queue the buffer in the freed slot. 
If that was unsuccessful (i.e., all\n * currently pending buffers are still in use by the GPU), we have no choice but\n * to free the buffer using the current stream.\n *\n * In either case, `max_stock` represents the maximum size of the stock. If\n * returning a buffer would lead to the stock exceeding this quantity, the\n * buffer is queued for destruction.\n */\ntemplate <class Allocator>\nclass MemoryPool final {\n public:\n  using pool_type = MemoryPool<Allocator>;\n  using alloc_type = typename Allocator::type;\n  template <class Container>\n  class Workspace {\n   public:\n    inline Workspace() : pool_{nullptr}, buffer_size_{0}, stream_{0} {}\n\n    inline Workspace(pool_type* pool, cudaStream_t stream)\n        : pool_{pool}, buffer_size_{0}, stream_{stream} {}\n\n    Workspace(const Workspace&) = delete;\n\n    Workspace& operator=(const Workspace&) = delete;\n\n    inline Workspace(Workspace&& other)\n        : pool_{other.pool_},\n          buffer_size_{other.buffer_size_},\n          stream_{other.stream_},\n          buffers_{std::move(other.buffers_)} {}\n\n    inline Workspace& operator=(Workspace&& other) {\n      if (pool_) {\n        pool_->put_raw(buffers_.begin(), buffers_.end(), buffer_size_, stream_);\n      }\n      pool_ = other.pool_;\n      buffer_size_ = other.buffer_size_;\n      stream_ = other.stream_;\n      buffers_ = std::move(other.buffers_);\n      other.pool_ = nullptr;\n      return *this;\n    }\n\n    inline ~Workspace() {\n      if (pool_) {\n        pool_->put_raw(buffers_.begin(), buffers_.end(), buffer_size_, stream_);\n      }\n    }\n\n    template <class T>\n    constexpr void at(const size_t n, T* ptr) const {\n      *ptr = at<T>(n);\n    }\n\n    template <class T>\n    constexpr T at(const size_t n) const {\n      return reinterpret_cast<T>(buffers_.at(n));\n    }\n\n    template <class T>\n    constexpr void get(const size_t n, T* ptr) const {\n      *ptr = get<T>(n);\n    }\n\n    template <class T>\n    constexpr 
T get(const size_t n) const {\n      return reinterpret_cast<T>(buffers_[n]);\n    }\n\n    constexpr alloc_type* operator[](const size_t n) const {\n      return buffers_[n];\n    }\n\n   protected:\n    pool_type* pool_;\n    size_t buffer_size_;\n    cudaStream_t stream_;\n    Container buffers_;\n  };\n\n  template <size_t N>\n  class StaticWorkspace final : public Workspace<std::array<alloc_type*, N>> {\n   public:\n    using base_type = Workspace<std::array<alloc_type*, N>>;\n\n    friend class MemoryPool<Allocator>;\n\n    inline StaticWorkspace() : base_type() {}\n\n    StaticWorkspace(const StaticWorkspace&) = delete;\n\n    StaticWorkspace& operator=(const StaticWorkspace&) = delete;\n\n    inline StaticWorkspace(StaticWorkspace&& other)\n        : base_type(std::move(other)) {}\n\n    inline StaticWorkspace& operator=(StaticWorkspace&& other) {\n      base_type::operator=(std::move(other));\n      return *this;\n    }\n\n   private:\n    inline StaticWorkspace(pool_type* pool, size_t requested_buffer_size,\n                           cudaStream_t stream)\n        : base_type(pool, stream) {\n      auto& buffers = this->buffers_;\n      this->buffer_size_ = pool->get_raw(buffers.begin(), buffers.end(),\n                                         requested_buffer_size, stream);\n    }\n  };\n\n  class DynamicWorkspace final : public Workspace<std::vector<alloc_type*>> {\n   public:\n    using base_type = Workspace<std::vector<alloc_type*>>;\n\n    friend class MemoryPool<Allocator>;\n\n    inline DynamicWorkspace() : base_type() {}\n\n    DynamicWorkspace(const DynamicWorkspace&) = delete;\n\n    DynamicWorkspace& operator=(const DynamicWorkspace&) = delete;\n\n    inline DynamicWorkspace(DynamicWorkspace&& other)\n        : base_type(std::move(other)) {}\n\n    inline DynamicWorkspace& operator=(DynamicWorkspace&& other) {\n      base_type::operator=(std::move(other));\n      return *this;\n    }\n\n   private:\n    inline DynamicWorkspace(pool_type* pool, 
size_t n,\n                            size_t requested_buffer_size, cudaStream_t stream)\n        : base_type(pool, stream) {\n      auto& buffers = this->buffers_;\n      buffers.resize(n);\n      this->buffer_size_ = pool->get_raw(buffers.begin(), buffers.end(),\n                                         requested_buffer_size, stream);\n    }\n  };\n\n  MemoryPool(const MemoryPoolOptions& options, BaseAllocator* allocator)\n      : options_{options}, allocator_{allocator} {\n    // Create initial buffer stock.\n    stock_.reserve(options_.max_stock);\n\n    // Create enough events, so we have one per potentially pending buffer.\n    ready_events_.resize(options_.max_pending);\n    for (auto& ready_event : ready_events_) {\n      CUDA_CHECK(cudaEventCreate(&ready_event));\n    }\n\n    // Preallocate pending.\n    pending_.reserve(options_.max_pending);\n  }\n\n  ~MemoryPool() {\n    // Make sure all queued tasks are complete.\n    await_pending();\n\n    // Free event and buffer memory.\n    for (auto& ready_event : ready_events_) {\n      CUDA_CHECK(cudaEventDestroy(ready_event));\n    }\n\n    // Any remaining buffers need to be properly unallocated.\n    deplete_stock();\n  }\n\n  inline size_t buffer_size() const { return buffer_size_; }\n\n  inline size_t max_batch_size(size_t max_item_size) const {\n    return buffer_size_ / max_item_size;\n  }\n\n  template <class T>\n  inline size_t max_batch_size() const {\n    return max_batch_size(sizeof(T));\n  }\n\n  size_t current_stock() const {\n    std::lock_guard<std::mutex> lock(mutex_);\n    return stock_.size();\n  }\n\n  size_t num_pending() const {\n    std::lock_guard<std::mutex> lock(mutex_);\n    return pending_.size();\n  }\n\n  void await_pending(cudaStream_t stream = 0) {\n    std::lock_guard<std::mutex> lock(mutex_);\n    while (!pending_.empty()) {\n      collect_pending_unsafe(stream);\n      if (pending_.empty()) {\n        break;\n      }\n      std::this_thread::yield();\n    }\n  }\n\n  void 
deplete_stock() {\n    std::lock_guard<std::mutex> lock(mutex_);\n    for (auto& ptr : stock_) {\n      Allocator::free(ptr, allocator_);\n    }\n    stock_.clear();\n  }\n\n  inline std::unique_ptr<alloc_type, std::function<void(alloc_type*)>>\n  get_unique(size_t requested_buffer_size, cudaStream_t stream = 0) {\n    alloc_type* ptr;\n    const size_t allocation_size =\n        get_raw(&ptr, (&ptr) + 1, requested_buffer_size, stream);\n    return {ptr, [this, allocation_size, stream](alloc_type* p) {\n              put_raw(&p, (&p) + 1, allocation_size, stream);\n            }};\n  }\n\n  inline std::shared_ptr<alloc_type> get_shared(size_t requested_buffer_size,\n                                                cudaStream_t stream = 0) {\n    alloc_type* ptr;\n    const size_t allocation_size =\n        get_raw(&ptr, (&ptr) + 1, requested_buffer_size, stream);\n    return {ptr, [this, allocation_size, stream](alloc_type* p) {\n              put_raw(&p, (&p) + 1, allocation_size, stream);\n            }};\n  }\n\n  template <size_t N>\n  inline StaticWorkspace<N> get_workspace(size_t requested_buffer_size,\n                                          cudaStream_t stream = 0) {\n    return {this, requested_buffer_size, stream};\n  }\n\n  inline DynamicWorkspace get_workspace(size_t n, size_t requested_buffer_size,\n                                        cudaStream_t stream = 0) {\n    return {this, n, requested_buffer_size, stream};\n  }\n\n  friend std::ostream& operator<< <Allocator>(std::ostream&, const MemoryPool&);\n\n private:\n  inline void collect_pending_unsafe(cudaStream_t stream) {\n    auto it{std::remove_if(\n        pending_.begin(), pending_.end(), [this, stream](const auto& pending) {\n          const cudaError_t state{cudaEventQuery(std::get<2>(pending))};\n          switch (state) {\n            case cudaSuccess:\n              // Stock buffers and destroy those that are no\n              // longer needed, but only if the allocation_size\n          
    // is still the same as the current buffer_size.\n              if (stock_.size() < options_.max_stock &&\n                  std::get<1>(pending) == buffer_size_) {\n                stock_.emplace_back(std::get<0>(pending));\n              } else {\n                Allocator::free(std::get<0>(pending), allocator_, stream);\n              }\n              ready_events_.emplace_back(std::get<2>(pending));\n              return true;\n            case cudaErrorNotReady:\n              return false;\n            default:\n              CUDA_CHECK(state);\n              return false;\n          }\n        })};\n    pending_.erase(it, pending_.end());\n  }\n\n  inline void clear_stock_unsafe(cudaStream_t stream) {\n    for (auto& ptr : stock_) {\n      Allocator::free(ptr, allocator_, stream);\n    }\n    stock_.clear();\n  }\n\n  template <class Iterator>\n  inline size_t get_raw(Iterator first, Iterator const last,\n                        size_t requested_buffer_size, cudaStream_t stream) {\n    // Get pre-allocated buffers if stock available.\n    size_t allocation_size;\n    {\n      std::lock_guard<std::mutex> lock(mutex_);\n\n      // If requested_buffer_size is within current buffer_size margins can\n      // reuse current buffers.\n      if (requested_buffer_size <= buffer_size_) {\n        while (first != last) {\n          // If no buffers available, try to make some available.\n          if (stock_.empty()) {\n            collect_pending_unsafe(stream);\n            if (stock_.empty()) {\n              // No buffers available.\n              break;\n            }\n          }\n\n          // Just take the next available buffer.\n          *first++ = stock_.back();\n          stock_.pop_back();\n        }\n      } else {\n        // Drop the stock because we need more memory and those buffers have\n        // become useless to that end.\n        clear_stock_unsafe(stream);\n        buffer_size_ = requested_buffer_size;\n      }\n\n      allocation_size = 
buffer_size_;\n    }\n\n    // Forge new buffers until request can be filled.\n    for (; first != last; ++first) {\n      *first = Allocator::alloc(allocation_size, allocator_, stream);\n    }\n\n    return allocation_size;\n  }\n\n  template <class Iterator>\n  inline void put_raw(Iterator first, Iterator const last,\n                      size_t allocation_size, cudaStream_t stream) {\n    std::lock_guard<std::mutex> lock(mutex_);\n\n    // If allocation_size of the workspace differs from the current buffer_size\n    // (i.e., somebody else requested a larger buffer since the original request\n    // occured), the provided buffers are incompatible and have to be discarded.\n    if (allocation_size != buffer_size_) {\n      while (first != last) {\n        Allocator::free(*first++, allocator_);\n      }\n      return;\n    }\n\n    // If the workspace that borrowed a stream was moved out of the RAII scope\n    // where it was created, it could happen that the stream was destroyed when\n    // we return the buffer ownership. This will prevent that.\n    //\n    // Note that `cudaStreamQuery` isn't designed to track stream destruction.\n    // This check is a last resort, and may not work reliably. 
The recommended\n    // best practice is to simply ensure streams you use are alive and well.\n    if (cudaStreamQuery(stream) != cudaErrorInvalidResourceHandle) {\n      for (; first != last; ++first) {\n        // Avoid adding already deallocated buffers.\n        if (*first == nullptr) {\n          continue;\n        }\n\n        // Spin lock if too many pending buffers (i.e., let CPU wait for GPU).\n        while (ready_events_.empty()) {\n          collect_pending_unsafe(stream);\n          if (!ready_events_.empty()) {\n            break;\n          }\n          std::this_thread::yield();\n        }\n\n        // Queue buffer.\n        cudaEvent_t ready_event{ready_events_.back()};\n        ready_events_.pop_back();\n        CUDA_CHECK(cudaEventRecord(ready_event, stream));\n        pending_.emplace_back(*first, allocation_size, ready_event);\n      }\n    } else {\n      // Without stream context, we must force a hard sync with the GPU.\n      CUDA_CHECK(cudaDeviceSynchronize());\n\n      for (; first != last; ++first) {\n        // Avoid adding already deallocated buffers.\n        if (*first == nullptr) {\n          continue;\n        }\n\n        // Stock buffers and destroy those that are no longer needed.\n        if (stock_.size() < options_.max_stock) {\n          stock_.emplace_back(*first);\n        } else {\n          Allocator::free(*first, allocator_);\n        }\n      }\n    }\n  }\n\n  const MemoryPoolOptions options_;\n\n  mutable std::mutex mutex_;\n  size_t buffer_size_{1};\n  std::vector<alloc_type*> stock_;\n  std::vector<cudaEvent_t> ready_events_;\n\n  std::vector<std::tuple<alloc_type*, size_t, cudaEvent_t>> pending_;\n  BaseAllocator* allocator_;\n};\n\ntemplate <class Allocator>\nstd::ostream& operator<<(std::ostream& os, const MemoryPool<Allocator>& pool) {\n  std::lock_guard<std::mutex> lock(pool.mutex_);\n\n  for (size_t i{0}; i < 80; ++i) {\n    os << '-';\n  }\n\n  // Current stock.\n  os << \"\\nStock =\\n\";\n  for (size_t 
i{0}; i < pool.stock_.size(); ++i) {\n    os << \"[ \" << i << \" ] buffer \" << static_cast<void*>(pool.stock_[i])\n       << \", size = \" << pool.buffer_size_ << '\\n';\n  }\n\n  // Pending buffers.\n  os << \"\\nPending =\\n\";\n  for (size_t i{0}; i < pool.pending_.size(); ++i) {\n    os << \"[ \" << i\n       << \" ] buffer = \" << static_cast<void*>(std::get<0>(pool.pending_[i]))\n       << \", size = \" << std::get<1>(pool.pending_[i]) << \", ready_event = \"\n       << static_cast<void*>(std::get<2>(pool.pending_[i])) << '\\n';\n  }\n\n  // Available ready events.\n  os << \"\\nReady Events =\\n\";\n  for (size_t i{0}; i < pool.ready_events_.size(); ++i) {\n    os << \"[ \" << i << \" ] \" << static_cast<void*>(pool.ready_events_[i])\n       << '\\n';\n  }\n\n  for (size_t i{0}; i < 80; ++i) {\n    os << '-';\n  }\n\n  os << '\\n';\n  return os;\n}\n\n}  // namespace merlin\n}  // namespace nv\n"
  },
  {
    "path": "include/merlin/multi_vector.hpp",
    "content": "/*\n * Copyright (c) 2025, NVIDIA CORPORATION.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n#pragma once\n\n#include <array>\n#include <cstddef>\n#include <cstdint>\n#include <initializer_list>\n#include <tuple>\n#include <type_traits>\n\nnamespace nv {\nnamespace merlin {\n\n/*\nMultiVector supports:\n\n1.Different types (any T1, T2, ...)\n\n2.Each block of memory is 16-byte aligned\n\n3.The first address of the i-th element can be retrieved using get<i>() (a\npointer of the correct type)\n\n4.The total size of the entire multivector can be obtained\n\n5.Large blocks of memory are allocated at once, with manual internal\npartitioning (to improve memory locality)\n*/\ntemplate <typename... Ts>\nclass MultiVector {\n public:\n  static constexpr size_t Alignment = 16;\n\n  template <typename... Lens, typename = typename std::enable_if<\n                                  sizeof...(Lens) == sizeof...(Ts)>::type>\n  explicit MultiVector(Lens... 
lens) {\n    size_t tmp[] = {static_cast<size_t>(lens)...};\n    for (size_t i = 0; i < sizeof...(Ts); ++i) {\n      lengths_[i] = tmp[i];\n    }\n    compute_offsets();\n  }\n\n  ~MultiVector() {}\n\n  template <size_t I>\n  auto get(uint8_t* data) {\n    using T = typename std::tuple_element<I, std::tuple<Ts...>>::type;\n    return reinterpret_cast<T*>(data + offsets_[I]);\n  }\n\n  size_t length(size_t idx) const { return lengths_[idx]; }\n\n  size_t offset(size_t idx) const { return offsets_[idx]; }\n\n  size_t total_size() const { return total_size_; }\n\n private:\n  std::array<size_t, sizeof...(Ts)> lengths_{};\n  std::array<size_t, sizeof...(Ts)> offsets_{};\n  size_t total_size_{0};\n\n  constexpr size_t align_up(size_t n, size_t alignment) {\n    return (n + alignment - 1) / alignment * alignment;\n  }\n\n  void compute_offsets() {\n    size_t offset = 0;\n    size_t idx = 0;\n\n    (void)std::initializer_list<int>{\n        (offset = align_up(offset, Alignment), offsets_[idx] = offset,\n         offset += lengths_[idx] * sizeof(Ts), ++idx, 0)...};\n\n    total_size_ = align_up(offset, Alignment);\n  }\n};\n\ntemplate <size_t I, typename... Ts>\nauto get_vector(MultiVector<Ts...>& mv, uint8_t* data) {\n  return mv.template get<I>(data);\n}\n\n}  // namespace merlin\n}  // namespace nv"
  },
  {
    "path": "include/merlin/optimizers.cuh",
    "content": "/*\n * Copyright (c) 2022, NVIDIA CORPORATION.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n#pragma once\n\n#include <cuda_runtime.h>\n#include \"types.cuh\"\n#include \"utils.cuh\"\n\nnamespace nv {\nnamespace merlin {\nnamespace optimizers {\n\ntemplate <class T>\n__global__ void adam_update_kernel(int len, float* weight, T* m, T* v,\n                                   const T* wgrad, float alpha_t, float beta1,\n                                   float beta2, float epsilon, float scaler) {\n  const int i = blockIdx.x * blockDim.x + threadIdx.x;\n  if (i < len) {\n    float gi = TypeConvertFunc<float, T>::convert(wgrad[i]) / scaler;\n    float mi =\n        beta1 * TypeConvertFunc<float, T>::convert(m[i]) + (1.f - beta1) * gi;\n    float vi = beta2 * TypeConvertFunc<float, T>::convert(v[i]) +\n               (1.f - beta2) * gi * gi;\n    m[i] = TypeConvertFunc<T, float>::convert(mi);\n    v[i] = TypeConvertFunc<T, float>::convert(vi);\n    weight[i] -= alpha_t * mi / (sqrt(vi) + epsilon);\n  }\n}\n\ntemplate <class T>\n__global__ void ada_grad_update_kernel(int len, float* weight, const T* wgrad,\n                                       T* sum, float lr, const float epsilon,\n                                       float scaler) {\n  const int i = blockIdx.x * blockDim.x + threadIdx.x;\n  if (i < len) {\n    float gi = TypeConvertFunc<float, T>::convert(wgrad[i]) / scaler;\n    float accum_ = TypeConvertFunc<float, 
T>::convert(__ldg(&sum[i]));\n    accum_ += gi * gi;\n    float std_ = epsilon + sqrtf(accum_);\n    weight[i] -= lr * gi / std_;\n    sum[i] = TypeConvertFunc<T, float>::convert(accum_);\n  }\n}\n\ntemplate <class T>\n__global__ void momentum_sgd_update_kernel(int len, float* weight, T* momentum,\n                                           const T* wgrad, float lr,\n                                           float momentum_factor,\n                                           float scaler) {\n  int idx = blockDim.x * blockIdx.x + threadIdx.x;\n  if (idx < len) {\n    float mv =\n        momentum_factor * TypeConvertFunc<float, T>::convert(momentum[idx]) -\n        lr * TypeConvertFunc<float, T>::convert(wgrad[idx]) / scaler;\n    momentum[idx] = TypeConvertFunc<T, float>::convert(mv);\n    weight[idx] += mv;\n  }\n  return;\n}\n\n}  // namespace optimizers\n}  // namespace merlin\n}  // namespace nv"
  },
  {
    "path": "include/merlin/types.cuh",
    "content": "/*\n * Copyright (c) 2022, NVIDIA CORPORATION.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n#pragma once\n\n#include <stddef.h>\n#include <cstdint>\n#include <cuda/atomic>\n#include <cuda/std/semaphore>\n#include \"debug.hpp\"\n\nnamespace nv {\nnamespace merlin {\n\n/**\n * Shorthand for a Key-Value-score tuple.\n */\ntemplate <class K, class V, class S>\nstruct KVM {\n  K key;\n  V* value;\n  S score;\n};\n\n// Storage size.\nusing byte16 = uint4;\nusing byte8 = uint2;\nusing byte4 = uint32_t;\nusing byte2 = uint16_t;\nusing byte = uint8_t;\n\n// Digest.\nusing D = byte;\nconstexpr uint64_t DEFAULT_EMPTY_KEY = UINT64_C(0xFFFFFFFFFFFFFFFF);\nconstexpr uint64_t DEFAULT_RECLAIM_KEY = UINT64_C(0xFFFFFFFFFFFFFFFE);\nconstexpr uint64_t DEFAULT_LOCKED_KEY = UINT64_C(0xFFFFFFFFFFFFFFFD);\n\nconstexpr uint64_t DEFAULT_RESERVED_KEY_MASK = UINT64_C(0xFFFFFFFFFFFFFFFC);\nconstexpr uint64_t DEFAULT_VACANT_KEY_MASK = UINT64_C(0xFFFFFFFFFFFFFFFE);\n\nconstexpr uint64_t MAX_SCORE = UINT64_C(0xFFFFFFFFFFFFFFFF);\nconstexpr uint64_t EMPTY_SCORE = UINT64_C(0);\nconstexpr uint64_t IGNORED_GLOBAL_EPOCH = UINT64_C(0xFFFFFFFFFFFFFFFF);\n\nstatic uint64_t EMPTY_KEY_CPU = DEFAULT_EMPTY_KEY;\n__constant__ uint64_t EMPTY_KEY = DEFAULT_EMPTY_KEY;\n__constant__ uint64_t RECLAIM_KEY = DEFAULT_RECLAIM_KEY;\n__constant__ uint64_t LOCKED_KEY = DEFAULT_LOCKED_KEY;\n\n__constant__ uint64_t RESERVED_KEY_MASK_1 = DEFAULT_RESERVED_KEY_MASK;\n__constant__ 
uint64_t RESERVED_KEY_MASK_2 = DEFAULT_RESERVED_KEY_MASK;\n__constant__ uint64_t VACANT_KEY_MASK_1 = DEFAULT_VACANT_KEY_MASK;\n__constant__ uint64_t VACANT_KEY_MASK_2 = DEFAULT_VACANT_KEY_MASK;\n\nconstexpr int MAX_RESERVED_KEY_BIT = 62;\n\ntemplate <class K>\n__forceinline__ __device__ bool IS_RESERVED_KEY(K key) {\n  return (RESERVED_KEY_MASK_1 & key) == RESERVED_KEY_MASK_2;\n}\n\ntemplate <class K>\n__forceinline__ __device__ bool IS_VACANT_KEY(K key) {\n  return (VACANT_KEY_MASK_1 & key) == VACANT_KEY_MASK_2;\n}\n\nstatic cudaError_t init_reserved_keys(int index) {\n  if (index < 1 || index > MAX_RESERVED_KEY_BIT) {\n    // index = 0 is the default,\n    // index = 62 is the maximum index can be set for reserved keys.\n    return cudaSuccess;\n  }\n  uint64_t reservedKeyMask1 = ~(UINT64_C(3) << index);\n  uint64_t reservedKeyMask2 = reservedKeyMask1 & ~UINT64_C(1);\n  uint64_t vacantKeyMask1 = ~(UINT64_C(1) << index);\n  uint64_t vacantKeyMask2 = vacantKeyMask1 & ~UINT64_C(1);\n\n  uint64_t emptyKey = reservedKeyMask2 | (UINT64_C(3) << index);\n  uint64_t reclaimKey = vacantKeyMask2;\n  uint64_t lockedKey = emptyKey & ~(UINT64_C(2) << index);\n  EMPTY_KEY_CPU = emptyKey;\n\n  CUDA_CHECK(cudaMemcpyToSymbol(EMPTY_KEY, &emptyKey, sizeof(uint64_t)));\n  CUDA_CHECK(cudaMemcpyToSymbol(RECLAIM_KEY, &reclaimKey, sizeof(uint64_t)));\n  CUDA_CHECK(cudaMemcpyToSymbol(LOCKED_KEY, &lockedKey, sizeof(uint64_t)));\n\n  CUDA_CHECK(cudaMemcpyToSymbol(RESERVED_KEY_MASK_1, &reservedKeyMask1,\n                                sizeof(uint64_t)));\n  CUDA_CHECK(cudaMemcpyToSymbol(RESERVED_KEY_MASK_2, &reservedKeyMask2,\n                                sizeof(uint64_t)));\n  CUDA_CHECK(\n      cudaMemcpyToSymbol(VACANT_KEY_MASK_1, &vacantKeyMask1, sizeof(uint64_t)));\n  CUDA_CHECK(\n      cudaMemcpyToSymbol(VACANT_KEY_MASK_2, &vacantKeyMask2, sizeof(uint64_t)));\n  return cudaGetLastError();\n}\n\ntemplate <class K>\nusing AtomicKey = cuda::atomic<K, 
cuda::thread_scope_device>;\n\ntemplate <class S>\nusing AtomicScore = cuda::atomic<S, cuda::thread_scope_device>;\n\ntemplate <class T>\nusing AtomicPos = cuda::atomic<T, cuda::thread_scope_device>;\n\ntemplate <class K, class V, class S>\nstruct Bucket {\n  AtomicKey<K>* keys_;\n  /// TODO: compute the pointer of scores and digests using bucket_max_size\n  AtomicScore<S>* scores_;\n  /// @brief not visible to users\n  D* digests_;\n  V* vectors;  // Pinned memory or HBM\n\n  __forceinline__ __device__ D* digests(int index) const {\n    return digests_ + index;\n  }\n\n  __forceinline__ __device__ AtomicKey<K>* keys(int index) const {\n    return keys_ + index;\n  }\n\n  __forceinline__ __device__ AtomicScore<S>* scores(int index) const {\n    return scores_ + index;\n  }\n\n  __forceinline__ __device__ K** keys_addr() {\n    return reinterpret_cast<K**>(&keys_);\n  }\n\n  static __forceinline__ __device__ AtomicKey<K>* keys(K* keys,\n                                                       uint32_t offset) {\n    return reinterpret_cast<AtomicKey<K>*>(keys) + offset;\n  }\n\n  static __forceinline__ __device__ D* digests(K* keys,\n                                               uint32_t bucket_capacity,\n                                               uint32_t offset) {\n    bucket_capacity = umax(bucket_capacity, 128);\n    return reinterpret_cast<D*>(keys) - bucket_capacity + offset;\n  }\n\n  static __forceinline__ __device__ S* scores(K* keys, uint32_t bucket_capacity,\n                                              uint32_t offset) {\n    return reinterpret_cast<S*>(keys + bucket_capacity) + offset;\n  }\n};\n\ntemplate <cuda::thread_scope Scope, class T = int>\nclass Lock {\n  mutable cuda::atomic<T, Scope> _lock;\n\n public:\n  __device__ Lock() : _lock{1} {}\n\n  template <typename CG>\n  __forceinline__ __device__ void acquire(CG const& g,\n                                          unsigned long long lane = 0) const {\n    if (g.thread_rank() == lane) {\n     
 T expected = 1;\n      while (!_lock.compare_exchange_weak(expected, 2,\n                                          cuda::std::memory_order_acquire)) {\n        expected = 1;\n      }\n    }\n    g.sync();\n  }\n\n  template <typename CG>\n  __forceinline__ __device__ void release(CG const& g,\n                                          unsigned long long lane = 0) const {\n    g.sync();\n    if (g.thread_rank() == lane) {\n      _lock.store(1, cuda::std::memory_order_release);\n    }\n  }\n};\n\nusing Mutex = Lock<cuda::thread_scope_device>;\n\ntemplate <class K, class V, class S>\nstruct Table {\n  Bucket<K, V, S>* buckets;\n  Mutex* locks;                 // mutex for write buckets\n  int* buckets_size;            // size of each buckets.\n  V** slices;                   // Handles of the HBM/ HMEM slices.\n  size_t dim;                   // Dimension of the `vectors`.\n  size_t bytes_per_slice;       // Size by byte of one slice.\n  size_t num_of_memory_slices;  // Number of vectors memory slices.\n  size_t capacity = 134217728;  // Initial capacity.\n  size_t max_size =\n      std::numeric_limits<uint64_t>::max();  // Up limit of the table capacity.\n  size_t buckets_num;                        // Number of the buckets.\n  size_t bucket_max_size = 128;              // Volume of each buckets.\n  size_t max_hbm_for_vectors = 0;            // Max HBM allocated for vectors\n  size_t remaining_hbm_for_vectors = 0;  // Remaining HBM allocated for vectors\n  size_t num_of_buckets_per_alloc = 1;   // Number of buckets allocated in each\n                                         // HBM allocation, must be power of 2.\n  bool is_pure_hbm = true;               // unused\n  bool primary = true;                   // unused\n  bool dual_bucket_mode = false;         // Enable dual-bucket addressing\n  int slots_offset = 0;                  // unused\n  int slots_number = 0;                  // unused\n  int device_id = 0;                     // Device id\n  int 
tile_size;\n};\n\ntemplate <class K, class S>\nusing EraseIfPredictInternal =\n    bool (*)(const K& key,       ///< iterated key in table\n             S& score,           ///< iterated score in table\n             const K& pattern,   ///< input key from caller\n             const S& threshold  ///< input score from caller\n    );\n\n/**\n * An abstract class provides interface between the nv::merlin::HashTable\n * and a file, which enables the table to save to the file or load from\n * the file, by overriding the `read` and `write` method.\n *\n * @tparam K The data type of the key.\n * @tparam V The data type of the vector's elements.\n *         The item data type should be a basic data type of C++/CUDA.\n * @tparam S The data type for `score`.\n *           The currently supported data type is only `uint64_t`.\n *\n */\ntemplate <class K, class V, class S>\nclass BaseKVFile {\n public:\n  virtual ~BaseKVFile() {}\n\n  /**\n   * Read from file and fill into the keys, values, and scores buffer.\n   * When calling save/load method from table, it can assume that the\n   * received buffer of keys, vectors, and scores are automatically\n   * pre-allocated.\n   *\n   * @param n The number of KV pairs expect to read. `size_t` is used\n   *          here to adapt to various filesystems and formats.\n   * @param dim The dimension of the `vectors`.\n   * @param keys The pointer to received buffer for keys.\n   * @param vectors The pointer to received buffer for vectors.\n   * @param scores The pointer to received buffer for scores.\n   *\n   * @return Number of KV pairs have been successfully read.\n   */\n  virtual size_t read(const size_t n, const size_t dim, K* keys, V* vectors,\n                      S* scores) = 0;\n\n  /**\n   * Write keys, values, scores from table to the file. It defines\n   * an abstract method to get batch of KV pairs and write them into\n   * file.\n   *\n   * @param n The number of KV pairs to be written. 
`size_t` is used\n   *          here to adapt to various filesystems and formats.\n   * @param dim The dimension of the `vectors`.\n   * @param keys The keys will be written to file.\n   * @param vectors The vectors of values will be written to file.\n   * @param scores The scores will be written to file.\n   *\n   * @return Number of KV pairs have been successfully written.\n   */\n  virtual size_t write(const size_t n, const size_t dim, const K* keys,\n                       const V* vectors, const S* scores) = 0;\n};\n\nenum class OccupyResult {\n  INITIAL,         ///< Initial status\n  CONTINUE,        ///< Insert did not succeed, continue trying to insert\n  OCCUPIED_EMPTY,  ///< New pair inserted successfully\n  OCCUPIED_RECLAIMED,\n  DUPLICATE,  ///< Insert did not succeed, key is already present\n  EVICT,      ///< Insert succeeded by evicting one key with minimum score.\n  REFUSED,    ///< Insert did not succeed, insert score is too low.\n  ILLEGAL,    ///< Illegal state, and don't need to do anything.\n};\n\nenum class OverrideResult {\n  INITIAL,   ///< Initial status\n  CONTINUE,  ///< Override did not succeed, continue trying to override\n  SUCCESS,   ///< Override successfully\n  REFUSED,   ///< Override is refused.\n};\n\nstruct Sm70 {\n  static int const kComputeCapability = 70;\n};\nstruct Sm72 {\n  static int const kComputeCapability = 72;\n};\nstruct Sm75 {\n  static int const kComputeCapability = 75;\n};\nstruct Sm80 {\n  static int const kComputeCapability = 80;\n};\nstruct Sm86 {\n  static int const kComputeCapability = 86;\n};\n\nstruct Sm90 {\n  static int const kComputeCapability = 90;\n};\n\n/* This struct is mainly for keeping the code readable, it should be strictly\n * consistent with `EvictStrategy::EvictStrategyEnum`.\n */\nstruct EvictStrategyInternal {\n  constexpr static int kLru = 0;         ///< LRU mode.\n  constexpr static int kLfu = 1;         ///< LFU mode.\n  constexpr static int kEpochLru = 2;    ///< Epoch + LRU mode.\n  
constexpr static int kEpochLfu = 3;    ///< Epoch + LFU mode.\n  constexpr static int kCustomized = 4;  ///< Customized mode.\n};\n\n}  // namespace merlin\n}  // namespace nv\n"
  },
  {
    "path": "include/merlin/utils.cuh",
    "content": "/*\n * Copyright (c) 2022, NVIDIA CORPORATION.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n#pragma once\n\n#include <cooperative_groups.h>\n#include <stdarg.h>\n#include <cstdint>\n#include <cstdlib>\n#include <cstring>\n#include <exception>\n#include <string>\n#include \"cuda_fp16.h\"\n#include \"debug.hpp\"\n\nusing namespace cooperative_groups;\nnamespace cg = cooperative_groups;\n\n__inline__ __device__ uint64_t atomicCAS(uint64_t* address, uint64_t compare,\n                                         uint64_t val) {\n  return (uint64_t)atomicCAS((unsigned long long*)address,\n                             (unsigned long long)compare,\n                             (unsigned long long)val);\n}\n\n__inline__ __device__ int64_t atomicCAS(int64_t* address, int64_t compare,\n                                        int64_t val) {\n  return (int64_t)atomicCAS((unsigned long long*)address,\n                            (unsigned long long)compare,\n                            (unsigned long long)val);\n}\n\n__inline__ __device__ uint64_t atomicExch(uint64_t* address, uint64_t val) {\n  return (uint64_t)atomicExch((unsigned long long*)address,\n                              (unsigned long long)val);\n}\n\n__inline__ __device__ int64_t atomicExch(int64_t* address, int64_t val) {\n  return (int64_t)atomicExch((unsigned long long*)address,\n                             (unsigned long long)val);\n}\n\n__inline__ __device__ signed char 
atomicExch(signed char* address,\n                                             signed char val) {\n  signed char old = *address;\n  *address = val;\n  return old;\n}\n\n__inline__ __device__ int64_t atomicAdd(int64_t* address, const int64_t val) {\n  return (int64_t)atomicAdd((unsigned long long*)address, val);\n}\n\n__inline__ __device__ uint64_t atomicAdd(uint64_t* address,\n                                         const uint64_t val) {\n  return (uint64_t)atomicAdd((unsigned long long*)address, val);\n}\n\nnamespace nv {\nnamespace merlin {\n\ntemplate <class S>\nstatic __forceinline__ __device__ S device_nano() {\n  S mclk;\n  asm volatile(\"mov.u64 %0,%%globaltimer;\" : \"=l\"(mclk));\n  return mclk;\n}\n\ninline void __cudaCheckError(const char* file, const int line) {\n#ifdef CUDA_ERROR_CHECK\n  cudaError err = cudaGetLastError();\n  if (cudaSuccess != err) {\n    fprintf(stderr, \"cudaCheckError() failed at %s:%i : %s\\n\", file, line,\n            cudaGetErrorString(err));\n    exit(-1);\n  }\n\n  // More careful checking. However, this will affect performance.\n  // Comment away if needed.\n  err = cudaDeviceSynchronize();\n  if (cudaSuccess != err) {\n    fprintf(stderr, \"cudaCheckError() with sync failed at %s:%i : %s\\n\", file,\n            line, cudaGetErrorString(err));\n    exit(-1);\n  }\n#endif\n\n  return;\n}\n#define CudaCheckError() nv::merlin::__cudaCheckError(__FILE__, __LINE__)\n\nstatic inline size_t SAFE_GET_GRID_SIZE(size_t N, int block_size) {\n  return ((N) > std::numeric_limits<int>::max())\n             ? 
(((1 << 30) - 1) / block_size + 1)\n             : (((N)-1) / block_size + 1);\n}\n\nstatic inline int SAFE_GET_BLOCK_SIZE(int block_size, int device = -1) {\n  cudaDeviceProp prop;\n  int current_device = device;\n  if (current_device == -1) {\n    CUDA_CHECK(cudaGetDevice(&current_device));\n  }\n  CUDA_CHECK(cudaGetDeviceProperties(&prop, current_device));\n  if (block_size > prop.maxThreadsPerBlock) {\n    fprintf(stdout,\n            \"The requested block_size=%d exceeds the device limit, \"\n            \"the maxThreadsPerBlock=%d will be applied.\\n\",\n            block_size, prop.maxThreadsPerBlock);\n  }\n  return std::min(prop.maxThreadsPerBlock, block_size);\n}\n\ninline uint64_t Murmur3HashHost(const uint64_t& key) {\n  uint64_t k = key;\n  k ^= k >> 33;\n  k *= UINT64_C(0xff51afd7ed558ccd);\n  k ^= k >> 33;\n  k *= UINT64_C(0xc4ceb9fe1a85ec53);\n  k ^= k >> 33;\n  return k;\n}\n\n__inline__ __device__ uint64_t Murmur3HashDevice(uint64_t const& key) {\n  uint64_t k = key;\n  k ^= k >> 33;\n  k *= UINT64_C(0xff51afd7ed558ccd);\n  k ^= k >> 33;\n  k *= UINT64_C(0xc4ceb9fe1a85ec53);\n  k ^= k >> 33;\n  return k;\n}\n\n__inline__ __device__ int64_t Murmur3HashDevice(int64_t const& key) {\n  uint64_t k = uint64_t(key);\n  k ^= k >> 33;\n  k *= UINT64_C(0xff51afd7ed558ccd);\n  k ^= k >> 33;\n  k *= UINT64_C(0xc4ceb9fe1a85ec53);\n  k ^= k >> 33;\n  return int64_t(k);\n}\n\n__inline__ __device__ uint32_t Murmur3HashDevice(uint32_t const& key) {\n  uint32_t k = key;\n  k ^= k >> 16;\n  k *= UINT32_C(0x85ebca6b);\n  k ^= k >> 13;\n  k *= UINT32_C(0xc2b2ae35);\n  k ^= k >> 16;\n\n  return k;\n}\n\n__inline__ __device__ int32_t Murmur3HashDevice(int32_t const& key) {\n  uint32_t k = uint32_t(key);\n  k ^= k >> 16;\n  k *= UINT32_C(0x85ebca6b);\n  k ^= k >> 13;\n  k *= UINT32_C(0xc2b2ae35);\n  k ^= k >> 16;\n\n  return int32_t(k);\n}\n\nclass CudaDeviceRestorer {\n public:\n  CudaDeviceRestorer() { CUDA_CHECK(cudaGetDevice(&dev_)); }\n  ~CudaDeviceRestorer() { 
CUDA_CHECK(cudaSetDevice(dev_)); }\n\n private:\n  int dev_;\n};\n\nstatic inline int get_dev(const void* ptr) {\n  cudaPointerAttributes attr;\n  CUDA_CHECK(cudaPointerGetAttributes(&attr, ptr));\n  int dev = -1;\n\n#if CUDART_VERSION >= 10000\n  if (attr.type == cudaMemoryTypeDevice)\n#else\n  if (attr.memoryType == cudaMemoryTypeDevice)\n#endif\n  {\n    dev = attr.device;\n  }\n  return dev;\n}\n\nstatic inline void switch_to_dev(const void* ptr) {\n  int dev = get_dev(ptr);\n  if (dev >= 0) {\n    CUDA_CHECK(cudaSetDevice(dev));\n  }\n}\n\nstatic inline bool is_on_device(const void* ptr) {\n  cudaPointerAttributes attr;\n  CUDA_CHECK(cudaPointerGetAttributes(&attr, ptr));\n\n#if CUDART_VERSION >= 10000\n  return (attr.type == cudaMemoryTypeDevice);\n#else\n  return (attr.memoryType == cudaMemoryTypeDevice);\n#endif\n}\n\ntemplate <typename TOUT, typename TIN>\nstruct TypeConvertFunc;\n\ntemplate <>\nstruct TypeConvertFunc<__half, float> {\n  static __forceinline__ __device__ __half convert(float val) {\n    return __float2half(val);\n  }\n};\n\ntemplate <>\nstruct TypeConvertFunc<float, __half> {\n  static __forceinline__ __device__ float convert(__half val) {\n    return __half2float(val);\n  }\n};\n\ntemplate <>\nstruct TypeConvertFunc<float, float> {\n  static __forceinline__ __device__ float convert(float val) { return val; }\n};\n\ntemplate <>\nstruct TypeConvertFunc<float, long long> {\n  static __forceinline__ __device__ float convert(long long val) {\n    return static_cast<float>(val);\n  }\n};\n\ntemplate <>\nstruct TypeConvertFunc<float, unsigned int> {\n  static __forceinline__ __device__ float convert(unsigned int val) {\n    return static_cast<float>(val);\n  }\n};\n\ntemplate <>\nstruct TypeConvertFunc<int, long long> {\n  static __forceinline__ __device__ int convert(long long val) {\n    return static_cast<int>(val);\n  }\n};\n\ntemplate <>\nstruct TypeConvertFunc<int, unsigned int> {\n  static __forceinline__ __device__ int convert(unsigned 
int val) {\n    return static_cast<int>(val);\n  }\n};\n\ntemplate <typename mutex, uint32_t TILE_SIZE, bool THREAD_SAFE = true>\n__forceinline__ __device__ void lock(\n    const cg::thread_block_tile<TILE_SIZE>& tile, mutex& set_mutex,\n    unsigned long long lane = 0) {\n  if (THREAD_SAFE) {\n    set_mutex.acquire(tile, lane);\n  }\n}\n\ntemplate <typename mutex, uint32_t TILE_SIZE, bool THREAD_SAFE = true>\n__forceinline__ __device__ void unlock(\n    const cg::thread_block_tile<TILE_SIZE>& tile, mutex& set_mutex,\n    unsigned long long lane = 0) {\n  if (THREAD_SAFE) {\n    set_mutex.release(tile, lane);\n  }\n}\n\ninline void free_pointers(cudaStream_t stream, int n, ...) {\n  va_list args;\n  va_start(args, n);\n  void* ptr = nullptr;\n  for (int i = 0; i < n; i++) {\n    ptr = va_arg(args, void*);\n    if (ptr) {\n      cudaPointerAttributes attr;\n      memset(&attr, 0, sizeof(cudaPointerAttributes));\n      try {\n        CUDA_CHECK(cudaPointerGetAttributes(&attr, ptr));\n        if (attr.devicePointer && (!attr.hostPointer)) {\n          CUDA_CHECK(cudaFreeAsync(ptr, stream));\n        } else if (attr.devicePointer && attr.hostPointer) {\n          CUDA_CHECK(cudaFreeHost(ptr));\n        } else {\n          free(ptr);\n        }\n      } catch (const nv::merlin::CudaException& e) {\n        va_end(args);\n        throw e;\n      }\n    }\n  }\n  va_end(args);\n}\n\nstatic __global__ void memset64bitKernel(void* devPtr, uint64_t value,\n                                         size_t count) {\n  size_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n  if (idx < count) {\n    static_cast<uint64_t*>(devPtr)[idx] = value;\n  }\n}\n\n__forceinline__ __host__ cudaError_t memset64Async(void* devPtr, uint64_t value,\n                                                   size_t count,\n                                                   cudaStream_t stream = 0) {\n  int blockSize = 256;\n  int numBlocks = (count + blockSize - 1) / blockSize;\n  
memset64bitKernel<<<numBlocks, blockSize, 0, stream>>>(devPtr, value, count);\n  return cudaGetLastError();\n}\n\n#define CUDA_FREE_POINTERS(stream, ...) \\\n  nv::merlin::free_pointers(            \\\n      stream, (sizeof((void*[]){__VA_ARGS__}) / sizeof(void*)), __VA_ARGS__);\n\nstatic inline size_t GB(size_t n) { return n << 30; }\n\nstatic inline size_t MB(size_t n) { return n << 20; }\n\nstatic inline size_t KB(size_t n) { return n << 10; }\n\nconstexpr inline bool ispow2(unsigned x) { return x && (!(x & (x - 1))); }\n\n}  // namespace merlin\n}  // namespace nv\n"
  },
  {
    "path": "include/merlin_hashtable.cuh",
    "content": "/*\n * Copyright (c) 2022, NVIDIA CORPORATION.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n#pragma once\n\n#include <thrust/device_vector.h>\n#include <thrust/execution_policy.h>\n#include <thrust/sort.h>\n#include <atomic>\n#include <cstdint>\n#include <cub/cub.cuh>\n#include <iostream>\n#include <limits>\n#include <memory>\n#include <mutex>\n#include <shared_mutex>\n#include <type_traits>\n#include \"merlin/allocator.cuh\"\n#include \"merlin/array_kernels.cuh\"\n#include \"merlin/core_kernels.cuh\"\n#include \"merlin/flexible_buffer.cuh\"\n#include \"merlin/group_lock.cuh\"\n#include \"merlin/memory_pool.cuh\"\n#include \"merlin/multi_vector.hpp\"\n#include \"merlin/types.cuh\"\n#include \"merlin/utils.cuh\"\n\nnamespace nv {\nnamespace merlin {\n\n/**\n * @brief The eviction strategies.\n *\n * @note The `Score` concept is introduced to define the importance of each key,\n * the larger, the more important, the less likely they will be evicted. On\n * `kLru` mode, the `scores` parameter of the APIs should keep `nullptr`, the\n * score for each key is assigned internally in LRU(Least Recently Used) policy.\n * On `kCustomized` mode, the `scores` should be provided by caller.\n *\n * @note Eviction occurs automatically when a bucket is full. 
The keys with the\n * minimum `score` value are evicted first.\n *\n * @note on `kLru`, Set the score to the Device clock in a nanosecond, which\n * could differ slightly from the host clock.\n *\n * @note For `kEpochLru` and `kEpochLfu`, the high 32bits would be set to\n * `global_epoch` while the low 32 bits is `timestamp` or `frequency`.\n *\n * @note on `kLfu`, Frequency increment provided by caller via the input\n * parameter of `scores` of `insert-like` APIs as the increment of frequency.\n * when the scores reaches to the max of `uint64_t`, it will not increase any\n * more.\n *\n * @note On `kEpochLru`, the high 32bits is the global epoch provided via the\n * input parameter of `global_epoch`, the low 32bits is equal to `(device_clock\n * >> 20) & 0xffffffff` with granularity close to 1 ms.\n *\n * @note On `kEpochLfu`, the high 32bits is the global epoch provided via the\n * input parameter of `global_epoch`, the low 32bits is the frequency, the\n * frequency will keep constant after reaching the max value of `0xffffffff`.\n *\n * @note On `kCustomized`, fully provided by the caller via the input parameter\n * of `scores` of `insert-like` APIs.\n *\n */\nstruct EvictStrategy {\n  enum EvictStrategyEnum {\n    kLru = 0,         ///< LRU mode.\n    kLfu = 1,         ///< LFU mode.\n    kEpochLru = 2,    ///< Epoch Lru mode.\n    kEpochLfu = 3,    ///< Epoch Lfu mode.\n    kCustomized = 4,  ///< Customized mode.\n  };\n};\n\n/**\n * @brief Table operation mode.\n *\n * kThroughput: Default mode, single-bucket addressing, throughput-optimized.\n * kMemory: Dual-bucket addressing, memory-efficiency-optimized (higher LF).\n */\nenum class TableMode {\n  kThroughput = 0,  ///< Default: single-bucket, max throughput.\n  kMemory = 1,      ///< Dual-bucket, higher load factor.\n};\n\n/**\n * @brief The options struct of HierarchicalKV.\n */\nstruct HashTableOptions {\n  size_t init_capacity = 0;        ///< The initial capacity of the hash table.\n  size_t 
max_capacity = 0;         ///< The maximum capacity of the hash table.\n  size_t max_hbm_for_vectors = 0;  ///< The maximum HBM for vectors, in bytes.\n  size_t max_bucket_size = 128;    ///< The length of each bucket.\n  size_t dim = 64;                 ///< The dimension of the vectors.\n  float max_load_factor = 0.5f;    ///< The max load factor before rehashing.\n  int block_size = 128;            ///< The default block size for CUDA kernels.\n  int io_block_size = 1024;        ///< The block size for IO CUDA kernels.\n  int device_id = -1;              ///< The ID of device.\n  bool io_by_cpu = false;  ///< The flag indicating if the CPU handles IO.\n  bool use_constant_memory = false;  ///< reserved\n  /*\n   * reserved_key_start_bit = 0 is the default behavior: HKV reserves\n   * `0xFFFFFFFFFFFFFFFD`, `0xFFFFFFFFFFFFFFFE`, and `0xFFFFFFFFFFFFFFFF` for\n   * internal use. If the defaults conflict with your keys, change the\n   * reserved_key_start_bit value to a number between 1 and 62.\n   * reserved_key_start_bit = 1 means using the bits at index 1 and 2 to\n   * distinguish the reserved keys, while the index 0 bit is 0 and all the other\n   * bits are 1; the new reserved keys are `0xFFFFFFFFFFFFFFFE`,\n   * `0xFFFFFFFFFFFFFFFC`, `0xFFFFFFFFFFFFFFF8`, and `0xFFFFFFFFFFFFFFFA`. The\n   * console log prints the reserved keys during the table initialization.\n   */\n  int reserved_key_start_bit = 0;       ///< The binary index of reserved key.\n  size_t num_of_buckets_per_alloc = 1;  ///< Number of buckets allocated in each\n                                        ///< HBM allocation, must be power of 2.\n  bool api_lock = true;  ///<  The flag indicating whether to lock the table\n                         ///<  once enters the API.\n  TableMode table_mode = TableMode::kThroughput;  ///< Table operation mode.\n  MemoryPoolOptions\n      device_memory_pool;  ///< Configuration options for device memory pool.\n  MemoryPoolOptions\n      
host_memory_pool;  ///< Configuration options for host memory pool.\n};\n\n/**\n * @brief A customizable template function indicates which keys should be\n * erased from the hash table by returning `true`.\n *\n * @note The `erase_if` or `export_batch_if` API traverses all of the items by\n * this function and the items that return `true` are removed or exported.\n *\n *  Example for erase_if:\n *\n *    ```\n *    template <class K, class S>\n *    struct EraseIfPredFunctor {\n *      __forceinline__ __device__ bool operator()(const K& key,\n *                                                 S& score,\n *                                                 const K& pattern,\n *                                                 const S& threshold) {\n *        return (((key & 0xFFFF000000000000) == pattern) &&\n *                (score < threshold));\n *      }\n *    };\n *    ```\n *\n *  Example for export_batch_if:\n *    ```\n *    template <class K, class S>\n *    struct ExportIfPredFunctor {\n *      __forceinline__ __device__ bool operator()(const K& key,\n *                                                 S& score,\n *                                                 const K& pattern,\n *                                                 const S& threshold) {\n *        return score >= threshold;\n *      }\n *    };\n *    ```\n */\ntemplate <class K, class S>\nusing EraseIfPredict = bool (*)(\n    const K& key,       ///< The traversed key in a hash table.\n    S& score,           ///< The traversed score in a hash table.\n    const K& pattern,   ///< The key pattern to compare with the `key` argument.\n    const S& threshold  ///< The threshold to compare with the `score` argument.\n);\n\n#if THRUST_VERSION >= 101600\nstatic constexpr auto& thrust_par = thrust::cuda::par_nosync;\n#else\nstatic constexpr auto& thrust_par = thrust::cuda::par;\n#endif\n\ntemplate <typename K, typename V, typename S = uint64_t>\nclass HashTableBase {\n public:\n  using size_type = 
size_t;\n  using key_type = K;\n  using value_type = V;\n  using score_type = S;\n  using allocator_type = BaseAllocator;\n\n public:\n  virtual ~HashTableBase() {}\n\n  /**\n   * @brief Initialize a merlin::HashTable.\n   *\n   * @param options The configuration options.\n   */\n  virtual void init(const HashTableOptions& options,\n                    allocator_type* allocator = nullptr) = 0;\n\n  /**\n   * @brief Insert new key-value-score tuples into the hash table.\n   * If the key already exists, the values and scores are assigned new values.\n   *\n   * If the target bucket is full, the keys with minimum score will be\n   * overwritten by new key unless the score of the new key is even less than\n   * minimum score of the target bucket.\n   *\n   * @param n Number of key-value-score tuples to insert or assign.\n   * @param keys The keys to insert on GPU-accessible memory with shape\n   * (n).\n   * @param values The values to insert on GPU-accessible memory with\n   * shape (n, DIM).\n   * @param scores The scores to insert on GPU-accessible memory with shape\n   * (n).\n   * @parblock\n   * The scores should be a `uint64_t` value for built-in strategies. For\n   * `EvictStrategy::kCustomized`, `uint32_t` scores are also supported.\n   * You can specify a value such as the timestamp of the key insertion or\n   * number of key occurrences to perform a customized eviction strategy.\n   *\n   * The @p scores should be `nullptr`, when the LRU eviction strategy is\n   * applied.\n   * @endparblock\n   *\n   * @param stream The CUDA stream that is used to execute the operation.\n   * @param unique_key If all keys in the same batch are unique.\n   *\n   * @param ignore_evict_strategy A boolean option indicating whether if\n   * the insert_or_assign ignores the evict strategy of table with current\n   * scores anyway. If true, it does not check whether the scores conforms to\n   * the evict strategy. 
If false, it requires the scores follow the evict\n   * strategy of table.\n   */\n  virtual void insert_or_assign(const size_type n,\n                                const key_type* keys,                // (n)\n                                const value_type* values,            // (n, DIM)\n                                const score_type* scores = nullptr,  // (n)\n                                cudaStream_t stream = 0, bool unique_key = true,\n                                bool ignore_evict_strategy = false) = 0;\n\n  /**\n   * @brief Insert new key-value-score tuples into the hash table.\n   * If the key already exists, the values and scores are assigned new values.\n   *\n   * If the target bucket is full, the keys with minimum score will be\n   * overwritten by new key unless the score of the new key is even less than\n   * minimum score of the target bucket. The overwritten key with minimum\n   * score will be evicted, with its values and score, to evicted_keys,\n   * evicted_values, evicted_scores separately in compact format.\n   *\n   * @param n Number of key-value-score tuples to insert or assign.\n   * @param keys The keys to insert on GPU-accessible memory with shape\n   * (n).\n   * @param values The values to insert on GPU-accessible memory with\n   * shape (n, DIM).\n   * @param scores The scores to insert on GPU-accessible memory with shape\n   * (n).\n   * @param evicted_keys The output of keys replaced with minimum score.\n   * @param evicted_values The output of values replaced with minimum score on\n   * keys.\n   * @param evicted_scores The output of scores replaced with minimum score on\n   * keys.\n   * @parblock\n   * The scores should be a `uint64_t` value for built-in strategies. 
For\n   * `EvictStrategy::kCustomized`, `uint32_t` scores are also supported.\n   * You can specify a value such as the timestamp of the key insertion or\n   * number of key occurrences to perform a customized eviction strategy.\n   *\n   * The @p scores should be `nullptr`, when the LRU eviction strategy is\n   * applied.\n   * @endparblock\n   *\n   * @param d_evicted_counter The number of elements evicted on GPU-accessible\n   * memory. Note: the caller should guarantee it is set to `0` before\n   * calling.\n   * @param stream The CUDA stream that is used to execute the operation.\n   * @param unique_key If all keys in the same batch are unique.\n   *\n   * @param ignore_evict_strategy A boolean option indicating whether\n   * insert_and_evict ignores the evict strategy of the table with current\n   * scores anyway. If true, it does not check whether the scores conform to\n   * the evict strategy. If false, it requires the scores follow the evict\n   * strategy of table.\n   */\n  virtual void insert_and_evict(const size_type n,\n                                const key_type* keys,          // (n)\n                                const value_type* values,      // (n, DIM)\n                                const score_type* scores,      // (n)\n                                key_type* evicted_keys,        // (n)\n                                value_type* evicted_values,    // (n, DIM)\n                                score_type* evicted_scores,    // (n)\n                                size_type* d_evicted_counter,  // (1)\n                                cudaStream_t stream = 0, bool unique_key = true,\n                                bool ignore_evict_strategy = false) = 0;\n\n  /**\n   * @brief Insert new key-value-score tuples into the hash table.\n   * If the key already exists, the values and scores are assigned new values.\n   *\n   * If the target bucket is full, the keys with minimum score will be\n   * overwritten by new key unless the score 
of the new key is even less than\n   * minimum score of the target bucket. The overwritten key with minimum\n   * score will be evicted, with its values and score, to evicted_keys,\n   * evicted_values, evicted_scores separately in compact format.\n   *\n   * @param n Number of key-value-score tuples to insert or assign.\n   * @param keys The keys to insert on GPU-accessible memory with shape\n   * (n).\n   * @param values The values to insert on GPU-accessible memory with\n   * shape (n, DIM).\n   * @param scores The scores to insert on GPU-accessible memory with shape\n   * (n).\n   * @param evicted_keys The output of keys replaced with minimum score.\n   * @param evicted_values The output of values replaced with minimum score on\n   * keys.\n   * @param evicted_scores The output of scores replaced with minimum score on\n   * keys.\n   * @parblock\n   * The scores should be a `uint64_t` value for built-in strategies. For\n   * `EvictStrategy::kCustomized`, `uint32_t` scores are also supported.\n   * You can specify a value such as the timestamp of the key insertion or\n   * number of key occurrences to perform a customized eviction strategy.\n   *\n   * The @p scores should be `nullptr`, when the LRU eviction strategy is\n   * applied.\n   * @endparblock\n   *\n   * @param stream The CUDA stream that is used to execute the operation.\n   * @param unique_key If all keys in the same batch are unique.\n   *\n   * @param ignore_evict_strategy A boolean option indicating whether\n   * insert_and_evict ignores the evict strategy of the table with current\n   * scores anyway. If true, it does not check whether the scores conform to\n   * the evict strategy. 
If false, it requires the scores follow the evict\n   * strategy of table.\n   *\n   * @return The number of elements evicted.\n   */\n  virtual size_type insert_and_evict(const size_type n,\n                                     const key_type* keys,        // (n)\n                                     const value_type* values,    // (n, DIM)\n                                     const score_type* scores,    // (n)\n                                     key_type* evicted_keys,      // (n)\n                                     value_type* evicted_values,  // (n, DIM)\n                                     score_type* evicted_scores,  // (n)\n                                     cudaStream_t stream = 0,\n                                     bool unique_key = true,\n                                     bool ignore_evict_strategy = false) = 0;\n\n  /**\n   * Searches for each key in @p keys in the hash table.\n   * If the key is found and the corresponding value in @p accum_or_assigns is\n   * `true`, the @p vectors_or_deltas is treated as a delta to the old\n   * value, and the delta is added to the old value of the key.\n   *\n   * If the key is not found and the corresponding value in @p accum_or_assigns\n   * is `false`, the @p vectors_or_deltas is treated as a new value and the\n   * key-value pair is updated in the table directly.\n   *\n   * @note When the key is found and the value of @p accum_or_assigns is\n   * `false`, or when the key is not found and the value of @p accum_or_assigns\n   * is `true`, nothing is changed and this operation is ignored.\n   * The algorithm assumes these situations occur while the key was modified or\n   * removed by other processes just now.\n   *\n   * @param n The number of key-value-score tuples to process.\n   * @param keys The keys to insert on GPU-accessible memory with shape (n).\n   * @param value_or_deltas The values or deltas to insert on GPU-accessible\n   * memory with shape (n, DIM).\n   * @param accum_or_assigns The 
operation type with shape (n). A value of\n   * `true` means accumulate and `false` means assign.\n   * @param scores The scores to insert on GPU-accessible memory with shape (n).\n   * @parblock\n   * The scores should be a `uint64_t` value for built-in strategies. For\n   * `EvictStrategy::kCustomized`, `uint32_t` scores are also supported.\n   * You can specify a value such as the timestamp of the key insertion or\n   * number of key occurrences to perform a customized eviction strategy.\n   *\n   * The @p scores should be `nullptr`, when the LRU eviction strategy is\n   * applied.\n   * @endparblock\n   *\n   * @param stream The CUDA stream that is used to execute the operation.\n   *\n   * @param ignore_evict_strategy A boolean option indicating whether\n   * accum_or_assign ignores the evict strategy of the table with current\n   * scores anyway. If true, it does not check whether the scores conform to\n   * the evict strategy. If false, it requires the scores follow the evict\n   * strategy of table.\n   */\n  virtual void accum_or_assign(const size_type n,\n                               const key_type* keys,                // (n)\n                               const value_type* value_or_deltas,   // (n, DIM)\n                               const bool* accum_or_assigns,        // (n)\n                               const score_type* scores = nullptr,  // (n)\n                               cudaStream_t stream = 0,\n                               bool ignore_evict_strategy = false) = 0;\n\n  /**\n   * @brief Searches the hash table for the specified keys.\n   * When a key is missing, the value in @p values and @p scores will be\n   * inserted.\n   *\n   * @param n The number of key-value-score tuples to search or insert.\n   * @param keys The keys to search on GPU-accessible memory with shape (n).\n   * @param values The values to search on GPU-accessible memory with\n   * shape (n, DIM).\n   * @param scores The scores to search on 
GPU-accessible memory with shape (n).\n   * @parblock\n   * If @p scores is `nullptr`, the score for each key will not be returned.\n   * @endparblock\n   * @param stream The CUDA stream that is used to execute the operation.\n   * @param unique_key If all keys in the same batch are unique.\n   *\n   */\n  virtual void find_or_insert(const size_type n, const key_type* keys,  // (n)\n                              value_type* values,            // (n * DIM)\n                              score_type* scores = nullptr,  // (n)\n                              cudaStream_t stream = 0, bool unique_key = true,\n                              bool ignore_evict_strategy = false) = 0;\n\n  /**\n   * @brief Searches the hash table for the specified keys and returns address\n   * of the values. When a key is missing, the value in @p values and @p scores\n   * will be inserted.\n   *\n   * @warning This API returns internal addresses for high-performance but\n   * thread-unsafe. The caller is responsible for guaranteeing data consistency.\n   *\n   * @param n The number of key-value-score tuples to search or insert.\n   * @param keys The keys to search on GPU-accessible memory with shape (n).\n   * @param values The addresses of values to search on GPU-accessible memory\n   * with shape (n).\n   * @param founds The status that indicates if the keys are found on\n   * GPU-accessible memory with shape (n).\n   * @param scores The scores to search on GPU-accessible memory with shape (n).\n   * @parblock\n   * If @p scores is `nullptr`, the score for each key will not be returned.\n   * @endparblock\n   * @param stream The CUDA stream that is used to execute the operation.\n   * @param unique_key If all keys in the same batch are unique.\n   * @param locked_key_ptrs If it isn't nullptr then the keys in the table will\n   * be locked, and the key addresses are written to locked_key_ptrs. 
Use\n   * unlock_keys to unlock these keys.\n   *\n   */\n  virtual void find_or_insert(const size_type n, const key_type* keys,  // (n)\n                              value_type** values,                      // (n)\n                              bool* founds,                             // (n)\n                              score_type* scores = nullptr,             // (n)\n                              cudaStream_t stream = 0, bool unique_key = true,\n                              bool ignore_evict_strategy = false,\n                              key_type** locked_key_ptrs = nullptr) = 0;\n\n  /**\n   * @brief\n   * This function will lock the keys in the table and nonexistent keys will be\n   * ignored.\n   *\n   * @param n The number of keys in the table to be locked.\n   * @param locked_key_ptrs The pointers of locked keys in the table with shape\n   * (n).\n   * @param keys The keys to search on GPU-accessible memory with shape (n).\n   * @param succeededs The status that indicates if the lock operation\n   * succeeded.\n   * @param scores If provided, the scores of the input keys will be set to these\n   * values.\n   * @param stream The CUDA stream that is used to execute the operation.\n   *\n   */\n  virtual void lock_keys(const size_type n,\n                         key_type const* keys,        // (n)\n                         key_type** locked_key_ptrs,  // (n)\n                         bool* succeededs = nullptr,  // (n)\n                         cudaStream_t stream = 0,\n                         score_type const* scores = nullptr) = 0;\n\n  /**\n   * @brief Using pointers to address the keys in the hash table and set them\n   * to target keys.\n   * This function will unlock the keys in the table which are locked by\n   * the previous call to find_or_insert.\n   *\n   * @param n The number of keys in the table to be unlocked.\n   * @param locked_key_ptrs The pointers of locked keys in the table with shape\n   * (n).\n   * @param keys The keys to search on 
GPU-accessible memory with shape (n).\n   * @param succeededs The status that indicates if the unlock operation is\n   * succeed.\n   * @param stream The CUDA stream that is used to execute the operation.\n   *\n   */\n  virtual void unlock_keys(const size_type n,\n                           key_type** locked_key_ptrs,  // (n)\n                           const key_type* keys,        // (n)\n                           bool* succeededs = nullptr,  // (n)\n                           cudaStream_t stream = 0) = 0;\n\n  /**\n   * @brief Assign new key-value-score tuples into the hash table.\n   * If the key doesn't exist, the operation on the key will be ignored.\n   *\n   * @param n Number of key-value-score tuples to insert or assign.\n   * @param keys The keys to insert on GPU-accessible memory with shape\n   * (n).\n   * @param values The values to insert on GPU-accessible memory with\n   * shape (n, DIM).\n   * @param scores The scores to insert on GPU-accessible memory with shape\n   * (n).\n   * @parblock\n   * The scores should be a `uint64_t` value for built-in strategies. 
For\n   * `EvictStrategy::kCustomized`, `uint32_t` scores are also supported.\n   * You can specify a value such as the timestamp of the key insertion or\n   * number of key occurrences to perform a customized eviction strategy.\n   *\n   * The @p scores should be `nullptr`, when the LRU eviction strategy is\n   * applied.\n   * @endparblock\n   *\n   * @param stream The CUDA stream that is used to execute the operation.\n   *\n   * @param unique_key If all keys in the same batch are unique.\n   */\n  virtual void assign(const size_type n,\n                      const key_type* keys,                // (n)\n                      const value_type* values,            // (n, DIM)\n                      const score_type* scores = nullptr,  // (n)\n                      cudaStream_t stream = 0, bool unique_key = true) = 0;\n\n  /**\n   * @brief Assign new scores for keys.\n   * If the key doesn't exist, the operation on the key will be ignored.\n   *\n   * @param n Number of key-score pairs to assign.\n   * @param keys The keys to insert on GPU-accessible memory with shape\n   * (n).\n   * @parblock\n   * The scores should be a `uint64_t` value for built-in strategies. 
For\n   * `EvictStrategy::kCustomized`, `uint32_t` scores are also supported.\n   * You can specify a value such as the timestamp of the key insertion or\n   * number of key occurrences to perform a customized eviction strategy.\n   *\n   * The @p scores should be `nullptr`, when the LRU eviction strategy is\n   * applied.\n   * @endparblock\n   *\n   * @param stream The CUDA stream that is used to execute the operation.\n   *\n   * @param unique_key If all keys in the same batch are unique.\n   */\n  virtual void assign_scores(const size_type n,\n                             const key_type* keys,                // (n)\n                             const score_type* scores = nullptr,  // (n)\n                             cudaStream_t stream = 0,\n                             bool unique_key = true) = 0;\n\n  /**\n   * @brief Alias of `assign_scores`.\n   */\n  virtual void assign(const size_type n,\n                      const key_type* keys,                // (n)\n                      const score_type* scores = nullptr,  // (n)\n                      cudaStream_t stream = 0, bool unique_key = true) = 0;\n\n  /**\n   * @brief Assign new values for each keys .\n   * If the key doesn't exist, the operation on the key will be ignored.\n   *\n   * @param n Number of key-value pairs to assign.\n   * @param keys The keys need to be operated, which must be on GPU-accessible\n   * memory with shape (n).\n   * @param values The values need to be updated, which must be on\n   * GPU-accessible memory with shape (n, DIM).\n   *\n   * @param stream The CUDA stream that is used to execute the operation.\n   *\n   * @param unique_key If all keys in the same batch are unique.\n   */\n  virtual void assign_values(const size_type n,\n                             const key_type* keys,      // (n)\n                             const value_type* values,  // (n, DIM)\n                             cudaStream_t stream = 0,\n                             bool unique_key = true) = 0;\n  
/**\n   * @brief Searches the hash table for the specified keys.\n   *\n   * @note When a key is missing, the value in @p values is not changed.\n   *\n   * @param n The number of key-value-score tuples to search.\n   * @param keys The keys to search on GPU-accessible memory with shape (n).\n   * @param values The values to search on GPU-accessible memory with\n   * shape (n, DIM).\n   * @param founds The status that indicates if the keys are found on\n   * GPU-accessible memory with shape (n).\n   * @param scores The scores to search on GPU-accessible memory with shape (n).\n   * @parblock\n   * If @p scores is `nullptr`, the score for each key will not be returned.\n   * @endparblock\n   * @param stream The CUDA stream that is used to execute the operation.\n   *\n   */\n  virtual void find(const size_type n, const key_type* keys,  // (n)\n                    value_type* values,                       // (n, DIM)\n                    bool* founds,                             // (n)\n                    score_type* scores = nullptr,             // (n)\n                    cudaStream_t stream = 0) const = 0;\n\n  /**\n   * @brief Searches the hash table for the specified keys.\n   *\n   * @note When the searched keys are not hit, missed keys/indices/size can be\n   * obtained.\n   *\n   * @param n The number of key-value-score tuples to search.\n   * @param keys The keys to search on GPU-accessible memory with shape (n).\n   * @param values The values to search on GPU-accessible memory with\n   * shape (n, DIM).\n   * @param missed_keys The missed keys to search on GPU-accessible memory with\n   * shape (n).\n   * @param missed_indices The missed indices to search on GPU-accessible memory\n   * with shape (n).\n   * @param missed_size The size of `missed_keys` and `missed_indices`.\n   * @param scores The scores to search on GPU-accessible memory with shape (n).\n   * @parblock\n   * If @p scores is `nullptr`, the score for each key will not be returned.\n   * 
@endparblock\n   * @param stream The CUDA stream that is used to execute the operation.\n   */\n  virtual void find(const size_type n, const key_type* keys,  // (n)\n                    value_type* values,                       // (n, DIM)\n                    key_type* missed_keys,                    // (n)\n                    int* missed_indices,                      // (n)\n                    int* missed_size,                         // scalar\n                    score_type* scores = nullptr,             // (n)\n                    cudaStream_t stream = 0) const = 0;\n\n  /**\n   * @brief Searches the hash table for the specified keys and returns address\n   * of the values.\n   *\n   * @note When a key is missing, the data in @p values won't change.\n   * @warning This API returns internal addresses for high-performance but\n   * thread-unsafe. The caller is responsible for guaranteeing data consistency.\n   *\n   * @param n The number of key-value-score tuples to search.\n   * @param keys The keys to search on GPU-accessible memory with shape (n).\n   * @param values The addresses of values to search on GPU-accessible memory\n   * with shape (n).\n   * @param founds The status that indicates if the keys are found on\n   * GPU-accessible memory with shape (n).\n   * @param scores The scores to search on GPU-accessible memory with shape (n).\n   * @parblock\n   * If @p scores is `nullptr`, the score for each key will not be returned.\n   * @endparblock\n   * @param stream The CUDA stream that is used to execute the operation.\n   * @param unique_key If all keys in the same batch are unique.\n   *\n   */\n  virtual void find(const size_type n, const key_type* keys,  // (n)\n                    value_type** values,                      // (n)\n                    bool* founds,                             // (n)\n                    score_type* scores = nullptr,             // (n)\n                    cudaStream_t stream = 0, bool unique_key = true) const = 
0;\n\n  /**\n   * @brief Searches the hash table for the specified keys and returns address\n   * of the values, and will update the scores.\n   *\n   * @note When a key is missing, the data in @p values won't change.\n   * @warning This API returns internal addresses for high-performance but\n   * thread-unsafe. The caller is responsible for guaranteeing data consistency.\n   *\n   * @param n The number of key-value-score tuples to search.\n   * @param keys The keys to search on GPU-accessible memory with shape (n).\n   * @param values The addresses of values to search on GPU-accessible memory\n   * with shape (n).\n   * @param founds The status that indicates if the keys are found on\n   * GPU-accessible memory with shape (n).\n   * @param scores The scores to search on GPU-accessible memory with shape (n).\n   * @parblock\n   * If @p scores is `nullptr`, the score for each key will not be returned.\n   * @endparblock\n   * @param stream The CUDA stream that is used to execute the operation.\n   * @param unique_key If all keys in the same batch are unique.\n   *\n   */\n  virtual void find_and_update(const size_type n, const key_type* keys,  // (n)\n                               value_type** values,                      // (n)\n                               bool* founds,                             // (n)\n                               score_type* scores = nullptr,             // (n)\n                               cudaStream_t stream = 0,\n                               bool unique_key = true) = 0;\n\n  /**\n   * @brief Checks if there are elements with key equivalent to `keys` in the\n   * table.\n   *\n   * @param n The number of `keys` to check.\n   * @param keys The keys to search on GPU-accessible memory with shape (n).\n   * @param founds The result that indicates if the keys are found, and should\n   * be allocated by caller on GPU-accessible memory with shape (n).\n   * @param stream The CUDA stream that is used to execute the operation.\n   *\n   
*/\n  virtual void contains(const size_type n, const key_type* keys,  // (n)\n                        bool* founds,                             // (n)\n                        cudaStream_t stream = 0) const = 0;\n\n  /**\n   * @brief Removes specified elements from the hash table.\n   *\n   * @param n The number of keys to remove.\n   * @param keys The keys to remove on GPU-accessible memory.\n   * @param stream The CUDA stream that is used to execute the operation.\n   *\n   */\n  virtual void erase(const size_type n, const key_type* keys,\n                     cudaStream_t stream = 0) = 0;\n\n  /**\n   * @brief Removes all of the elements in the hash table with no release\n   * object.\n   */\n  virtual void clear(cudaStream_t stream = 0) = 0;\n\n  /**\n   * @brief Exports a certain number of the key-value-score tuples from the\n   * hash table.\n   *\n   * @param n The maximum number of exported pairs.\n   * @param offset The position of the key to search.\n   * @param d_counter Accumulates amount of successfully exported values.\n   * @param keys The keys to dump from GPU-accessible memory with shape (n).\n   * @param values The values to dump from GPU-accessible memory with shape\n   * (n, DIM).\n   * @param scores The scores to search on GPU-accessible memory with shape (n).\n   * @parblock\n   * If @p scores is `nullptr`, the score for each key will not be returned.\n   * @endparblock\n   *\n   * @param stream The CUDA stream that is used to execute the operation.\n   *\n   * @return The number of elements dumped.\n   *\n   * @throw CudaException If the key-value size is too large for GPU shared\n   * memory. 
Reducing the value for @p n is currently required if this exception\n   * occurs.\n   */\n  virtual void export_batch(size_type n, const size_type offset,\n                            size_type* d_counter,          // (1)\n                            key_type* keys,                // (n)\n                            value_type* values,            // (n, DIM)\n                            score_type* scores = nullptr,  // (n)\n                            cudaStream_t stream = 0) const = 0;\n\n  virtual size_type export_batch(const size_type n, const size_type offset,\n                                 key_type* keys,                // (n)\n                                 value_type* values,            // (n, DIM)\n                                 score_type* scores = nullptr,  // (n)\n                                 cudaStream_t stream = 0) const = 0;\n\n  /**\n   * @brief Indicates if the hash table has no elements.\n   *\n   * @param stream The CUDA stream that is used to execute the operation.\n   * @return `true` if the table is empty and `false` otherwise.\n   */\n  virtual bool empty(cudaStream_t stream = 0) const = 0;\n\n  /**\n   * @brief Returns the hash table size.\n   *\n   * @param stream The CUDA stream that is used to execute the operation.\n   * @return The table size.\n   */\n  virtual size_type size(cudaStream_t stream = 0) const = 0;\n\n  /**\n   * @brief Returns the hash table capacity.\n   *\n   * @note The value that is returned might be less than the actual capacity of\n   * the hash table because the hash table currently keeps the capacity to be\n   * a power of 2 for performance considerations.\n   *\n   * @return The table capacity.\n   */\n  virtual size_type capacity() const = 0;\n\n  /**\n   * @brief Sets the number of buckets to the number that is needed to\n   * accommodate at least @p new_capacity elements without exceeding the maximum\n   * load factor. This method rehashes the hash table. 
Rehashing puts the\n   * elements into the appropriate buckets considering that total number of\n   * buckets has changed.\n   *\n   * @note If the value of @p new_capacity or double of @p new_capacity is\n   * greater or equal than `options_.max_capacity`, the reserve does not perform\n   * any change to the hash table.\n   *\n   * @param new_capacity The requested capacity for the hash table.\n   * @param stream The CUDA stream that is used to execute the operation.\n   */\n  virtual void reserve(const size_type new_capacity,\n                       cudaStream_t stream = 0) = 0;\n\n  /**\n   * @brief Returns the average number of elements per slot, that is, size()\n   * divided by capacity().\n   *\n   * @param stream The CUDA stream that is used to execute the operation.\n   *\n   * @return The load factor\n   */\n  virtual float load_factor(cudaStream_t stream = 0) const = 0;\n\n  /**\n   * @brief Set max_capacity of the table.\n   *\n   * @param new_max_capacity The new expecting max_capacity. It must be power\n   * of 2. Otherwise it will raise an error.\n   */\n  virtual void set_max_capacity(size_type new_max_capacity) = 0;\n\n  /**\n   * @brief Returns the dimension of the vectors.\n   *\n   * @return The dimension of the vectors.\n   */\n  virtual size_type dim() const noexcept = 0;\n\n  /**\n   * @brief Returns The length of each bucket.\n   *\n   * @return The length of each bucket.\n   */\n  virtual size_type max_bucket_size() const noexcept = 0;\n\n  /**\n   * @brief Returns the number of buckets in the table.\n   *\n   * @return The number of buckets in the table.\n   */\n  virtual size_type bucket_count() const noexcept = 0;\n\n  /**\n   * @brief Save keys, vectors, scores in table to file or files.\n   *\n   * @param file A BaseKVFile object defined the file format on host filesystem.\n   * @param max_workspace_size Saving is conducted in chunks. 
This value denotes\n   * the maximum amount of temporary memory to use when dumping the table.\n   * Larger values *can* lead to higher performance.\n   * @param stream The CUDA stream used to execute the operation.\n   *\n   * @return Number of KV pairs saved to file.\n   */\n  virtual size_type save(BaseKVFile<K, V, S>* file,\n                         const size_t max_workspace_size = 1L * 1024 * 1024,\n                         cudaStream_t stream = 0) const = 0;\n\n  /**\n   * @brief Load keys, vectors, scores from file to table.\n   *\n   * @param file A BaseKVFile object that defines the file format on the host\n   * filesystem.\n   * @param max_workspace_size Loading is conducted in chunks. This value\n   * denotes the maximum size of such chunks. Larger values *can* lead to higher\n   * performance.\n   * @param stream The CUDA stream used to execute the operation.\n   *\n   * @return Number of keys loaded from file.\n   */\n  virtual size_type load(BaseKVFile<K, V, S>* file,\n                         const size_t max_workspace_size = 1L * 1024 * 1024,\n                         cudaStream_t stream = 0) = 0;\n\n  virtual void set_global_epoch(const uint64_t epoch) = 0;\n};\n\n/**\n * A HierarchicalKV hash table is a concurrent and hierarchical hash table that\n * is powered by GPUs and can use HBM and host memory as storage for key-value\n * pairs. Support for SSD storage is a future consideration.\n *\n * The `score` is introduced to define the importance of each key, the\n * larger, the more important, the less likely they will be evicted. Eviction\n * occurs automatically when a bucket is full. The keys with the minimum `score`\n * value are evicted first. In a customized eviction strategy, we recommend\n * using the timestamp or frequency of the key occurrence as the `score` value\n * for each key. 
You can also assign a special value to the `score` to\n * perform a customized eviction strategy.\n *\n * @note By default configuration, this class is thread-safe.\n *\n * @tparam K The data type of the key.\n * @tparam V The data type of the vector's item type.\n *         The item data type should be a basic data type of C++/CUDA.\n * @tparam S The data type for `score`.\n *           Supported types: `uint64_t` and `uint32_t` (only for\n *           `EvictStrategy::kCustomized`).\n *\n */\ntemplate <typename K, typename V, typename S = uint64_t,\n          int Strategy = EvictStrategy::kLru, typename ArchTag = Sm80>\nclass HashTable : public HashTableBase<K, V, S> {\n public:\n  using size_type = size_t;\n  using key_type = K;\n  using value_type = V;\n  using score_type = S;\n  static constexpr int evict_strategy = Strategy;\n\n  using Pred = EraseIfPredict<key_type, score_type>;\n  using allocator_type = BaseAllocator;\n\n private:\n  using TableCore = nv::merlin::Table<key_type, value_type, score_type>;\n  static constexpr unsigned int TILE_SIZE = 4;\n\n  using DeviceMemoryPool = MemoryPool<DeviceAllocator<char>>;\n  using HostMemoryPool = MemoryPool<HostAllocator<char>>;\n\n public:\n  /**\n   * @brief Default constructor for the hash table class.\n   */\n  HashTable() {\n    static_assert((std::is_same<key_type, int64_t>::value ||\n                   std::is_same<key_type, uint64_t>::value),\n                  \"The key_type must be int64_t or uint64_t.\");\n\n    static_assert((std::is_same<score_type, uint64_t>::value ||\n                   std::is_same<score_type, uint32_t>::value),\n                  \"The score_type must be uint64_t or uint32_t.\");\n\n    // Incompatible: Epoch-based strategies encode epoch(hi32)|score(lo32),\n    // require 64-bit score\n    static_assert(!(std::is_same<score_type, uint32_t>::value &&\n                    (evict_strategy != EvictStrategy::kCustomized)),\n                  \"score_type uint32_t is only compatible 
with Customized; \"\n                  \"use uint64_t.\");\n  };\n\n  /**\n   * @brief Frees the resources used by the hash table and destroys the hash\n   * table object.\n   */\n  ~HashTable() {\n    if (initialized_) {\n      CUDA_CHECK(cudaDeviceSynchronize());\n\n      initialized_ = false;\n      destroy_table<key_type, value_type, score_type>(&table_, allocator_);\n      allocator_->free(MemoryType::Device, d_table_);\n      dev_mem_pool_.reset();\n      host_mem_pool_.reset();\n\n      CUDA_CHECK(cudaDeviceSynchronize());\n      if (default_allocator_ && allocator_ != nullptr) {\n        delete allocator_;\n      }\n    }\n  }\n\n private:\n  HashTable(const HashTable&) = delete;\n  HashTable& operator=(const HashTable&) = delete;\n  HashTable(HashTable&&) = delete;\n  HashTable& operator=(HashTable&&) = delete;\n\n public:\n  /**\n   * @brief Initialize a merlin::HashTable.\n   *\n   * @param options The configuration options.\n   */\n  void init(const HashTableOptions& options,\n            allocator_type* allocator = nullptr) {\n    if (initialized_) {\n      return;\n    }\n    options_ = options;\n\n    // MEMORY_MODE (dual-bucket) specific initialization.\n    if (options_.table_mode == TableMode::kMemory) {\n      // Note: dual-bucket mode does not use max_load_factor for rehash\n      // triggering.  The effective load factor is governed entirely by the\n      // score-based eviction mechanism.  We intentionally leave\n      // max_load_factor at its default value and never consult it.\n      MERLIN_CHECK(options_.init_capacity == options_.max_capacity,\n                   \"[MEMORY_MODE] init_capacity must equal max_capacity. \"\n                   \"Auto-rehash is not supported in dual-bucket mode.\");\n      MERLIN_CHECK(options_.max_hbm_for_vectors == 0,\n                   \"[MEMORY_MODE] Only pure HBM (fast mode) is supported. 
\"\n                   \"Set max_hbm_for_vectors = 0.\");\n      MERLIN_CHECK(\n          options_.dim * sizeof(value_type) <= 224 * sizeof(float),\n          \"[MEMORY_MODE] dim * sizeof(V) must not exceed 896 bytes \"\n          \"(i.e. dim <= 224 for float). The dual-bucket lookup kernel uses a \"\n          \"fixed-size shared memory buffer that cannot accommodate larger \"\n          \"value vectors.\");\n      MERLIN_CHECK(\n          options_.init_capacity / options_.max_bucket_size >= 2,\n          \"[MEMORY_MODE] capacity must provide at least 2 buckets \"\n          \"(capacity >= 2 * max_bucket_size). Dual-bucket addressing \"\n          \"requires b1 != b2, which is impossible with a single bucket.\");\n    }\n\n    MERLIN_CHECK(options.reserved_key_start_bit >= 0 &&\n                     options.reserved_key_start_bit <= MAX_RESERVED_KEY_BIT,\n                 \"options.reserved_key_start_bit should >= 0 and <= 62.\");\n    CUDA_CHECK(init_reserved_keys(options.reserved_key_start_bit));\n\n    default_allocator_ = (allocator == nullptr);\n    allocator_ = (allocator == nullptr) ? 
(new DefaultAllocator()) : allocator;\n\n    thrust_allocator_.set_allocator(allocator_);\n\n    if (options_.device_id >= 0) {\n      CUDA_CHECK(cudaSetDevice(options_.device_id));\n    } else {\n      CUDA_CHECK(cudaGetDevice(&(options_.device_id)));\n    }\n\n    MERLIN_CHECK(ispow2(static_cast<uint32_t>(options_.max_bucket_size)),\n                 \"Bucket size should be the pow of 2\");\n    MERLIN_CHECK(\n        ispow2(static_cast<uint32_t>(options_.num_of_buckets_per_alloc)),\n        \"Then `num_of_buckets_per_alloc` should be the pow of 2\");\n    MERLIN_CHECK(options_.init_capacity >= options_.num_of_buckets_per_alloc *\n                                               options_.max_bucket_size,\n                 \"Then `num_of_buckets_per_alloc` must be equal or less than \"\n                 \"initial required buckets number\");\n\n    options_.block_size = SAFE_GET_BLOCK_SIZE(options_.block_size);\n\n    MERLIN_CHECK(\n        (((options_.max_bucket_size * (sizeof(key_type) + sizeof(score_type))) %\n          128) == 0),\n        \"Storage size of keys and scores in one bucket should be the mutiple \"\n        \"of cache line size\");\n\n    // Construct table.\n    cudaDeviceProp deviceProp;\n    CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, options_.device_id));\n    shared_mem_size_ = deviceProp.sharedMemPerBlock;\n    sm_cnt_ = deviceProp.multiProcessorCount;\n    max_threads_per_block_ = deviceProp.maxThreadsPerBlock;\n    const bool is_memory_mode = (options_.table_mode == TableMode::kMemory);\n    create_table<key_type, value_type, score_type>(\n        &table_, allocator_, options_.dim, options_.init_capacity,\n        options_.max_capacity, options_.max_hbm_for_vectors,\n        options_.max_bucket_size, options_.num_of_buckets_per_alloc,\n        /*tile_size=*/32, /*primary=*/true,\n        /*dual_bucket_mode=*/is_memory_mode);\n    options_.block_size = SAFE_GET_BLOCK_SIZE(options_.block_size);\n    reach_max_capacity_ = 
(options_.init_capacity * 2 > options_.max_capacity);\n\n    // MEMORY_MODE: force disable auto-rehash.\n    if (is_memory_mode) {\n      reach_max_capacity_ = true;  // Disable auto-rehash.\n    }\n\n    MERLIN_CHECK((!(options_.io_by_cpu && options_.max_hbm_for_vectors != 0)),\n                 \"[HierarchicalKV] `io_by_cpu` should not be true when \"\n                 \"`max_hbm_for_vectors` is not 0!\");\n    allocator_->alloc(MemoryType::Device, (void**)&(d_table_),\n                      sizeof(TableCore));\n\n    sync_table_configuration();\n\n    // Create memory pools.\n    dev_mem_pool_ = std::make_unique<MemoryPool<DeviceAllocator<char>>>(\n        options_.device_memory_pool, allocator_);\n    host_mem_pool_ = std::make_unique<MemoryPool<HostAllocator<char>>>(\n        options_.host_memory_pool, allocator_);\n\n    CUDA_CHECK(cudaDeviceSynchronize());\n\n    initialized_ = true;\n    CudaCheckError();\n  }\n\n  /**\n   * @brief Insert new key-value-score tuples into the hash table.\n   * If the key already exists, the values and scores are assigned new values.\n   *\n   * If the target bucket is full, the keys with minimum score will be\n   * overwritten by new key unless the score of the new key is even less than\n   * minimum score of the target bucket.\n   *\n   * @param n Number of key-value-score tuples to insert or assign.\n   * @param keys The keys to insert on GPU-accessible memory with shape\n   * (n).\n   * @param values The values to insert on GPU-accessible memory with\n   * shape (n, DIM).\n   * @param scores The scores to insert on GPU-accessible memory with shape\n   * (n).\n   * @parblock\n   * The scores should be a `uint64_t` value for built-in strategies. 
For\n   * `EvictStrategy::kCustomized`, `uint32_t` scores are also supported.\n   * You can specify a value such as the timestamp of the key insertion or\n   * number of key occurrences to perform a customized eviction strategy.\n   *\n   * The @p scores should be `nullptr`, when the LRU eviction strategy is\n   * applied.\n   * @endparblock\n   *\n   * @param stream The CUDA stream that is used to execute the operation.\n   * @param unique_key If all keys in the same batch are unique.\n   *\n   * @param ignore_evict_strategy A boolean option indicating whether if\n   * the insert_or_assign ignores the evict strategy of table with current\n   * scores anyway. If true, it does not check whether the scores conforms to\n   * the evict strategy. If false, it requires the scores follow the evict\n   * strategy of table.\n   */\n  void insert_or_assign(const size_type n,\n                        const key_type* keys,                // (n)\n                        const value_type* values,            // (n, DIM)\n                        const score_type* scores = nullptr,  // (n)\n                        cudaStream_t stream = 0, bool unique_key = true,\n                        bool ignore_evict_strategy = false) {\n    if (ignore_evict_strategy) {\n      insert_or_assign_impl<EvictStrategy::kCustomized>(\n          n, keys, values, scores, stream, unique_key, ignore_evict_strategy);\n    } else {\n      insert_or_assign_impl<evict_strategy>(n, keys, values, scores, stream,\n                                            unique_key, ignore_evict_strategy);\n    }\n  }\n\n  template <int evict_strategy_>\n  void insert_or_assign_impl(const size_type n,\n                             const key_type* keys,      // (n)\n                             const value_type* values,  // (n, DIM)\n                             const score_type* scores,  // (n)\n                             cudaStream_t stream, bool unique_key,\n                             bool ignore_evict_strategy) {\n    
if (n == 0) {\n      return;\n    }\n\n    while (!reach_max_capacity_ &&\n           fast_load_factor(n, stream) > options_.max_load_factor) {\n      reserve(capacity() * 2, stream);\n    }\n\n    if (!ignore_evict_strategy) {\n      check_evict_strategy(scores);\n    }\n\n    std::unique_ptr<insert_unique_lock> lock_ptr;\n    if (options_.api_lock) {\n      lock_ptr = std::make_unique<insert_unique_lock>(mutex_, stream);\n    }\n\n    // MEMORY_MODE: dual-bucket upsert.\n    if (is_memory_mode()) {\n      MERLIN_CHECK(unique_key,\n                   \"[MEMORY_MODE] insert_or_assign requires unique_key=true \"\n                   \"in dual-bucket mode.\");\n\n      using DualSelector =\n          KernelSelector_DualBucketUpsert<key_type, value_type, score_type,\n                                          evict_strategy_, ArchTag>;\n      typename DualSelector::Params kernelParams(\n          /*load_factor=*/0.0f, table_->buckets, table_->buckets_size,\n          table_->buckets_num, static_cast<uint32_t>(options_.max_bucket_size),\n          static_cast<uint32_t>(options_.dim), keys, values, scores, n,\n          global_epoch_);\n      DualSelector::select_kernel(kernelParams, stream);\n      CudaCheckError();\n      return;\n    }\n\n    if (is_fast_mode()) {\n      static thread_local int step_counter = 0;\n      static thread_local float load_factor = 0.0;\n\n      if (((step_counter++) % kernel_select_interval_) == 0) {\n        load_factor = fast_load_factor(0, stream, false);\n      }\n\n      using Selector = KernelSelector_Upsert<key_type, value_type, score_type,\n                                             evict_strategy_, ArchTag>;\n      if (Selector::callable(unique_key,\n                             static_cast<uint32_t>(options_.max_bucket_size),\n                             static_cast<uint32_t>(options_.dim))) {\n        typename Selector::Params kernelParams(\n            load_factor, table_->buckets, table_->buckets_size,\n            
table_->buckets_num,\n            static_cast<uint32_t>(options_.max_bucket_size),\n            static_cast<uint32_t>(options_.dim), keys, values, scores, n,\n            global_epoch_);\n        Selector::select_kernel(kernelParams, stream);\n      } else {\n        using Selector = SelectUpsertKernelWithIO<key_type, value_type,\n                                                  score_type, evict_strategy_>;\n        Selector::execute_kernel(\n            load_factor, options_.block_size, options_.max_bucket_size,\n            table_->buckets_num, options_.dim, stream, n, d_table_,\n            table_->buckets, keys, reinterpret_cast<const value_type*>(values),\n            scores, global_epoch_);\n      }\n    } else {\n      auto sortOp = SortPairOp<uintptr_t, int>();\n      auto d_sort_bytes = sortOp.get_storage_bytes(n, stream);\n\n      MultiVector<value_type*, int, value_type*, int, key_type*, uint8_t> mv(\n          n, n, n, n, n, d_sort_bytes);\n      const size_type dev_ws_size = mv.total_size();\n      auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)};\n      auto temp_storage = dev_ws.get<uint8_t*>(0);\n      auto d_dst = get_vector<0>(mv, temp_storage);\n      auto d_src_offset = get_vector<1>(mv, temp_storage);\n      auto d_dst_sorted = get_vector<2>(mv, temp_storage);\n      auto d_src_offset_sorted = get_vector<3>(mv, temp_storage);\n      auto keys_ptr = get_vector<4>(mv, temp_storage);\n      auto d_sort_storage = get_vector<5>(mv, temp_storage);\n      sortOp.set_storage(reinterpret_cast<void*>(d_sort_storage));\n\n      CUDA_CHECK(cudaMemsetAsync(d_dst, 0, dev_ws_size, stream));\n\n      constexpr uint32_t MinBucketCapacityFilter =\n          sizeof(VecD_Load) / sizeof(D);\n\n      bool filter_condition =\n          unique_key && options_.max_bucket_size >= MinBucketCapacityFilter &&\n          !options_.io_by_cpu;\n\n      if (filter_condition) {\n        constexpr uint32_t BLOCK_SIZE = 128;\n\n        
upsert_kernel_lock_key_hybrid<key_type, value_type, score_type,\n                                      BLOCK_SIZE, evict_strategy_>\n            <<<(n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(\n                table_->buckets, table_->buckets_size, table_->buckets_num,\n                options_.max_bucket_size, options_.dim, keys, d_dst, scores,\n                keys_ptr, d_src_offset, n, global_epoch_);\n\n      } else {\n        const size_t block_size = options_.block_size;\n        const size_t N = n * TILE_SIZE;\n        const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);\n\n        upsert_kernel<key_type, value_type, score_type, evict_strategy_,\n                      TILE_SIZE><<<grid_size, block_size, 0, stream>>>(\n            d_table_, table_->buckets, options_.max_bucket_size,\n            table_->buckets_num, options_.dim, keys, d_dst, scores,\n            d_src_offset, global_epoch_, N);\n      }\n\n      sortOp.sort(n, reinterpret_cast<uintptr_t*>(d_dst),\n                  reinterpret_cast<uintptr_t*>(d_dst_sorted), d_src_offset,\n                  d_src_offset_sorted, stream);\n\n      if (filter_condition) {\n        const size_t block_size = options_.io_block_size;\n        const size_t N = n * dim();\n        const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);\n\n        write_kernel_unlock_key<key_type, value_type, score_type>\n            <<<grid_size, block_size, 0, stream>>>(values, d_dst_sorted,\n                                                   d_src_offset_sorted, dim(),\n                                                   keys, keys_ptr, N);\n\n      } else if (options_.io_by_cpu) {\n        MultiVector<value_type*, int, value_type> mv1(n, n, n * dim());\n        const size_type host_ws_size = mv1.total_size();\n        auto host_ws{host_mem_pool_->get_workspace<1>(host_ws_size, stream)};\n        auto host_temp_storage = host_ws.get<uint8_t*>(0);\n        auto h_dst_sorted = get_vector<0>(mv1, 
host_temp_storage);\n        auto h_src_offset_sorted = get_vector<1>(mv1, host_temp_storage);\n        auto h_values = get_vector<2>(mv1, host_temp_storage);\n\n        CUDA_CHECK(cudaMemcpyAsync(h_dst_sorted, d_dst_sorted, mv1.offset(2),\n                                   cudaMemcpyDeviceToHost, stream));\n        CUDA_CHECK(cudaMemcpyAsync(h_values, values,\n                                   n * dim() * sizeof(value_type),\n                                   cudaMemcpyDeviceToHost, stream));\n        CUDA_CHECK(cudaStreamSynchronize(stream));\n\n        write_by_cpu<value_type>(h_dst_sorted, h_values, h_src_offset_sorted,\n                                 dim(), n);\n      } else {\n        const size_t block_size = options_.io_block_size;\n        const size_t N = n * dim();\n        const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);\n\n        write_kernel<key_type, value_type, score_type>\n            <<<grid_size, block_size, 0, stream>>>(\n                values, d_dst_sorted, d_src_offset_sorted, dim(), N);\n      }\n    }\n\n    CudaCheckError();\n  }\n\n  /**\n   * @brief Insert new key-value-score tuples into the hash table.\n   * If the key already exists, the values and scores are assigned new values.\n   *\n   * If the target bucket is full, the keys with minimum score will be\n   * overwritten by new key unless the score of the new key is even less than\n   * minimum score of the target bucket. 
The overwritten key with minimum\n   * score will be evicted, with its values and score, to evicted_keys,\n   * evicted_values, evicted_scores separately in compact format.\n   *\n   * @param n Number of key-value-score tuples to insert or assign.\n   * @param keys The keys to insert on GPU-accessible memory with shape\n   * (n).\n   * @param values The values to insert on GPU-accessible memory with\n   * shape (n, DIM).\n   * @param scores The scores to insert on GPU-accessible memory with shape\n   * (n).\n   * @param evicted_keys The output of keys replaced with minimum score.\n   * @param evicted_values The output of values replaced with minimum score on\n   * keys.\n   * @param evicted_scores The output of scores replaced with minimum score on\n   * keys.\n   * @parblock\n   * The scores should be a `uint64_t` value for built-in strategies. For\n   * `EvictStrategy::kCustomized`, `uint32_t` scores are also supported.\n   * You can specify a value such as the timestamp of the key insertion or\n   * number of key occurrences to perform a customized eviction strategy.\n   *\n   * The @p scores should be `nullptr`, when the LRU eviction strategy is\n   * applied.\n   * @endparblock\n   *\n   * @param d_evicted_counter The number of elements evicted on GPU-accessible\n   * memory. @notice The caller should guarantee it is set to `0` before\n   * calling.\n   * @param stream The CUDA stream that is used to execute the operation.\n   * @param unique_key If all keys in the same batch are unique.\n   *\n   * @param ignore_evict_strategy A boolean option indicating whether\n   * insert_and_evict ignores the evict strategy of table with current\n   * scores anyway. If true, it does not check whether the scores conform to\n   * the evict strategy. 
If false, it requires the scores follow the evict\n   * strategy of table.\n   */\n  void insert_and_evict(const size_type n,\n                        const key_type* keys,          // (n)\n                        const value_type* values,      // (n, DIM)\n                        const score_type* scores,      // (n)\n                        key_type* evicted_keys,        // (n)\n                        value_type* evicted_values,    // (n, DIM)\n                        score_type* evicted_scores,    // (n)\n                        size_type* d_evicted_counter,  // (1)\n                        cudaStream_t stream = 0, bool unique_key = true,\n                        bool ignore_evict_strategy = false) {\n    MERLIN_CHECK(\n        !is_memory_mode(),\n        \"[MEMORY_MODE] insert_and_evict() is not supported in dual-bucket \"\n        \"mode. Use insert_or_assign() instead.\");\n    if (n == 0) {\n      return;\n    }\n\n    while (!reach_max_capacity_ &&\n           fast_load_factor(n, stream) > options_.max_load_factor) {\n      reserve(capacity() * 2, stream);\n    }\n\n    if (!ignore_evict_strategy) {\n      check_evict_strategy(scores);\n    }\n\n    std::unique_ptr<insert_unique_lock> lock_ptr;\n    if (options_.api_lock) {\n      lock_ptr = std::make_unique<insert_unique_lock>(mutex_, stream);\n    }\n\n    // TODO: Currently only need eviction when using HashTable as HBM cache.\n    if (!is_fast_mode()) {\n      throw std::runtime_error(\"Only allow insert_and_evict in pure HBM mode.\");\n    }\n\n    static thread_local int step_counter = 0;\n    static thread_local float load_factor = 0.0;\n\n    if (((step_counter++) % kernel_select_interval_) == 0) {\n      load_factor = fast_load_factor(0, stream, false);\n    }\n\n    using Selector =\n        KernelSelector_UpsertAndEvict<key_type, value_type, score_type,\n                                      evict_strategy, ArchTag>;\n    if (Selector::callable(unique_key,\n                           
static_cast<uint32_t>(options_.max_bucket_size),\n                           static_cast<uint32_t>(options_.dim))) {\n      typename Selector::Params kernelParams(\n          load_factor, table_->buckets, table_->buckets_size,\n          table_->buckets_num, static_cast<uint32_t>(options_.max_bucket_size),\n          static_cast<uint32_t>(options_.dim), keys, values, scores,\n          evicted_keys, evicted_values, evicted_scores, n, d_evicted_counter,\n          global_epoch_);\n      Selector::select_kernel(kernelParams, stream);\n    } else if (unique_key and options_.max_bucket_size % 16 == 0) {\n      using KernelLauncher =\n          InsertAndEvictKernelLauncher<key_type, value_type, score_type,\n                                       evict_strategy>;\n      typename KernelLauncher::Params kernelParams(\n          load_factor, table_->buckets, table_->buckets_size,\n          table_->buckets_num, static_cast<uint32_t>(options_.max_bucket_size),\n          static_cast<uint32_t>(options_.dim), keys, values, scores,\n          evicted_keys, evicted_values, evicted_scores, n, d_evicted_counter,\n          global_epoch_);\n      KernelLauncher::launch_kernel(kernelParams, stream);\n    } else {\n      // always use max tile to avoid data-deps as possible.\n      const int TILE_SIZE = 32;\n      size_t n_offsets = (n + TILE_SIZE - 1) / TILE_SIZE;\n      const size_type dev_ws_size =\n          n * (sizeof(key_type) + sizeof(score_type)) +\n          n_offsets * sizeof(int64_t) + n * dim() * sizeof(value_type) +\n          n * sizeof(bool);\n\n      auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)};\n      auto tmp_evict_keys{dev_ws.get<key_type*>(0)};\n      auto tmp_evict_scores = reinterpret_cast<score_type*>(tmp_evict_keys + n);\n      auto d_offsets = reinterpret_cast<int64_t*>(tmp_evict_scores + n);\n      auto tmp_evict_values =\n          reinterpret_cast<value_type*>(d_offsets + n_offsets);\n      auto d_masks = 
reinterpret_cast<bool*>(tmp_evict_values + n * dim());\n\n      CUDA_CHECK(\n          cudaMemsetAsync(d_offsets, 0, n_offsets * sizeof(int64_t), stream));\n      CUDA_CHECK(cudaMemsetAsync(d_masks, 0, n * sizeof(bool), stream));\n\n      size_type block_size = options_.block_size;\n      size_type grid_size = SAFE_GET_GRID_SIZE(n, block_size);\n      CUDA_CHECK(memset64Async(tmp_evict_keys, EMPTY_KEY_CPU, n, stream));\n      using Selector =\n          SelectUpsertAndEvictKernelWithIO<key_type, value_type, score_type,\n                                           evict_strategy>;\n      Selector::execute_kernel(\n          load_factor, options_.block_size, options_.max_bucket_size,\n          table_->buckets_num, options_.dim, stream, n, d_table_,\n          table_->buckets, keys, values, scores, tmp_evict_keys,\n          tmp_evict_values, tmp_evict_scores, global_epoch_);\n      keys_not_empty<K>\n          <<<grid_size, block_size, 0, stream>>>(tmp_evict_keys, d_masks, n);\n\n      gpu_cell_count<int64_t, TILE_SIZE><<<grid_size, block_size, 0, stream>>>(\n          d_masks, d_offsets, n, d_evicted_counter);\n\n      void* d_temp_storage = nullptr;\n      size_t temp_storage_bytes = 0;\n      cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes,\n                                    d_offsets, d_offsets, n_offsets, stream);\n      auto dev_ws1{dev_mem_pool_->get_workspace<1>(temp_storage_bytes, stream)};\n      d_temp_storage = dev_ws1.get<void*>(0);\n      cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes,\n                                    d_offsets, d_offsets, n_offsets, stream);\n\n      compact_key_value_score_kernel<K, V, S, int64_t, TILE_SIZE>\n          <<<grid_size, block_size, 0, stream>>>(\n              d_masks, n, d_offsets, tmp_evict_keys, tmp_evict_values,\n              tmp_evict_scores, evicted_keys, evicted_values, evicted_scores,\n              dim());\n    }\n    return;\n  }\n\n  /**\n   * @brief Insert new 
key-value-score tuples into the hash table.\n   * If the key already exists, the values and scores are assigned new values.\n   *\n   * If the target bucket is full, the keys with minimum score will be\n   * overwritten by new key unless the score of the new key is even less than\n   * minimum score of the target bucket. The overwritten key with minimum\n   * score will be evicted, with its values and score, to evicted_keys,\n   * evicted_values, evicted_scores separately in compact format.\n   *\n   * @param n Number of key-value-score tuples to insert or assign.\n   * @param keys The keys to insert on GPU-accessible memory with shape\n   * (n).\n   * @param values The values to insert on GPU-accessible memory with\n   * shape (n, DIM).\n   * @param scores The scores to insert on GPU-accessible memory with shape\n   * (n).\n   * @param evicted_keys The output of keys replaced with minimum score.\n   * @param evicted_values The output of values replaced with minimum score on\n   * keys.\n   * @param evicted_scores The output of scores replaced with minimum score on\n   * keys.\n   * @parblock\n   * The scores should be a `uint64_t` value for built-in strategies. For\n   * `EvictStrategy::kCustomized`, `uint32_t` scores are also supported.\n   * You can specify a value such as the timestamp of the key insertion or\n   * number of key occurrences to perform a customized eviction strategy.\n   *\n   * The @p scores should be `nullptr`, when the LRU eviction strategy is\n   * applied.\n   * @endparblock\n   *\n   * @param stream The CUDA stream that is used to execute the operation.\n   * @param unique_key If all keys in the same batch are unique.\n   *\n   * @param ignore_evict_strategy A boolean option indicating whether\n   * insert_and_evict ignores the evict strategy of table with current\n   * scores anyway. If true, it does not check whether the scores conform to\n   * the evict strategy. 
If false, it requires the scores follow the evict\n   * strategy of table.\n   *\n   * @return The number of elements evicted.\n   */\n  size_type insert_and_evict(const size_type n,\n                             const key_type* keys,        // (n)\n                             const value_type* values,    // (n, DIM)\n                             const score_type* scores,    // (n)\n                             key_type* evicted_keys,      // (n)\n                             value_type* evicted_values,  // (n, DIM)\n                             score_type* evicted_scores,  // (n)\n                             cudaStream_t stream = 0, bool unique_key = true,\n                             bool ignore_evict_strategy = false) {\n    if (n == 0) {\n      return 0;\n    }\n    auto dev_ws{dev_mem_pool_->get_workspace<1>(sizeof(size_type), stream)};\n    size_type* d_evicted_counter{dev_ws.get<size_type*>(0)};\n\n    CUDA_CHECK(\n        cudaMemsetAsync(d_evicted_counter, 0, sizeof(size_type), stream));\n    insert_and_evict(n, keys, values, scores, evicted_keys, evicted_values,\n                     evicted_scores, d_evicted_counter, stream, unique_key,\n                     ignore_evict_strategy);\n\n    size_type h_evicted_counter = 0;\n    CUDA_CHECK(cudaMemcpyAsync(&h_evicted_counter, d_evicted_counter,\n                               sizeof(size_type), cudaMemcpyDeviceToHost,\n                               stream));\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    CudaCheckError();\n    return h_evicted_counter;\n  }\n\n  /**\n   * Searches for each key in @p keys in the hash table.\n   * If the key is found and the corresponding value in @p accum_or_assigns is\n   * `true`, the @p vectors_or_deltas is treated as a delta to the old\n   * value, and the delta is added to the old value of the key.\n   *\n   * If the key is not found and the corresponding value in @p accum_or_assigns\n   * is `false`, the @p vectors_or_deltas is treated as a new value and the\n  
 * key-value pair is updated in the table directly.\n   *\n   * @note When the key is found and the value of @p accum_or_assigns is\n   * `false`, or when the key is not found and the value of @p accum_or_assigns\n   * is `true`, nothing is changed and this operation is ignored.\n   * The algorithm assumes these situations occur while the key was modified or\n   * removed by other processes just now.\n   *\n   * @param n The number of key-value-score tuples to process.\n   * @param keys The keys to insert on GPU-accessible memory with shape (n).\n   * @param value_or_deltas The values or deltas to insert on GPU-accessible\n   * memory with shape (n, DIM).\n   * @param accum_or_assigns The operation type with shape (n). A value of\n   * `true` indicates to accum and `false` indicates to assign.\n   * @param scores The scores to insert on GPU-accessible memory with shape (n).\n   * @parblock\n   * The scores should be a `uint64_t` value for built-in strategies. For\n   * `EvictStrategy::kCustomized`, `uint32_t` scores are also supported.\n   * You can specify a value such as the timestamp of the key insertion or\n   * number of key occurrences to perform a customized eviction strategy.\n   *\n   * The @p scores should be `nullptr`, when the LRU eviction strategy is\n   * applied.\n   * @endparblock\n   *\n   * @param stream The CUDA stream that is used to execute the operation.\n   *\n   * @param ignore_evict_strategy A boolean option indicating whether\n   * the accum_or_assign ignores the evict strategy of table with current\n   * scores anyway. If true, it does not check whether the scores conform to\n   * the evict strategy. 
If false, it requires the scores follow the evict\n   * strategy of table.\n   */\n  void accum_or_assign(const size_type n,\n                       const key_type* keys,                // (n)\n                       const value_type* value_or_deltas,   // (n, DIM)\n                       const bool* accum_or_assigns,        // (n)\n                       const score_type* scores = nullptr,  // (n)\n                       cudaStream_t stream = 0,\n                       bool ignore_evict_strategy = false) {\n    MERLIN_CHECK(\n        !is_memory_mode(),\n        \"[MEMORY_MODE] accum_or_assign() is not supported in dual-bucket \"\n        \"mode. Use insert_or_assign() instead.\");\n    if (n == 0) {\n      return;\n    }\n\n    while (!reach_max_capacity_ &&\n           fast_load_factor(n, stream) > options_.max_load_factor) {\n      reserve(capacity() * 2, stream);\n    }\n\n    if (!ignore_evict_strategy) {\n      check_evict_strategy(scores);\n    }\n\n    std::unique_ptr<insert_unique_lock> lock_ptr;\n    if (options_.api_lock) {\n      lock_ptr = std::make_unique<insert_unique_lock>(mutex_, stream);\n    }\n\n    if (is_fast_mode()) {\n      using Selector =\n          SelectAccumOrAssignKernelWithIO<key_type, value_type, score_type,\n                                          evict_strategy>;\n      static thread_local int step_counter = 0;\n      static thread_local float load_factor = 0.0;\n\n      if (((step_counter++) % kernel_select_interval_) == 0) {\n        load_factor = fast_load_factor(0, stream, false);\n      }\n      Selector::execute_kernel(\n          load_factor, options_.block_size, options_.max_bucket_size,\n          table_->buckets_num, dim(), stream, n, d_table_, keys,\n          value_or_deltas, scores, accum_or_assigns, global_epoch_);\n\n    } else {\n      auto sortOp = SortPairOp<uintptr_t, int>();\n      auto d_sort_bytes = sortOp.get_storage_bytes(n, stream);\n\n      MultiVector<value_type*, int, value_type*, int, bool, uint8_t> 
mv(\n          n, n, n, n, n, d_sort_bytes);\n      const size_type dev_ws_size = mv.total_size();\n      auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)};\n      auto temp_storage = dev_ws.get<uint8_t*>(0);\n      auto dst = get_vector<0>(mv, temp_storage);\n      auto src_offset = get_vector<1>(mv, temp_storage);\n      auto dst_sorted = get_vector<2>(mv, temp_storage);\n      auto src_offset_sorted = get_vector<3>(mv, temp_storage);\n      auto founds = get_vector<4>(mv, temp_storage);\n      auto d_sort_storage = get_vector<5>(mv, temp_storage);\n      sortOp.set_storage(reinterpret_cast<void*>(d_sort_storage));\n\n      CUDA_CHECK(cudaMemsetAsync(dst, 0, dev_ws_size, stream));\n\n      {\n        const size_t block_size = options_.block_size;\n        const size_t N = n * TILE_SIZE;\n        const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);\n\n        accum_or_assign_kernel<key_type, value_type, score_type, evict_strategy,\n                               TILE_SIZE><<<grid_size, block_size, 0, stream>>>(\n            d_table_, options_.max_bucket_size, table_->buckets_num, dim(),\n            keys, dst, scores, accum_or_assigns, src_offset, founds,\n            global_epoch_, N);\n      }\n\n      sortOp.sort(n, reinterpret_cast<uintptr_t*>(dst),\n                  reinterpret_cast<uintptr_t*>(dst_sorted), src_offset,\n                  src_offset_sorted, stream);\n\n      {\n        const size_t block_size = options_.io_block_size;\n        const size_t N = n * dim();\n        const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);\n\n        write_with_accum_kernel<key_type, value_type, score_type>\n            <<<grid_size, block_size, 0, stream>>>(value_or_deltas, dst_sorted,\n                                                   accum_or_assigns, founds,\n                                                   src_offset_sorted, dim(), N);\n      }\n    }\n    CudaCheckError();\n  }\n\n  /**\n   * @brief Searches the hash table for 
the specified keys.\n   * When a key is missing, the value in @p values and @p scores will be\n   * inserted.\n   *\n   * @param n The number of key-value-score tuples to search or insert.\n   * @param keys The keys to search on GPU-accessible memory with shape (n).\n   * @param values The values to search on GPU-accessible memory with\n   * shape (n, DIM).\n   * @param scores The scores to search on GPU-accessible memory with shape (n).\n   * @parblock\n   * If @p scores is `nullptr`, the score for each key will not be returned.\n   * @endparblock\n   * @param stream The CUDA stream that is used to execute the operation.\n   * @param unique_key If all keys in the same batch are unique.\n   *\n   */\n  void find_or_insert(const size_type n, const key_type* keys,  // (n)\n                      value_type* values,                       // (n * DIM)\n                      score_type* scores = nullptr,             // (n)\n                      cudaStream_t stream = 0, bool unique_key = true,\n                      bool ignore_evict_strategy = false) {\n    MERLIN_CHECK(\n        !is_memory_mode(),\n        \"[MEMORY_MODE] find_or_insert() is not supported in dual-bucket mode. 
\"\n        \"Use insert_or_assign() and find() separately.\");\n    if (n == 0) {\n      return;\n    }\n\n    while (!reach_max_capacity_ &&\n           fast_load_factor(n, stream) > options_.max_load_factor) {\n      reserve(capacity() * 2, stream);\n    }\n\n    if (!ignore_evict_strategy) {\n      check_evict_strategy(scores);\n    }\n\n    std::unique_ptr<insert_unique_lock> lock_ptr;\n    if (options_.api_lock) {\n      lock_ptr = std::make_unique<insert_unique_lock>(mutex_, stream);\n    }\n\n    if (is_fast_mode()) {\n      static thread_local int step_counter = 0;\n      static thread_local float load_factor = 0.0;\n\n      if (((step_counter++) % kernel_select_interval_) == 0) {\n        load_factor = fast_load_factor(0, stream, false);\n      }\n\n      using Selector =\n          KernelSelector_FindOrInsert<key_type, value_type, score_type,\n                                      evict_strategy, ArchTag>;\n      if (Selector::callable(unique_key,\n                             static_cast<uint32_t>(options_.max_bucket_size),\n                             static_cast<uint32_t>(options_.dim))) {\n        typename Selector::Params kernelParams(\n            load_factor, table_->buckets, table_->buckets_size,\n            table_->buckets_num,\n            static_cast<uint32_t>(options_.max_bucket_size),\n            static_cast<uint32_t>(options_.dim), keys, values, scores, n,\n            global_epoch_);\n        Selector::select_kernel(kernelParams, stream);\n      } else {\n        using Selector =\n            SelectFindOrInsertKernelWithIO<key_type, value_type, score_type,\n                                           evict_strategy>;\n        Selector::execute_kernel(\n            load_factor, options_.block_size, options_.max_bucket_size,\n            table_->buckets_num, options_.dim, stream, n, d_table_,\n            table_->buckets, keys, values, scores, global_epoch_);\n      }\n    } else {\n      auto sortOp = SortPairOp<uintptr_t, int>();\n      
auto d_sort_bytes = sortOp.get_storage_bytes(n, stream);\n\n      MultiVector<value_type*, int, value_type*, int, bool, key_type*, uint8_t>\n          mv(n, n, n, n, n, n, d_sort_bytes);\n      const size_type dev_ws_size = mv.total_size();\n      auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)};\n      auto temp_storage = dev_ws.get<uint8_t*>(0);\n      auto d_table_value_addrs = get_vector<0>(mv, temp_storage);\n      auto param_key_index = get_vector<1>(mv, temp_storage);\n      auto d_table_value_addrs_sorted = get_vector<2>(mv, temp_storage);\n      auto param_key_index_sorted = get_vector<3>(mv, temp_storage);\n      auto founds = get_vector<4>(mv, temp_storage);\n      auto keys_ptr = get_vector<5>(mv, temp_storage);\n      auto d_sort_storage = get_vector<6>(mv, temp_storage);\n      sortOp.set_storage(reinterpret_cast<void*>(d_sort_storage));\n\n      CUDA_CHECK(cudaMemsetAsync(d_table_value_addrs, 0, dev_ws_size, stream));\n\n      constexpr uint32_t MinBucketCapacityFilter =\n          sizeof(VecD_Load) / sizeof(D);\n\n      bool filter_condition =\n          unique_key && options_.max_bucket_size >= MinBucketCapacityFilter &&\n          !options_.io_by_cpu;\n\n      if (filter_condition) {\n        constexpr uint32_t BLOCK_SIZE = 128;\n\n        find_or_insert_kernel_lock_key_hybrid<key_type, value_type, score_type,\n                                              BLOCK_SIZE, evict_strategy>\n            <<<(n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(\n                table_->buckets, table_->buckets_size, table_->buckets_num,\n                options_.max_bucket_size, options_.dim, keys,\n                d_table_value_addrs, scores, keys_ptr, param_key_index, founds,\n                n, global_epoch_);\n\n      } else {\n        const size_t block_size = options_.block_size;\n        const size_t N = n * TILE_SIZE;\n        const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);\n\n        
find_or_insert_kernel<key_type, value_type, score_type, evict_strategy,\n                              TILE_SIZE><<<grid_size, block_size, 0, stream>>>(\n            d_table_, table_->buckets, options_.max_bucket_size,\n            table_->buckets_num, options_.dim, keys, d_table_value_addrs,\n            scores, founds, param_key_index, global_epoch_, N);\n      }\n\n      sortOp.sort(n, reinterpret_cast<uintptr_t*>(d_table_value_addrs),\n                  reinterpret_cast<uintptr_t*>(d_table_value_addrs_sorted),\n                  param_key_index, param_key_index_sorted, stream);\n\n      if (filter_condition) {\n        const size_t block_size = options_.io_block_size;\n        const size_t N = n * dim();\n        const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);\n\n        read_or_write_kernel_unlock_key<key_type, value_type, score_type, V>\n            <<<grid_size, block_size, 0, stream>>>(\n                d_table_value_addrs_sorted, values, founds,\n                param_key_index_sorted, keys_ptr, keys, dim(), N);\n\n      } else if (options_.io_by_cpu) {\n        MultiVector<value_type*, int, bool, value_type> mv1(n, n, n, n * dim());\n        const size_type host_ws_size = mv1.total_size();\n        auto host_ws{host_mem_pool_->get_workspace<1>(host_ws_size, stream)};\n        auto host_temp_storage = host_ws.get<uint8_t*>(0);\n        auto h_table_value_addrs_sorted = get_vector<0>(mv1, host_temp_storage);\n        auto h_param_key_index_sorted = get_vector<1>(mv1, host_temp_storage);\n        auto h_founds = get_vector<2>(mv1, host_temp_storage);\n        auto h_param_values = get_vector<3>(mv1, host_temp_storage);\n\n        CUDA_CHECK(cudaMemcpyAsync(h_table_value_addrs_sorted,\n                                   d_table_value_addrs_sorted, mv1.offset(3),\n                                   cudaMemcpyDeviceToHost, stream));\n        CUDA_CHECK(cudaMemcpyAsync(h_param_values, values,\n                                   n * sizeof(value_type) 
* dim(),\n                                   cudaMemcpyDeviceToHost, stream));\n        CUDA_CHECK(cudaStreamSynchronize(stream));\n\n        read_or_write_by_cpu<value_type>(\n            h_table_value_addrs_sorted, h_param_values,\n            h_param_key_index_sorted, h_founds, dim(), n);\n        CUDA_CHECK(cudaMemcpyAsync(values, h_param_values,\n                                   n * sizeof(value_type) * dim(),\n                                   cudaMemcpyHostToDevice, stream));\n      } else {\n        const size_t block_size = options_.io_block_size;\n        const size_t N = n * dim();\n        const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);\n\n        read_or_write_kernel<key_type, value_type, score_type>\n            <<<grid_size, block_size, 0, stream>>>(\n                d_table_value_addrs_sorted, values, founds,\n                param_key_index_sorted, dim(), N);\n      }\n    }\n\n    CudaCheckError();\n  }\n\n  /**\n   * @brief Searches the hash table for the specified keys and returns address\n   * of the values. When a key is missing, the value in @p values and @p scores\n   * will be inserted.\n   *\n   * @warning This API returns internal addresses for high-performance but\n   * thread-unsafe. 
The caller is responsible for guaranteeing data consistency.\n   *\n   * @param n The number of key-value-score tuples to search or insert.\n   * @param keys The keys to search on GPU-accessible memory with shape (n).\n   * @param values  The addresses of values to search on GPU-accessible memory\n   * with shape (n).\n   * @param founds The status that indicates if the keys are found on\n   * GPU-accessible memory with shape (n).\n   * @param scores The scores to search on GPU-accessible memory with shape (n).\n   * @parblock\n   * If @p scores is `nullptr`, the score for each key will not be returned.\n   * @endparblock\n   * @param stream The CUDA stream that is used to execute the operation.\n   * @param unique_key If all keys in the same batch are unique.\n   * @param locked_key_ptrs If it isn't nullptr then the keys in the table will\n   * be locked, and the keys' addresses will be written to locked_key_ptrs. Use\n   * unlock_keys to unlock these keys.\n   *\n   */\n  void find_or_insert(const size_type n, const key_type* keys,  // (n)\n                      value_type** values,                      // (n)\n                      bool* founds,                             // (n)\n                      score_type* scores = nullptr,             // (n)\n                      cudaStream_t stream = 0, bool unique_key = true,\n                      bool ignore_evict_strategy = false,\n                      key_type** locked_key_ptrs = nullptr) {\n    MERLIN_CHECK(\n        !is_memory_mode(),\n        \"[MEMORY_MODE] find_or_insert() is not supported in dual-bucket mode. 
\"\n        \"Use insert_or_assign() and find() separately.\");\n    if (n == 0) {\n      return;\n    }\n\n    while (!reach_max_capacity_ &&\n           fast_load_factor(n, stream) > options_.max_load_factor) {\n      reserve(capacity() * 2, stream);\n    }\n\n    if (!ignore_evict_strategy) {\n      check_evict_strategy(scores);\n    }\n\n    std::unique_ptr<insert_unique_lock> lock_ptr;\n    if (options_.api_lock) {\n      lock_ptr = std::make_unique<insert_unique_lock>(mutex_, stream);\n    }\n\n    constexpr uint32_t MinBucketCapacityFilter = sizeof(VecD_Load) / sizeof(D);\n\n    if (locked_key_ptrs != nullptr) {\n      if (!unique_key || options_.max_bucket_size < MinBucketCapacityFilter) {\n        throw std::invalid_argument(\n            \"unique_key should be true and max_bucket_size should be larger.\");\n      }\n\n      constexpr uint32_t BLOCK_SIZE = 128U;\n      find_or_insert_ptr_kernel_lock_key<key_type, value_type, score_type,\n                                         BLOCK_SIZE, evict_strategy>\n          <<<(n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(\n              table_->buckets, table_->buckets_size, table_->buckets_num,\n              options_.max_bucket_size, options_.dim, keys, values, scores,\n              locked_key_ptrs, n, founds, global_epoch_);\n      CudaCheckError();\n      return;\n    }\n\n    if (unique_key && options_.max_bucket_size >= MinBucketCapacityFilter) {\n      constexpr uint32_t BLOCK_SIZE = 128U;\n\n      const size_type dev_ws_size{n * sizeof(key_type**)};\n      auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)};\n      auto keys_ptr{dev_ws.get<key_type**>(0)};\n      CUDA_CHECK(cudaMemsetAsync(keys_ptr, 0, dev_ws_size, stream));\n\n      find_or_insert_ptr_kernel_lock_key<key_type, value_type, score_type,\n                                         BLOCK_SIZE, evict_strategy>\n          <<<(n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(\n              table_->buckets, 
table_->buckets_size, table_->buckets_num,\n              options_.max_bucket_size, options_.dim, keys, values, scores,\n              keys_ptr, n, founds, global_epoch_);\n\n      find_or_insert_ptr_kernel_unlock_key<key_type>\n          <<<(n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(\n              keys, keys_ptr, n);\n    } else {\n      using Selector = SelectFindOrInsertPtrKernel<key_type, value_type,\n                                                   score_type, evict_strategy>;\n      static thread_local int step_counter = 0;\n      static thread_local float load_factor = 0.0;\n\n      if (((step_counter++) % kernel_select_interval_) == 0) {\n        load_factor = fast_load_factor(0, stream, false);\n      }\n      Selector::execute_kernel(\n          load_factor, options_.block_size, options_.max_bucket_size,\n          table_->buckets_num, options_.dim, stream, n, d_table_,\n          table_->buckets, keys, values, scores, founds, global_epoch_);\n    }\n\n    CudaCheckError();\n  }\n\n  /**\n   * @brief\n   * This function will lock the keys in the table and nonexistent keys will be\n   * ignored.\n   *\n   * @param n The number of keys in the table to be locked.\n   * @param locked_key_ptrs The pointers of locked keys in the table with shape\n   * (n).\n   * @param keys The keys to search on GPU-accessible memory with shape (n).\n   * @param success The status that indicates if the lock operation\n   * succeeded.\n   * @param stream The CUDA stream that is used to execute the operation.\n   * @param scores If provided, the scores of the input keys will be set to\n   * these scores.\n   *\n   */\n  void lock_keys(const size_type n,\n                 key_type const* keys,        // (n)\n                 key_type** locked_key_ptrs,  // (n)\n                 bool* success = nullptr,     // (n)\n                 cudaStream_t stream = 0, score_type const* scores = nullptr) {\n    if (n == 0) {\n      return;\n    }\n\n    std::unique_ptr<insert_unique_lock> 
lock_ptr;\n    if (options_.api_lock) {\n      lock_ptr = std::make_unique<insert_unique_lock>(mutex_, stream);\n    }\n\n    constexpr uint32_t MinBucketCapacityFilter = sizeof(VecD_Load) / sizeof(D);\n    if (options_.max_bucket_size < MinBucketCapacityFilter) {\n      throw std::runtime_error(\n          \"Not support lock_keys API because the bucket capacity is too \"\n          \"small.\");\n    }\n    constexpr uint32_t BLOCK_SIZE = 128U;\n    lock_kernel_with_filter<key_type, value_type, score_type, evict_strategy>\n        <<<(n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(\n            table_->buckets, table_->buckets_num, options_.max_bucket_size,\n            options_.dim, keys, locked_key_ptrs, success, scores, global_epoch_,\n            n);\n    CudaCheckError();\n  }\n\n  /**\n   * @brief Uses pointers to address the keys in the hash table and sets them\n   * to the target keys.\n   * This function will unlock the keys in the table which are locked by\n   * the previous call to find_or_insert.\n   *\n   * @param n The number of keys in the table to be unlocked.\n   * @param locked_key_ptrs The pointers of locked keys in the table with shape\n   * (n).\n   * @param keys The keys to search on GPU-accessible memory with shape (n).\n   * @param success The status that indicates if the unlock operation\n   * succeeded.\n   * @param stream The CUDA stream that is used to execute the operation.\n   *\n   */\n  void unlock_keys(const size_type n, key_type** locked_key_ptrs,  // (n)\n                   const key_type* keys,                           // (n)\n                   bool* success = nullptr,                        // (n)\n                   cudaStream_t stream = 0) {\n    if (n == 0) {\n      return;\n    }\n\n    std::unique_ptr<insert_unique_lock> lock_ptr;\n    if (options_.api_lock) {\n      lock_ptr = std::make_unique<insert_unique_lock>(mutex_, stream);\n    }\n\n    constexpr uint32_t BLOCK_SIZE = 128U;\n    /// TODO: check the key 
belongs to the bucket.\n    unlock_keys_kernel<key_type>\n        <<<(n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(\n            n, locked_key_ptrs, keys, success);\n  }\n\n  /**\n   * @brief Assign new key-value-score tuples into the hash table.\n   * If the key doesn't exist, the operation on the key will be ignored.\n   *\n   * @param n Number of key-value-score tuples to insert or assign.\n   * @param keys The keys to insert on GPU-accessible memory with shape\n   * (n).\n   * @param values The values to insert on GPU-accessible memory with\n   * shape (n, DIM).\n   * @param scores The scores to insert on GPU-accessible memory with shape\n   * (n).\n   * @parblock\n   * The scores should be a `uint64_t` value for built-in strategies. For\n   * `EvictStrategy::kCustomized`, `uint32_t` scores are also supported.\n   * You can specify a value such as the timestamp of the key insertion or\n   * number of key occurrences to perform a customized eviction strategy.\n   *\n   * The @p scores should be `nullptr`, when the LRU eviction strategy is\n   * applied.\n   * @endparblock\n   *\n   * @param stream The CUDA stream that is used to execute the operation.\n   *\n   * @param unique_key If all keys in the same batch are unique.\n   */\n  void assign(const size_type n,\n              const key_type* keys,                // (n)\n              const value_type* values,            // (n, DIM)\n              const score_type* scores = nullptr,  // (n)\n              cudaStream_t stream = 0, bool unique_key = true) {\n    if (n == 0) {\n      return;\n    }\n\n    check_evict_strategy(scores);\n\n    std::unique_ptr<update_shared_lock> lock_ptr;\n    if (options_.api_lock) {\n      lock_ptr = std::make_unique<update_shared_lock>(mutex_, stream);\n    }\n\n    if (is_fast_mode()) {\n      static thread_local int step_counter = 0;\n      static thread_local float load_factor = 0.0;\n\n      if (((step_counter++) % kernel_select_interval_) == 0) {\n        
load_factor = fast_load_factor(0, stream, false);\n      }\n      using Selector = KernelSelector_Update<key_type, value_type, score_type,\n                                             evict_strategy, ArchTag>;\n      if (Selector::callable(unique_key,\n                             static_cast<uint32_t>(options_.max_bucket_size),\n                             static_cast<uint32_t>(options_.dim))) {\n        typename Selector::Params kernelParams(\n            load_factor, table_->buckets, table_->buckets_num,\n            static_cast<uint32_t>(options_.max_bucket_size),\n            static_cast<uint32_t>(options_.dim), keys, values, scores, n,\n            global_epoch_);\n        Selector::select_kernel(kernelParams, stream);\n      } else {\n        using Selector = SelectUpdateKernelWithIO<key_type, value_type,\n                                                  score_type, evict_strategy>;\n        Selector::execute_kernel(\n            load_factor, options_.block_size, options_.max_bucket_size,\n            table_->buckets_num, options_.dim, stream, n, d_table_,\n            table_->buckets, keys, values, scores, global_epoch_);\n      }\n    } else {\n      auto sortOp = SortPairOp<uintptr_t, int>();\n      auto d_sort_bytes = sortOp.get_storage_bytes(n, stream);\n\n      MultiVector<value_type*, int, value_type*, int, key_type*, uint8_t> mv(\n          n, n, n, n, n, d_sort_bytes);\n      const size_type dev_ws_size = mv.total_size();\n      auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)};\n      auto temp_storage = dev_ws.get<uint8_t*>(0);\n      auto d_dst = get_vector<0>(mv, temp_storage);\n      auto d_src_offset = get_vector<1>(mv, temp_storage);\n      auto d_dst_sorted = get_vector<2>(mv, temp_storage);\n      auto d_src_offset_sorted = get_vector<3>(mv, temp_storage);\n      auto keys_ptr = get_vector<4>(mv, temp_storage);\n      auto d_sort_storage = get_vector<5>(mv, temp_storage);\n      
sortOp.set_storage(reinterpret_cast<void*>(d_sort_storage));\n\n      CUDA_CHECK(cudaMemsetAsync(d_dst, 0, dev_ws_size, stream));\n\n      constexpr uint32_t MinBucketCapacityFilter =\n          sizeof(VecD_Load) / sizeof(D);\n\n      bool filter_condition =\n          options_.max_bucket_size >= MinBucketCapacityFilter &&\n          !options_.io_by_cpu && unique_key;\n\n      if (filter_condition) {\n        constexpr uint32_t BLOCK_SIZE = 128U;\n\n        tlp_update_kernel_hybrid<key_type, value_type, score_type,\n                                 evict_strategy>\n            <<<(n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(\n                table_->buckets, table_->buckets_num, options_.max_bucket_size,\n                options_.dim, keys, d_dst, scores, keys_ptr, d_src_offset,\n                global_epoch_, n);\n\n      } else {\n        const size_t block_size = options_.block_size;\n        const size_t N = n * TILE_SIZE;\n        const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);\n\n        update_kernel<key_type, value_type, score_type, evict_strategy,\n                      TILE_SIZE><<<grid_size, block_size, 0, stream>>>(\n            d_table_, table_->buckets, options_.max_bucket_size,\n            table_->buckets_num, options_.dim, keys, d_dst, scores,\n            d_src_offset, global_epoch_, N);\n      }\n\n      sortOp.sort(n, reinterpret_cast<uintptr_t*>(d_dst),\n                  reinterpret_cast<uintptr_t*>(d_dst_sorted), d_src_offset,\n                  d_src_offset_sorted, stream);\n\n      if (filter_condition) {\n        const size_t block_size = options_.io_block_size;\n        const size_t N = n * dim();\n        const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);\n\n        write_kernel_unlock_key<key_type, value_type, score_type>\n            <<<grid_size, block_size, 0, stream>>>(values, d_dst_sorted,\n                                                   d_src_offset_sorted, dim(),\n                            
                       keys, keys_ptr, N);\n\n      } else if (options_.io_by_cpu) {\n        MultiVector<value_type*, int, value_type> mv1(n, n, n * dim());\n        const size_type host_ws_size = mv1.total_size();\n        auto host_ws{host_mem_pool_->get_workspace<1>(host_ws_size, stream)};\n        auto host_temp_storage = host_ws.get<uint8_t*>(0);\n        auto h_dst_sorted = get_vector<0>(mv1, host_temp_storage);\n        auto h_src_offset_sorted = get_vector<1>(mv1, host_temp_storage);\n        auto h_values = get_vector<2>(mv1, host_temp_storage);\n\n        CUDA_CHECK(cudaMemcpyAsync(h_dst_sorted, d_dst_sorted, mv1.offset(2),\n                                   cudaMemcpyDeviceToHost, stream));\n        CUDA_CHECK(cudaMemcpyAsync(h_values, values,\n                                   n * dim() * sizeof(value_type),\n                                   cudaMemcpyDeviceToHost, stream));\n        CUDA_CHECK(cudaStreamSynchronize(stream));\n\n        write_by_cpu<value_type>(h_dst_sorted, h_values, h_src_offset_sorted,\n                                 dim(), n);\n      } else {\n        const size_t block_size = options_.io_block_size;\n        const size_t N = n * dim();\n        const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);\n\n        write_kernel<key_type, value_type, score_type>\n            <<<grid_size, block_size, 0, stream>>>(\n                values, d_dst_sorted, d_src_offset_sorted, dim(), N);\n      }\n    }\n\n    CudaCheckError();\n  }\n\n  /**\n   * @brief Assign new scores for keys.\n   * If the key doesn't exist, the operation on the key will be ignored.\n   *\n   * @param n Number of key-score pairs to assign.\n   * @param keys The keys to insert on GPU-accessible memory with shape\n   * (n).\n   * @parblock\n   * The scores should be a `uint64_t` value for built-in strategies. 
For\n   * `EvictStrategy::kCustomized`, `uint32_t` scores are also supported.\n   * You can specify a value such as the timestamp of the key insertion or\n   * number of key occurrences to perform a customized eviction strategy.\n   *\n   * The @p scores should be `nullptr`, when the LRU eviction strategy is\n   * applied.\n   * @endparblock\n   *\n   * @param stream The CUDA stream that is used to execute the operation.\n   *\n   * @param unique_key If all keys in the same batch are unique.\n   */\n  void assign_scores(const size_type n,\n                     const key_type* keys,                // (n)\n                     const score_type* scores = nullptr,  // (n)\n                     cudaStream_t stream = 0, bool unique_key = true) {\n    MERLIN_CHECK(\n        !is_memory_mode(),\n        \"[MEMORY_MODE] assign_scores() is not supported in dual-bucket mode. \"\n        \"Scores are managed by insert_or_assign() in MEMORY_MODE.\");\n    if (n == 0) {\n      return;\n    }\n\n    check_evict_strategy(scores);\n\n    {\n      std::unique_ptr<update_shared_lock> lock_ptr;\n      if (options_.api_lock) {\n        lock_ptr = std::make_unique<update_shared_lock>(mutex_, stream);\n      }\n      static thread_local int step_counter = 0;\n      static thread_local float load_factor = 0.0;\n\n      if (((step_counter++) % kernel_select_interval_) == 0) {\n        load_factor = fast_load_factor(0, stream, false);\n      }\n      using Selector = KernelSelector_UpdateScore<key_type, value_type,\n                                                  score_type, evict_strategy>;\n      if (Selector::callable(unique_key,\n                             static_cast<uint32_t>(options_.max_bucket_size))) {\n        typename Selector::Params kernelParams(\n            load_factor, table_->buckets, table_->buckets_num,\n            static_cast<uint32_t>(options_.max_bucket_size), keys, scores, n,\n            global_epoch_);\n        Selector::select_kernel(kernelParams, stream);\n    
  } else {\n        using Selector = SelectUpdateScoreKernel<key_type, value_type,\n                                                 score_type, evict_strategy>;\n        Selector::execute_kernel(load_factor, options_.block_size,\n                                 options_.max_bucket_size, table_->buckets_num,\n                                 stream, n, d_table_, table_->buckets, keys,\n                                 scores, global_epoch_);\n      }\n    }\n\n    CudaCheckError();\n  }\n\n  /**\n   * @brief Alias of `assign_scores`.\n   */\n  void assign(const size_type n,\n              const key_type* keys,                // (n)\n              const score_type* scores = nullptr,  // (n)\n              cudaStream_t stream = 0, bool unique_key = true) {\n    assign_scores(n, keys, scores, stream, unique_key);\n  }\n\n  /**\n   * @brief Assign new values for each key.\n   * If the key doesn't exist, the operation on the key will be ignored.\n   *\n   * @param n Number of key-value pairs to assign.\n   * @param keys The keys to be operated on, which must be on GPU-accessible\n   * memory with shape (n).\n   * @param values The new values to assign, which must be on\n   * GPU-accessible memory with shape (n, DIM).\n   *\n   * @param stream The CUDA stream that is used to execute the operation.\n   *\n   * @param unique_key If all keys in the same batch are unique.\n   */\n  void assign_values(const size_type n,\n                     const key_type* keys,      // (n)\n                     const value_type* values,  // (n, DIM)\n                     cudaStream_t stream = 0, bool unique_key = true) {\n    MERLIN_CHECK(\n        !is_memory_mode(),\n        \"[MEMORY_MODE] assign_values() is not supported in dual-bucket mode. 
\"\n        \"Use insert_or_assign() to update values in MEMORY_MODE.\");\n    if (n == 0) {\n      return;\n    }\n\n    std::unique_ptr<update_shared_lock> lock_ptr;\n    if (options_.api_lock) {\n      lock_ptr = std::make_unique<update_shared_lock>(mutex_, stream);\n    }\n\n    if (is_fast_mode()) {\n      static thread_local int step_counter = 0;\n      static thread_local float load_factor = 0.0;\n\n      if (((step_counter++) % kernel_select_interval_) == 0) {\n        load_factor = fast_load_factor(0, stream, false);\n      }\n      using Selector = KernelSelector_UpdateValues<key_type, value_type,\n                                                   score_type, ArchTag>;\n      if (Selector::callable(unique_key,\n                             static_cast<uint32_t>(options_.max_bucket_size),\n                             static_cast<uint32_t>(options_.dim))) {\n        typename Selector::Params kernelParams(\n            load_factor, table_->buckets, table_->buckets_num,\n            static_cast<uint32_t>(options_.max_bucket_size),\n            static_cast<uint32_t>(options_.dim), keys, values, n);\n        Selector::select_kernel(kernelParams, stream);\n      } else {\n        using Selector =\n            SelectUpdateValuesKernelWithIO<key_type, value_type, score_type>;\n        Selector::execute_kernel(load_factor, options_.block_size,\n                                 options_.max_bucket_size, table_->buckets_num,\n                                 options_.dim, stream, n, d_table_,\n                                 table_->buckets, keys, values);\n      }\n    } else {\n      auto sortOp = SortPairOp<uintptr_t, int>();\n      auto d_sort_bytes = sortOp.get_storage_bytes(n, stream);\n\n      MultiVector<value_type*, int, value_type*, int, key_type*, uint8_t> mv(\n          n, n, n, n, n, d_sort_bytes);\n      const size_type dev_ws_size = mv.total_size();\n      auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)};\n      auto temp_storage = 
dev_ws.get<uint8_t*>(0);\n      auto d_dst = get_vector<0>(mv, temp_storage);\n      auto d_src_offset = get_vector<1>(mv, temp_storage);\n      auto d_dst_sorted = get_vector<2>(mv, temp_storage);\n      auto d_src_offset_sorted = get_vector<3>(mv, temp_storage);\n      auto keys_ptr = get_vector<4>(mv, temp_storage);\n      auto d_sort_storage = get_vector<5>(mv, temp_storage);\n      sortOp.set_storage(reinterpret_cast<void*>(d_sort_storage));\n\n      CUDA_CHECK(cudaMemsetAsync(d_dst, 0, dev_ws_size, stream));\n\n      constexpr uint32_t MinBucketCapacityFilter =\n          sizeof(VecD_Load) / sizeof(D);\n\n      bool filter_condition =\n          options_.max_bucket_size >= MinBucketCapacityFilter &&\n          !options_.io_by_cpu && unique_key;\n\n      if (filter_condition) {\n        constexpr uint32_t BLOCK_SIZE = 128U;\n\n        tlp_update_values_kernel_hybrid<key_type, value_type, score_type>\n            <<<(n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(\n                table_->buckets, table_->buckets_num, options_.max_bucket_size,\n                options_.dim, keys, d_dst, keys_ptr, d_src_offset, n);\n\n      } else {\n        const size_t block_size = options_.block_size;\n        const size_t N = n * TILE_SIZE;\n        const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);\n\n        update_values_kernel<key_type, value_type, score_type, TILE_SIZE>\n            <<<grid_size, block_size, 0, stream>>>(\n                d_table_, table_->buckets, options_.max_bucket_size,\n                table_->buckets_num, options_.dim, keys, d_dst, d_src_offset,\n                N);\n      }\n\n      sortOp.sort(n, reinterpret_cast<uintptr_t*>(d_dst),\n                  reinterpret_cast<uintptr_t*>(d_dst_sorted), d_src_offset,\n                  d_src_offset_sorted, stream);\n\n      if (filter_condition) {\n        const size_t block_size = options_.io_block_size;\n        uint64_t total_value_size = sizeof(value_type) * dim();\n        if 
(total_value_size % 16 == 0) {\n          using VecV = byte16;\n          uint64_t vec_dim = total_value_size / sizeof(VecV);\n          const size_t N = n * vec_dim;\n          const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);\n\n          write_kernel_unlock_key<key_type, VecV, score_type>\n              <<<grid_size, block_size, 0, stream>>>(\n                  reinterpret_cast<const VecV*>(values),\n                  reinterpret_cast<VecV**>(d_dst_sorted), d_src_offset_sorted,\n                  vec_dim, keys, keys_ptr, N);\n        } else {\n          const size_t N = n * dim();\n          const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);\n\n          write_kernel_unlock_key<key_type, value_type, score_type>\n              <<<grid_size, block_size, 0, stream>>>(values, d_dst_sorted,\n                                                     d_src_offset_sorted, dim(),\n                                                     keys, keys_ptr, N);\n        }\n      } else if (options_.io_by_cpu) {\n        MultiVector<value_type*, int, value_type> mv1(n, n, n * dim());\n        const size_type host_ws_size = mv1.total_size();\n        auto host_ws{host_mem_pool_->get_workspace<1>(host_ws_size, stream)};\n        auto host_temp_storage = host_ws.get<uint8_t*>(0);\n        auto h_dst_sorted = get_vector<0>(mv1, host_temp_storage);\n        auto h_src_offset_sorted = get_vector<1>(mv1, host_temp_storage);\n        auto h_values = get_vector<2>(mv1, host_temp_storage);\n\n        CUDA_CHECK(cudaMemcpyAsync(h_dst_sorted, d_dst_sorted, mv1.offset(2),\n                                   cudaMemcpyDeviceToHost, stream));\n        CUDA_CHECK(cudaMemcpyAsync(h_values, values,\n                                   n * dim() * sizeof(value_type),\n                                   cudaMemcpyDeviceToHost, stream));\n        CUDA_CHECK(cudaStreamSynchronize(stream));\n\n        write_by_cpu<value_type>(h_dst_sorted, h_values, h_src_offset_sorted,\n                    
             dim(), n);\n      } else {\n        const size_t block_size = options_.io_block_size;\n        const size_t N = n * dim();\n        const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);\n\n        write_kernel<key_type, value_type, score_type>\n            <<<grid_size, block_size, 0, stream>>>(\n                values, d_dst_sorted, d_src_offset_sorted, dim(), N);\n      }\n    }\n\n    CudaCheckError();\n  }\n\n  /**\n   * @brief Searches the hash table for the specified keys.\n   *\n   * @note When a key is missing, the value in @p values is not changed.\n   *\n   * @param n The number of key-value-score tuples to search.\n   * @param keys The keys to search on GPU-accessible memory with shape (n).\n   * @param values The values to search on GPU-accessible memory with\n   * shape (n, DIM).\n   * @param founds The status that indicates if the keys are found on\n   * GPU-accessible memory with shape (n).\n   * @param scores The scores to search on GPU-accessible memory with shape (n).\n   * @parblock\n   * If @p scores is `nullptr`, the score for each key will not be returned.\n   * @endparblock\n   * @param stream The CUDA stream that is used to execute the operation.\n   *\n   */\n  void find(const size_type n, const key_type* keys,  // (n)\n            value_type* values,                       // (n, DIM)\n            bool* founds,                             // (n)\n            score_type* scores = nullptr,             // (n)\n            cudaStream_t stream = 0) const {\n    if (n == 0) {\n      return;\n    }\n\n    CUDA_CHECK(cudaMemsetAsync(founds, 0, n * sizeof(bool), stream));\n\n    std::unique_ptr<read_shared_lock> lock_ptr;\n    if (options_.api_lock) {\n      lock_ptr = std::make_unique<read_shared_lock>(mutex_, stream);\n    }\n\n    const uint32_t value_size = dim() * sizeof(V);\n\n    // MEMORY_MODE: dual-bucket find (sequential b1 then b2).\n    if (is_memory_mode()) {\n      using DualSelector = 
SelectDualBucketLookupKernel<key_type, value_type,\n                                                        score_type, ArchTag>;\n      LookupKernelParams<key_type, value_type, score_type> lookupParams(\n          table_->buckets, table_->buckets_num, static_cast<uint32_t>(dim()),\n          keys, values, scores, founds, n);\n      DualSelector::select_kernel(lookupParams, table_->buckets_size, stream);\n      CudaCheckError();\n      return;\n    }\n\n    if (is_fast_mode()) {\n      using Selector = SelectPipelineLookupKernelWithIO<key_type, value_type,\n                                                        score_type, ArchTag>;\n      const uint32_t pipeline_max_size = Selector::max_value_size();\n      // Pipeline lookup kernel only supports \"bucket_size = 128\".\n      if (options_.max_bucket_size == 128 && value_size <= pipeline_max_size) {\n        LookupKernelParams<key_type, value_type, score_type> lookupParams(\n            table_->buckets, table_->buckets_num, static_cast<uint32_t>(dim()),\n            keys, values, scores, founds, n);\n        Selector::select_kernel(lookupParams, stream);\n      } else {\n        using Selector =\n            SelectLookupKernelWithIO<key_type, value_type, score_type>;\n        static thread_local int step_counter = 0;\n        static thread_local float load_factor = 0.0;\n\n        if (((step_counter++) % kernel_select_interval_) == 0) {\n          load_factor = fast_load_factor(0, stream, false);\n        }\n        Selector::execute_kernel(load_factor, options_.block_size,\n                                 options_.max_bucket_size, table_->buckets_num,\n                                 options_.dim, stream, n, d_table_,\n                                 table_->buckets, keys, values, scores, founds);\n      }\n    } else {\n      auto sortOp = SortPairOp<uintptr_t, int>();\n      auto d_sort_bytes = sortOp.get_storage_bytes(n, stream);\n\n      MultiVector<value_type*, int, value_type*, int, uint8_t> mv(n, n, n, 
n,\n                                                                  d_sort_bytes);\n      const size_type dev_ws_size = mv.total_size();\n      auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)};\n      auto temp_storage = dev_ws.get<uint8_t*>(0);\n      auto src = get_vector<0>(mv, temp_storage);\n      auto dst_offset = get_vector<1>(mv, temp_storage);\n      auto src_sorted = get_vector<2>(mv, temp_storage);\n      auto dst_offset_sorted = get_vector<3>(mv, temp_storage);\n      auto d_sort_storage = get_vector<4>(mv, temp_storage);\n      sortOp.set_storage(reinterpret_cast<void*>(d_sort_storage));\n\n      CUDA_CHECK(cudaMemsetAsync(src, 0, dev_ws_size, stream));\n\n      constexpr uint32_t MinBucketCapacityFilter =\n          sizeof(VecD_Load) / sizeof(D);\n\n      bool filter_condition =\n          options_.max_bucket_size >= MinBucketCapacityFilter;\n\n      if (filter_condition) {\n        constexpr uint32_t BLOCK_SIZE = 128U;\n\n        tlp_lookup_kernel_hybrid<key_type, value_type, score_type>\n            <<<(n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(\n                table_->buckets, table_->buckets_num, options_.max_bucket_size,\n                options_.dim, keys, src, scores, dst_offset, founds, n);\n      } else {\n        const size_t block_size = options_.block_size;\n        const size_t N = n * TILE_SIZE;\n        const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);\n\n        lookup_kernel<key_type, value_type, score_type, TILE_SIZE>\n            <<<grid_size, block_size, 0, stream>>>(\n                d_table_, table_->buckets, options_.max_bucket_size,\n                table_->buckets_num, options_.dim, keys, src, scores, founds,\n                dst_offset, N);\n      }\n\n      if (values != nullptr) {\n        sortOp.sort(n, reinterpret_cast<uintptr_t*>(src),\n                    reinterpret_cast<uintptr_t*>(src_sorted), dst_offset,\n                    dst_offset_sorted, stream);\n\n        const 
size_t block_size = options_.io_block_size;\n        const size_t N = n * dim();\n        const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);\n\n        read_kernel<key_type, value_type, score_type>\n            <<<grid_size, block_size, 0, stream>>>(src_sorted, values, founds,\n                                                   dst_offset_sorted, dim(), N);\n      }\n    }\n\n    CudaCheckError();\n  }\n\n  /**\n   * @brief Searches the hash table for the specified keys.\n   *\n   * @note When the searched keys are not hit, missed keys/indices/size can be\n   * obtained.\n   *\n   * @param n The number of key-value-score tuples to search.\n   * @param keys The keys to search on GPU-accessible memory with shape (n).\n   * @param values The values to search on GPU-accessible memory with\n   * shape (n, DIM).\n   * @param missed_keys The missed keys to search on GPU-accessible memory with\n   * shape (n).\n   * @param missed_indices The missed indices to search on GPU-accessible memory\n   * with shape (n).\n   * @param missed_size The size of `missed_keys` and `missed_indices`.\n   * @param scores The scores to search on GPU-accessible memory with shape (n).\n   * @parblock\n   * If @p scores is `nullptr`, the score for each key will not be returned.\n   * @endparblock\n   * @param stream The CUDA stream that is used to execute the operation.\n   */\n  void find(const size_type n, const key_type* keys,  // (n)\n            value_type* values,                       // (n, DIM)\n            key_type* missed_keys,                    // (n)\n            int* missed_indices,                      // (n)\n            int* missed_size,                         // scalar\n            score_type* scores = nullptr,             // (n)\n            cudaStream_t stream = 0) const {\n    if (n == 0) {\n      return;\n    }\n\n    CUDA_CHECK(cudaMemsetAsync(missed_size, 0, sizeof(*missed_size), stream));\n\n    std::unique_ptr<read_shared_lock> lock_ptr;\n    if 
(options_.api_lock) {\n      lock_ptr = std::make_unique<read_shared_lock>(mutex_, stream);\n    }\n\n    const uint32_t value_size = options_.dim * sizeof(V);\n\n    if (is_fast_mode()) {\n      using Selector = SelectPipelineLookupKernelWithIO<key_type, value_type,\n                                                        score_type, ArchTag>;\n      const uint32_t pipeline_max_size = Selector::max_value_size();\n      // Pipeline lookup kernel only supports \"bucket_size = 128\".\n      if (options_.max_bucket_size == 128 && value_size <= pipeline_max_size) {\n        LookupKernelParamsV2<key_type, value_type, score_type> lookupParams(\n            table_->buckets, table_->buckets_num, static_cast<uint32_t>(dim()),\n            keys, values, scores, missed_keys, missed_indices, missed_size, n);\n        Selector::select_kernel(lookupParams, stream);\n      } else {\n        using Selector =\n            SelectLookupKernelWithIOV2<key_type, value_type, score_type>;\n        static thread_local int step_counter = 0;\n        static thread_local float load_factor = 0.0;\n\n        if (((step_counter++) % kernel_select_interval_) == 0) {\n          load_factor = fast_load_factor(0, stream, false);\n        }\n        Selector::execute_kernel(load_factor, options_.block_size,\n                                 options_.max_bucket_size, table_->buckets_num,\n                                 options_.dim, stream, n, d_table_,\n                                 table_->buckets, keys, values, scores,\n                                 missed_keys, missed_indices, missed_size);\n      }\n    } else {\n      auto sortOp = SortPairOp<uintptr_t, int>();\n      auto d_sort_bytes = sortOp.get_storage_bytes(n, stream);\n\n      MultiVector<value_type*, int, value_type*, int, uint8_t> mv(n, n, n, n,\n                                                                  d_sort_bytes);\n      const size_type dev_ws_size = mv.total_size();\n      auto 
dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)};\n      auto temp_storage = dev_ws.get<uint8_t*>(0);\n      auto src = get_vector<0>(mv, temp_storage);\n      auto dst_offset = get_vector<1>(mv, temp_storage);\n      auto src_sorted = get_vector<2>(mv, temp_storage);\n      auto dst_offset_sorted = get_vector<3>(mv, temp_storage);\n      auto d_sort_storage = get_vector<4>(mv, temp_storage);\n      sortOp.set_storage(reinterpret_cast<void*>(d_sort_storage));\n\n      CUDA_CHECK(cudaMemsetAsync(src, 0, dev_ws_size, stream));\n\n      constexpr uint32_t MinBucketCapacityFilter =\n          sizeof(VecD_Load) / sizeof(D);\n\n      bool filter_condition =\n          options_.max_bucket_size >= MinBucketCapacityFilter;\n\n      if (filter_condition) {\n        constexpr uint32_t BLOCK_SIZE = 128U;\n\n        tlp_lookup_kernel_hybrid<key_type, value_type, score_type>\n            <<<(n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(\n                table_->buckets, table_->buckets_num, options_.max_bucket_size,\n                options_.dim, keys, src, scores, dst_offset, missed_keys,\n                missed_indices, missed_size, n);\n      } else {\n        const size_t block_size = options_.block_size;\n        const size_t N = n * TILE_SIZE;\n        const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);\n\n        lookup_kernel<key_type, value_type, score_type, TILE_SIZE>\n            <<<grid_size, block_size, 0, stream>>>(\n                d_table_, table_->buckets, options_.max_bucket_size,\n                table_->buckets_num, options_.dim, keys, src, scores,\n                missed_keys, missed_indices, missed_size, dst_offset, N);\n      }\n\n      if (values != nullptr) {\n        sortOp.sort(n, reinterpret_cast<uintptr_t*>(src),\n                    reinterpret_cast<uintptr_t*>(src_sorted), dst_offset,\n                    dst_offset_sorted, stream);\n\n        const size_t block_size = options_.io_block_size;\n        const size_t 
N = n * dim();\n        const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);\n\n        read_kernel<key_type, value_type, score_type>\n            <<<grid_size, block_size, 0, stream>>>(src_sorted, values,\n                                                   dst_offset_sorted, dim(), N);\n      }\n    }\n\n    CudaCheckError();\n  }\n\n  /**\n   * @brief Searches the hash table for the specified keys and returns address\n   * of the values.\n   *\n   * @note When a key is missing, the data in @p values won't change.\n   * @warning This API returns internal addresses for high-performance but\n   * thread-unsafe. The caller is responsible for guaranteeing data consistency.\n   *\n   * @param n The number of key-value-score tuples to search.\n   * @param keys The keys to search on GPU-accessible memory with shape (n).\n   * @param values The addresses of values to search on GPU-accessible memory\n   * with shape (n).\n   * @param founds The status that indicates if the keys are found on\n   * GPU-accessible memory with shape (n).\n   * @param scores The scores to search on GPU-accessible memory with shape (n).\n   * @parblock\n   * If @p scores is `nullptr`, the score for each key will not be returned.\n   * @endparblock\n   * @param stream The CUDA stream that is used to execute the operation.\n   * @param unique_key If all keys in the same batch are unique.\n   *\n   */\n  void find(const size_type n, const key_type* keys,  // (n)\n            value_type** values,                      // (n)\n            bool* founds,                             // (n)\n            score_type* scores = nullptr,             // (n)\n            cudaStream_t stream = 0, bool unique_key = true) const {\n    if (n == 0) {\n      return;\n    }\n\n    std::unique_ptr<read_shared_lock> lock_ptr;\n    if (options_.api_lock) {\n      lock_ptr = std::make_unique<read_shared_lock>(mutex_, stream);\n    }\n\n    constexpr uint32_t MinBucketCapacityFilter = sizeof(VecD_Load) / sizeof(D);\n 
   if (unique_key && options_.max_bucket_size >= MinBucketCapacityFilter) {\n      // Track load factor to choose between TLP and pipelined kernels.\n      static thread_local int step_counter = 0;\n      static thread_local float load_factor = 0.0;\n      if (((step_counter++) % kernel_select_interval_) == 0) {\n        load_factor = fast_load_factor(0, stream, false);\n      }\n\n      if (load_factor > 0.875f && options_.max_bucket_size == 128) {\n        // At high load factors, the TLP kernel degrades because empty-slot\n        // early termination fails.  Switch to the pipelined cooperative kernel\n        // which scans all 128 digests in one parallel step (32 threads/key).\n        constexpr uint32_t BLOCK_SIZE = 128U;\n        const size_t grid_size = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;\n        lookup_ptr_kernel_with_pipeline<key_type, value_type, score_type>\n            <<<grid_size, BLOCK_SIZE, 0, stream>>>(\n                table_->buckets, table_->buckets_num, options_.dim, keys,\n                values, scores, founds, n);\n      } else {\n        constexpr uint32_t BLOCK_SIZE = 128U;\n        tlp_lookup_ptr_kernel_with_filter<key_type, value_type, score_type,\n                                          evict_strategy>\n            <<<(n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(\n                table_->buckets, table_->buckets_num, options_.max_bucket_size,\n                options_.dim, keys, values, scores, founds, n, false,\n                global_epoch_);\n      }\n    } else {\n      using Selector = SelectLookupPtrKernel<key_type, value_type, score_type>;\n      static thread_local int step_counter = 0;\n      static thread_local float load_factor = 0.0;\n\n      if (((step_counter++) % kernel_select_interval_) == 0) {\n        load_factor = fast_load_factor(0, stream, false);\n      }\n\n      Selector::execute_kernel(load_factor, options_.block_size,\n                               options_.max_bucket_size, 
table_->buckets_num,\n                               options_.dim, stream, n, d_table_,\n                               table_->buckets, keys, values, scores, founds);\n    }\n\n    CudaCheckError();\n  }\n\n  /**\n   * @brief Searches the hash table for the specified keys and returns address\n   * of the values, and will update the scores.\n   *\n   * @note When a key is missing, the data in @p values won't change.\n   * @warning This API returns internal addresses for high-performance but\n   * thread-unsafe. The caller is responsible for guaranteeing data consistency.\n   *\n   * @param n The number of key-value-score tuples to search.\n   * @param keys The keys to search on GPU-accessible memory with shape (n).\n   * @param values The addresses of values to search on GPU-accessible memory\n   * with shape (n).\n   * @param founds The status that indicates if the keys are found on\n   * GPU-accessible memory with shape (n).\n   * @param scores The scores to search on GPU-accessible memory with shape (n).\n   * @parblock\n   * If @p scores is `nullptr`, the score for each key will not be returned.\n   * @endparblock\n   * @param stream The CUDA stream that is used to execute the operation.\n   * @param unique_key If all keys in the same batch are unique.\n   *\n   */\n  void find_and_update(const size_type n, const key_type* keys,  // (n)\n                       value_type** values,                      // (n)\n                       bool* founds,                             // (n)\n                       score_type* scores = nullptr,             // (n)\n                       cudaStream_t stream = 0, bool unique_key = true) {\n    if (n == 0) {\n      return;\n    }\n\n    std::unique_ptr<read_shared_lock> lock_ptr;\n    if (options_.api_lock) {\n      lock_ptr = std::make_unique<read_shared_lock>(mutex_, stream);\n    }\n\n    check_evict_strategy(scores);\n\n    constexpr uint32_t MinBucketCapacityFilter = sizeof(VecD_Load) / sizeof(D);\n    if (unique_key && 
options_.max_bucket_size >= MinBucketCapacityFilter) {\n      constexpr uint32_t BLOCK_SIZE = 128U;\n      tlp_lookup_ptr_kernel_with_filter<key_type, value_type, score_type,\n                                        evict_strategy>\n          <<<(n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(\n              table_->buckets, table_->buckets_num, options_.max_bucket_size,\n              options_.dim, keys, values, scores, founds, n, true,\n              global_epoch_);\n    } else {\n      throw std::runtime_error(\n          \"Not support update score when keys are not unique or bucket \"\n          \"capacity is small.\");\n    }\n\n    CudaCheckError();\n  }\n\n  /**\n   * @brief Checks if there are elements with key equivalent to `keys` in the\n   * table.\n   *\n   * @param n The number of `keys` to check.\n   * @param keys The keys to search on GPU-accessible memory with shape (n).\n   * @param founds The result that indicates if the keys are found, and should\n   * be allocated by caller on GPU-accessible memory with shape (n).\n   * @param stream The CUDA stream that is used to execute the operation.\n   *\n   */\n  void contains(const size_type n, const key_type* keys,  // (n)\n                bool* founds,                             // (n)\n                cudaStream_t stream = 0) const {\n    MERLIN_CHECK(\n        !is_memory_mode(),\n        \"[MEMORY_MODE] contains() is not supported in dual-bucket mode. 
\"\n        \"Key may reside in either bucket.\");\n    if (n == 0) {\n      return;\n    }\n\n    std::unique_ptr<read_shared_lock> lock_ptr;\n    if (options_.api_lock) {\n      lock_ptr = std::make_unique<read_shared_lock>(mutex_, stream);\n    }\n\n    if (options_.max_bucket_size == 128) {\n      // Pipeline lookup kernel only supports \"bucket_size = 128\".\n      using Selector = SelectPipelineContainsKernel<key_type, value_type,\n                                                    score_type, ArchTag>;\n      ContainsKernelParams<key_type, value_type, score_type> containsParams(\n          table_->buckets, table_->buckets_num, static_cast<uint32_t>(dim()),\n          keys, founds, n);\n      Selector::select_kernel(containsParams, stream);\n    } else {\n      using Selector = SelectContainsKernel<key_type, value_type, score_type>;\n      static thread_local int step_counter = 0;\n      static thread_local float load_factor = 0.0;\n\n      if (((step_counter++) % kernel_select_interval_) == 0) {\n        load_factor = fast_load_factor(0, stream, false);\n      }\n      Selector::execute_kernel(load_factor, options_.block_size,\n                               options_.max_bucket_size, table_->buckets_num,\n                               options_.dim, stream, n, d_table_,\n                               table_->buckets, keys, founds);\n    }\n    CudaCheckError();\n  }\n\n  /**\n   * @brief Removes specified elements from the hash table.\n   *\n   * @param n The number of keys to remove.\n   * @param keys The keys to remove on GPU-accessible memory.\n   * @param stream The CUDA stream that is used to execute the operation.\n   *\n   */\n  void erase(const size_type n, const key_type* keys, cudaStream_t stream = 0) {\n    MERLIN_CHECK(!is_memory_mode(),\n                 \"[MEMORY_MODE] erase() is not supported in dual-bucket mode. 
\"\n                 \"Key may reside in either bucket.\");\n    if (n == 0) {\n      return;\n    }\n\n    std::unique_ptr<update_read_lock> lock_ptr;\n    if (options_.api_lock) {\n      lock_ptr = std::make_unique<update_read_lock>(mutex_, stream);\n    }\n\n    {\n      const size_t block_size = options_.block_size;\n      const size_t N = n * TILE_SIZE;\n      const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);\n\n      remove_kernel<key_type, value_type, score_type, TILE_SIZE>\n          <<<grid_size, block_size, 0, stream>>>(\n              d_table_, keys, table_->buckets, table_->buckets_size,\n              table_->bucket_max_size, table_->buckets_num, N);\n    }\n\n    CudaCheckError();\n    return;\n  }\n\n  /**\n   * @brief Erases all elements that satisfy the predicate @p pred from the\n   * hash table.\n   *\n   * @tparam PredFunctor The predicate template <typename K, typename S>\n   * function with operator signature (bool*)(const K&, const S&, const K&,\n   * const threshold) that returns `true` if the element should be erased. 
The\n   * value for @p pred should be a function with type `Pred` defined like the\n   * following example:\n   *\n   *    ```\n   *    template <class K, class S>\n   *    struct EraseIfPredFunctor {\n   *      __forceinline__ __device__ bool operator()(const K& key,\n   *                                                 S& score,\n   *                                                 const K& pattern,\n   *                                                 const S& threshold) {\n   *        return (((key & 0x1) == pattern) && (score < threshold));\n   *      }\n   *    };\n   *    ```\n   *\n   * @param pattern The third user-defined argument to @p pred with key_type\n   * type.\n   * @param threshold The fourth user-defined argument to @p pred with\n   * score_type type.\n   * @param stream The CUDA stream that is used to execute the operation.\n   *\n   * @return The number of elements removed.\n   *\n   */\n  template <template <typename, typename> class PredFunctor>\n  size_type erase_if(const key_type& pattern, const score_type& threshold,\n                     cudaStream_t stream = 0) {\n    std::unique_ptr<update_read_lock> lock_ptr;\n    if (options_.api_lock) {\n      lock_ptr = std::make_unique<update_read_lock>(mutex_, stream);\n    }\n\n    auto dev_ws{dev_mem_pool_->get_workspace<1>(sizeof(size_type), stream)};\n    auto d_count{dev_ws.get<size_type*>(0)};\n\n    CUDA_CHECK(cudaMemsetAsync(d_count, 0, sizeof(size_type), stream));\n\n    {\n      const size_t block_size = options_.block_size;\n      const size_t N = table_->buckets_num;\n      const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);\n\n      remove_kernel<key_type, value_type, score_type, PredFunctor>\n          <<<grid_size, block_size, 0, stream>>>(\n              d_table_, pattern, threshold, d_count, table_->buckets,\n              table_->buckets_size, table_->bucket_max_size,\n              table_->buckets_num, N);\n    }\n\n    size_type count = 0;\n    
CUDA_CHECK(cudaMemcpyAsync(&count, d_count, sizeof(size_type),\n                               cudaMemcpyDeviceToHost, stream));\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    CudaCheckError();\n    return count;\n  }\n\n  /**\n   * @brief Erase the key-value-score tuples which match @tparam PredFunctor.\n   * @param pred A functor with template <K, V, S> defined an operator with\n   * signature:  __device__ (bool*)(const K&, const V*, const S&, const\n   * cg::thread_block_tile<GroupSize>&).\n   *  @param stream The CUDA stream that is used to execute the operation.\n   *\n   * @return The number of elements removed.\n   */\n\n  template <typename PredFunctor>\n  size_type erase_if_v2(PredFunctor& pred, cudaStream_t stream = 0) {\n    std::unique_ptr<update_read_lock> lock_ptr;\n    if (options_.api_lock) {\n      lock_ptr = std::make_unique<update_read_lock>(mutex_, stream);\n    }\n\n    auto dev_ws{dev_mem_pool_->get_workspace<1>(sizeof(size_type), stream)};\n    auto d_count{dev_ws.get<size_type*>(0)};\n\n    CUDA_CHECK(cudaMemsetAsync(d_count, 0, sizeof(size_type), stream));\n\n    {\n      /// Search_length should be multiple of GroupSize for communication.\n      uint64_t dim = table_->dim;\n      uint64_t n = options_.max_capacity;\n      auto kernel = [&] {\n        if (dim >= 32 && n % 32 == 0) {\n          return remove_kernel_v2<key_type, value_type, score_type, PredFunctor,\n                                  32>;\n        } else if (dim >= 16 && n % 16 == 0) {\n          return remove_kernel_v2<key_type, value_type, score_type, PredFunctor,\n                                  16>;\n        } else if (dim >= 8 && n % 8 == 0) {\n          return remove_kernel_v2<key_type, value_type, score_type, PredFunctor,\n                                  8>;\n        }\n        return remove_kernel_v2<key_type, value_type, score_type, PredFunctor,\n                                1>;\n      }();\n      uint64_t block_size = 128UL;\n      uint64_t grid_size 
=\n          std::min(sm_cnt_ * max_threads_per_block_ / block_size,\n                   SAFE_GET_GRID_SIZE(n, block_size));\n      kernel<<<grid_size, block_size, 0, stream>>>(\n          n, 0, pred, table_->buckets, table_->buckets_size,\n          table_->bucket_max_size, table_->dim, d_count);\n    }\n\n    size_type count = 0;\n    CUDA_CHECK(cudaMemcpyAsync(&count, d_count, sizeof(size_type),\n                               cudaMemcpyDeviceToHost, stream));\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    CudaCheckError();\n    return count;\n  }\n\n  /**\n   * @brief Removes all of the elements in the hash table with no release\n   * object.\n   */\n  void clear(cudaStream_t stream = 0) {\n    std::unique_ptr<update_read_lock> lock_ptr;\n    if (options_.api_lock) {\n      lock_ptr = std::make_unique<update_read_lock>(mutex_, stream);\n    }\n\n    const size_t block_size = options_.block_size;\n    const size_t N = table_->buckets_num * table_->bucket_max_size;\n    const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);\n\n    clear_kernel<key_type, value_type, score_type>\n        <<<grid_size, block_size, 0, stream>>>(d_table_, table_->buckets, N);\n\n    CudaCheckError();\n  }\n\n public:\n  /**\n   * @brief Exports a certain number of the key-value-score tuples from the\n   * hash table.\n   *\n   * @param n The maximum number of exported pairs.\n   * @param offset The position of the key to search.\n   * @param d_counter Accumulates amount of successfully exported values.\n   * @param keys The keys to dump from GPU-accessible memory with shape (n).\n   * @param values The values to dump from GPU-accessible memory with shape\n   * (n, DIM).\n   * @param scores The scores to search on GPU-accessible memory with shape (n).\n   * @parblock\n   * If @p scores is `nullptr`, the score for each key will not be returned.\n   * @endparblock\n   *\n   * @param stream The CUDA stream that is used to execute the operation.\n   *\n   * @return The number 
of elements dumped.\n   *\n   * @throw CudaException If the key-value size is too large for GPU shared\n   * memory. Reducing the value for @p n is currently required if this exception\n   * occurs.\n   */\n  void export_batch(size_type n, const size_type offset,\n                    size_type* d_counter,          // (1)\n                    key_type* keys,                // (n)\n                    value_type* values,            // (n, DIM)\n                    score_type* scores = nullptr,  // (n)\n                    cudaStream_t stream = 0) const {\n    std::unique_ptr<read_shared_lock> lock_ptr;\n    if (options_.api_lock) {\n      lock_ptr = std::make_unique<read_shared_lock>(mutex_, stream);\n    }\n\n    CUDA_CHECK(cudaMemsetAsync(d_counter, 0, sizeof(size_type), stream));\n    if (offset >= table_->capacity) {\n      return;\n    }\n    n = std::min(table_->capacity - offset, n);\n\n    size_type shared_size;\n    size_type block_size;\n    std::tie(shared_size, block_size) =\n        dump_kernel_shared_memory_size<K, V, S>(shared_mem_size_);\n\n    const size_t grid_size = SAFE_GET_GRID_SIZE(n, block_size);\n\n    dump_kernel<key_type, value_type, score_type>\n        <<<grid_size, block_size, shared_size, stream>>>(\n            d_table_, table_->buckets, keys, values, scores, offset, n,\n            d_counter);\n\n    CudaCheckError();\n  }\n\n  size_type export_batch(const size_type n, const size_type offset,\n                         key_type* keys,                // (n)\n                         value_type* values,            // (n, DIM)\n                         score_type* scores = nullptr,  // (n)\n                         cudaStream_t stream = 0) const {\n    auto dev_ws{dev_mem_pool_->get_workspace<1>(sizeof(size_type), stream)};\n    auto d_counter{dev_ws.get<size_type*>(0)};\n\n    CUDA_CHECK(cudaMemsetAsync(d_counter, 0, sizeof(size_type), stream));\n    export_batch(n, offset, d_counter, keys, values, scores, stream);\n\n    size_type 
counter = 0;\n    CUDA_CHECK(cudaMemcpyAsync(&counter, d_counter, sizeof(size_type),\n                               cudaMemcpyDeviceToHost, stream));\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    return counter;\n  }\n\n  /**\n   * @brief Exports a certain number of the key-value-score tuples which match\n   *\n   * @tparam PredFunctor A functor with template <K, S> defined an operator\n   * with signature:  __device__ (bool*)(const K&, S&, const K&, const S&).\n   * specified condition from the hash table.\n   *\n   * @param n The maximum number of exported pairs.\n   * The value for @p pred should be a function with type `Pred` defined like\n   * the following example:\n   *\n   *    ```\n   *    template <class K, class S>\n   *    struct ExportIfPredFunctor {\n   *      __forceinline__ __device__ bool operator()(const K& key,\n   *                                                 S& score,\n   *                                                 const K& pattern,\n   *                                                 const S& threshold) {\n   *        return score >= threshold;\n   *      }\n   *    };\n   *    ```\n   *\n   * @param pattern The third user-defined argument to @p pred with key_type\n   * type.\n   * @param threshold The fourth user-defined argument to @p pred with\n   * score_type type.\n   * @param offset The position of the key to search.\n   * @param keys The keys to dump from GPU-accessible memory with shape (n).\n   * @param values The values to dump from GPU-accessible memory with shape\n   * (n, DIM).\n   * @param scores The scores to search on GPU-accessible memory with shape (n).\n   * @parblock\n   * If @p scores is `nullptr`, the score for each key will not be returned.\n   * @endparblock\n   *\n   * @param stream The CUDA stream that is used to execute the operation.\n   *\n   * @return The number of elements dumped.\n   *\n   * @throw CudaException If the key-value size is too large for GPU shared\n   * memory. 
Reducing the value for @p n is currently required if this exception\n   * occurs.\n   */\n  template <template <typename, typename> class PredFunctor>\n  void export_batch_if(const key_type& pattern, const score_type& threshold,\n                       size_type n, const size_type offset,\n                       size_type* d_counter,\n                       key_type* keys,                // (n)\n                       value_type* values,            // (n, DIM)\n                       score_type* scores = nullptr,  // (n)\n                       cudaStream_t stream = 0) const {\n    std::unique_ptr<read_shared_lock> lock_ptr;\n    if (options_.api_lock) {\n      lock_ptr = std::make_unique<read_shared_lock>(mutex_, stream);\n    }\n    CUDA_CHECK(cudaMemsetAsync(d_counter, 0, sizeof(size_type), stream));\n\n    if (offset >= table_->capacity) {\n      return;\n    }\n    n = std::min(table_->capacity - offset, n);\n    if (n == 0) {\n      return;\n    }\n\n    bool match_fast_cond = true;\n    const size_t value_size = sizeof(V) * dim();\n    auto check_tile_size = [&](int tile_size) {\n      return options_.max_bucket_size % tile_size == 0 &&\n             options_.max_bucket_size >= tile_size && offset % tile_size == 0 &&\n             n % tile_size == 0;\n    };\n    auto select_tile_size = [&](auto vec) {\n      using VecV = decltype(vec);\n      size_t vec_dim = value_size / sizeof(VecV);\n      if (vec_dim >= 32 && check_tile_size(32)) {\n        return dump_kernel_v2<key_type, value_type, score_type, VecV,\n                              PredFunctor, 32>;\n      } else if (vec_dim >= 16 && check_tile_size(16)) {\n        return dump_kernel_v2<key_type, value_type, score_type, VecV,\n                              PredFunctor, 16>;\n      } else if (vec_dim >= 8 && check_tile_size(8)) {\n        return dump_kernel_v2<key_type, value_type, score_type, VecV,\n                              PredFunctor, 8>;\n      }\n      match_fast_cond = false;\n      return 
dump_kernel<key_type, value_type, score_type, PredFunctor>;\n    };\n    auto kernel = [&] {\n      if (value_size >= sizeof(float4) * 8 &&\n          value_size % sizeof(float4) == 0) {\n        return select_tile_size(float4{});\n      } else if (value_size >= sizeof(float2) * 8 &&\n                 value_size % sizeof(float2) == 0) {\n        return select_tile_size(float2{});\n      } else if (value_size >= sizeof(float) * 8 &&\n                 value_size % sizeof(float) == 0) {\n        return select_tile_size(float{});\n      } else if (value_size >= sizeof(uint16_t) * 8 &&\n                 value_size % sizeof(uint16_t) == 0) {\n        return select_tile_size(uint16_t{});\n      }\n      return select_tile_size(V{});\n    }();\n    size_t grid_size = 0, block_size = 0, shared_size = 0;\n    if (match_fast_cond) {\n      block_size = options_.block_size;\n      grid_size = std::min(sm_cnt_ * max_threads_per_block_ / block_size,\n                           SAFE_GET_GRID_SIZE(n, block_size));\n    } else {\n      const size_t score_size = scores ? 
sizeof(score_type) : 0;\n      const size_t kvm_size =\n          sizeof(key_type) + sizeof(value_type) * dim() + score_size;\n      block_size = std::min(shared_mem_size_ / 2 / kvm_size, 1024UL);\n      MERLIN_CHECK(\n          block_size > 0,\n          \"[HierarchicalKV] block_size <= 0, the K-V-S size may be too large!\");\n\n      shared_size = kvm_size * block_size;\n      grid_size = SAFE_GET_GRID_SIZE(n, block_size);\n    }\n    kernel<<<grid_size, block_size, shared_size, stream>>>(\n        d_table_, table_->buckets, pattern, threshold, keys, values, scores,\n        offset, n, d_counter);\n\n    CudaCheckError();\n  }\n\n  /**\n   * @brief Exports a certain number of key-value-score tuples that match a\n   * given predicate.\n   *\n   * @tparam PredFunctor A functor type with a template signature `<K, V, S>`.\n   * It should define an operator with the signature:\n   * `__device__ bool operator()(const K&, const V*, const S&,\n   * cg::thread_block_tile<GroupSize>&)`.\n   *\n   * @param pred A functor of type `PredFunctor` that defines the predicate for\n   * filtering tuples.\n   * @param n The maximum number of exported pairs.\n   * @param offset The position of the key to search.\n   * @param d_counter The number of elements dumped which is on device.\n   * @param keys The keys to dump from GPU-accessible memory with shape (n).\n   * @param values The values to dump from GPU-accessible memory with shape (n,\n   * DIM).\n   * @param scores The scores to search on GPU-accessible memory with shape (n).\n   * @parblock\n   * If @p scores is `nullptr`, the score for each key will not be returned.\n   * @endparblock\n   *\n   * @param stream The CUDA stream that is used to execute the operation.\n   *\n   * @return void\n   *\n   */\n\n  template <typename PredFunctor>\n  void export_batch_if_v2(PredFunctor& pred, size_type n,\n                          const size_type offset, size_type* d_counter,\n                          key_type* keys,                
// (n)\n                          value_type* values,            // (n, DIM)\n                          score_type* scores = nullptr,  // (n)\n                          cudaStream_t stream = 0) const {\n    std::unique_ptr<read_shared_lock> lock_ptr;\n    if (options_.api_lock) {\n      lock_ptr = std::make_unique<read_shared_lock>(mutex_, stream);\n    }\n    CUDA_CHECK(cudaMemsetAsync(d_counter, 0, sizeof(size_type), stream));\n\n    if (offset >= table_->capacity) {\n      return;\n    }\n    n = std::min(table_->capacity - offset, n);\n    if (n == 0) {\n      return;\n    }\n\n    /// Search_length should be multiple of GroupSize for communication.\n    uint64_t dim = table_->dim;\n    auto kernel = [&] {\n      if (dim >= 32 && n % 32 == 0) {\n        return dump_kernel<key_type, value_type, score_type, PredFunctor, 32>;\n      } else if (dim >= 16 && n % 16 == 0) {\n        return dump_kernel<key_type, value_type, score_type, PredFunctor, 16>;\n      } else if (dim >= 8 && n % 8 == 0) {\n        return dump_kernel<key_type, value_type, score_type, PredFunctor, 8>;\n      }\n      return dump_kernel<key_type, value_type, score_type, PredFunctor, 1>;\n    }();\n    uint64_t block_size = 128UL;\n    uint64_t grid_size = std::min(sm_cnt_ * max_threads_per_block_ / block_size,\n                                  SAFE_GET_GRID_SIZE(n, block_size));\n    kernel<<<grid_size, block_size, 0, stream>>>(\n        n, offset, pred, table_->buckets, table_->bucket_max_size, dim, keys,\n        values, scores, d_counter);\n\n    CudaCheckError();\n  }\n\n  /**\n   * @brief Applies the given function to items in the range [first, last) in\n   * the table.\n   *\n   * @tparam ExecutionFunc A functor type with a template signature `<K, V, S>`.\n   * It should define an operator with the signature:\n   * `__device__ void operator()(const K&, V*, S*,\n   * cg::thread_block_tile<GroupSize>&)`.\n   *\n   * @param first The first element to which the function object will be\n   * 
applied.\n   * @param last The last element(excluding) to which the function object will\n   * be applied.\n   * @param f A functor of type `ExecutionFunc` that defines the predicate for\n   * filtering tuples. signature:  __device__ (bool*)(const K&, const V*, const\n   * S&, const cg::tiled_partition<GroupSize>&).\n   * @param stream The CUDA stream that is used to execute the operation.\n   *\n   * @return void\n   *\n   */\n\n  template <typename ExecutionFunc>\n  void for_each(const size_type first, const size_type last, ExecutionFunc& f,\n                cudaStream_t stream = 0) {\n    std::unique_ptr<update_read_lock> lock_ptr;\n    if (options_.api_lock) {\n      lock_ptr = std::make_unique<update_read_lock>(mutex_, stream);\n    }\n\n    if (first >= table_->capacity or last > table_->capacity or first >= last) {\n      return;\n    }\n    uint64_t n = last - first;\n\n    /// Search_length should be multiple of GroupSize for communication.\n    uint64_t dim = table_->dim;\n    auto kernel = [&] {\n      if (dim >= 32 && n % 32 == 0) {\n        return traverse_kernel<key_type, value_type, score_type, ExecutionFunc,\n                               32>;\n      } else if (dim >= 16 && n % 16 == 0) {\n        return traverse_kernel<key_type, value_type, score_type, ExecutionFunc,\n                               16>;\n      } else if (dim >= 8 && n % 8 == 0) {\n        return traverse_kernel<key_type, value_type, score_type, ExecutionFunc,\n                               8>;\n      }\n      return traverse_kernel<key_type, value_type, score_type, ExecutionFunc,\n                             1>;\n    }();\n    uint64_t block_size = 128UL;\n    uint64_t grid_size = std::min(sm_cnt_ * max_threads_per_block_ / block_size,\n                                  SAFE_GET_GRID_SIZE(n, block_size));\n    kernel<<<grid_size, block_size, 0, stream>>>(n, first, f, table_->buckets,\n                                                 table_->bucket_max_size, dim);\n\n    
CudaCheckError();\n  }\n\n public:\n  /**\n   * @brief Indicates if the hash table has no elements.\n   *\n   * @param stream The CUDA stream that is used to execute the operation.\n   * @return `true` if the table is empty and `false` otherwise.\n   */\n  bool empty(cudaStream_t stream = 0) const { return size(stream) == 0; }\n\n  /**\n   * @brief Returns the hash table size.\n   *\n   * @param stream The CUDA stream that is used to execute the operation.\n   * @return The table size.\n   */\n  size_type size(cudaStream_t stream = 0) const {\n    std::unique_ptr<read_shared_lock> lock_ptr;\n    if (options_.api_lock) {\n      lock_ptr = std::make_unique<read_shared_lock>(mutex_, stream);\n    }\n\n    const size_type N = table_->buckets_num;\n\n    auto sumOp = SumOp<int, int64_t>();\n    auto d_sum_bytes = sumOp.get_storage_bytes(N, stream);\n\n    MultiVector<int64_t, uint8_t> mv(1, d_sum_bytes);\n    const size_type dev_ws_size = mv.total_size();\n    auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)};\n    auto temp_storage = dev_ws.get<uint8_t*>(0);\n    auto d_total_size = get_vector<0>(mv, temp_storage);\n    auto d_sum_storage = get_vector<1>(mv, temp_storage);\n    sumOp.set_storage(reinterpret_cast<void*>(d_sum_storage));\n    sumOp.sum(N, table_->buckets_size, d_total_size, stream);\n\n    int64_t h_total_size = 0;\n    CUDA_CHECK(cudaMemcpyAsync(&h_total_size, d_total_size, sizeof(int64_t),\n                               cudaMemcpyDeviceToHost, stream));\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    CudaCheckError();\n    return static_cast<size_type>(h_total_size);\n  }\n\n  /**\n   * @brief Returns the number of keys if meet PredFunctor.\n   *\n   * @param stream The CUDA stream that is used to execute the operation.\n   * @return The table size match condiction of PredFunctor.\n   */\n  template <template <typename, typename> class PredFunctor>\n  void size_if(const key_type& pattern, const score_type& threshold,\n           
    size_type* d_counter, cudaStream_t stream = 0) const {\n    std::unique_ptr<read_shared_lock> lock_ptr;\n    if (options_.api_lock) {\n      lock_ptr = std::make_unique<read_shared_lock>(mutex_, stream);\n    }\n    CUDA_CHECK(cudaMemsetAsync(d_counter, 0, sizeof(size_type), stream));\n\n    size_t grid_size = SAFE_GET_GRID_SIZE(capacity(), options_.block_size);\n    grid_size = std::min(grid_size,\n                         static_cast<size_t>(sm_cnt_ * max_threads_per_block_ /\n                                             options_.block_size));\n    size_if_kernel<key_type, value_type, score_type, PredFunctor>\n        <<<grid_size, options_.block_size, 0, stream>>>(\n            d_table_, table_->buckets, pattern, threshold, d_counter);\n    CudaCheckError();\n  }\n\n  /**\n   * @brief Returns the hash table capacity.\n   *\n   * @note The value that is returned might be less than the actual capacity of\n   * the hash table because the hash table currently keeps the capacity to be\n   * a power of 2 for performance considerations.\n   *\n   * @return The table capacity.\n   */\n  size_type capacity() const { return table_->capacity; }\n\n  /**\n   * @brief Sets the number of buckets to the number that is needed to\n   * accommodate at least @p new_capacity elements without exceeding the maximum\n   * load factor. This method rehashes the hash table. 
Rehashing puts the\n   * elements into the appropriate buckets considering that total number of\n   * buckets has changed.\n   *\n   * @note If the value of @p new_capacity or double of @p new_capacity is\n   * greater or equal than `options_.max_capacity`, the reserve does not perform\n   * any change to the hash table.\n   *\n   * @param new_capacity The requested capacity for the hash table.\n   * @param stream The CUDA stream that is used to execute the operation.\n   */\n  void reserve(const size_type new_capacity, cudaStream_t stream = 0) {\n    MERLIN_CHECK(\n        !is_memory_mode(),\n        \"[MEMORY_MODE] reserve() is not supported in dual-bucket mode. \"\n        \"Rehash does not preserve dual-bucket mapping.\");\n    if (reach_max_capacity_ || new_capacity > options_.max_capacity) {\n      reach_max_capacity_ = (capacity() * 2 > options_.max_capacity);\n      return;\n    }\n\n    {\n      std::unique_ptr<update_read_lock> lock_ptr;\n      if (options_.api_lock) {\n        lock_ptr = std::make_unique<update_read_lock>(mutex_, stream);\n      }\n\n      // Once we have exclusive access, make sure that pending GPU calls have\n      // been processed.\n      CUDA_CHECK(cudaDeviceSynchronize());\n\n      while (capacity() < new_capacity &&\n             capacity() * 2 <= options_.max_capacity) {\n        double_capacity<key_type, value_type, score_type>(&table_, allocator_);\n        CUDA_CHECK(cudaDeviceSynchronize());\n        sync_table_configuration();\n\n        const size_t block_size = options_.block_size;\n        const size_t N = TILE_SIZE * table_->buckets_num / 2;\n        const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);\n\n        rehash_kernel_for_fast_mode<key_type, value_type, score_type, TILE_SIZE>\n            <<<grid_size, block_size, 0, stream>>>(d_table_, table_->buckets,\n                                                   N);\n      }\n      CUDA_CHECK(cudaDeviceSynchronize());\n      reach_max_capacity_ = (capacity() * 2 
> options_.max_capacity);\n    }\n    CudaCheckError();\n  }\n\n  /**\n   * @brief Returns the average number of elements per slot, that is, size()\n   * divided by capacity().\n   *\n   * @param stream The CUDA stream that is used to execute the operation.\n   *\n   * @return The load factor\n   */\n  float load_factor(cudaStream_t stream = 0) const {\n    return static_cast<float>((size(stream) * 1.0) / (capacity() * 1.0));\n  }\n\n  /**\n   * @brief Set max_capacity of the table.\n   *\n   * @param new_max_capacity The new expecting max_capacity. It must be power\n   * of 2. Otherwise it will raise an error.\n   */\n  void set_max_capacity(size_type new_max_capacity) {\n    if (!is_power(2, new_max_capacity)) {\n      throw std::invalid_argument(\n          \"None power-of-2 new_max_capacity is not supported.\");\n    }\n\n    std::unique_ptr<update_read_lock> lock_ptr;\n    if (options_.api_lock) {\n      lock_ptr = std::make_unique<update_read_lock>(mutex_);\n    }\n\n    if (new_max_capacity < capacity()) {\n      return;\n    }\n    if (reach_max_capacity_) {\n      reach_max_capacity_ = false;\n    }\n    options_.max_capacity = new_max_capacity;\n  }\n\n  /**\n   * @brief Returns the dimension of the vectors.\n   *\n   * @return The dimension of the vectors.\n   */\n  size_type dim() const noexcept { return options_.dim; }\n\n  /**\n   * @brief Returns The length of each bucket.\n   *\n   * @return The length of each bucket.\n   */\n  size_type max_bucket_size() const noexcept {\n    return options_.max_bucket_size;\n  }\n\n  /**\n   * @brief Returns the number of buckets in the table.\n   *\n   * @return The number of buckets in the table.\n   */\n  size_type bucket_count() const noexcept { return table_->buckets_num; }\n\n  /**\n   * @brief Save keys, vectors, scores in table to file or files.\n   *\n   * @param file A BaseKVFile object defined the file format on host filesystem.\n   * @param max_workspace_size Saving is conducted in chunks. 
This value denotes\n   * the maximum amount of temporary memory to use when dumping the table.\n   * Larger values *can* lead to higher performance.\n   * @param stream The CUDA stream used to execute the operation.\n   *\n   * @return Number of KV pairs saved to file.\n   */\n  size_type save(BaseKVFile<K, V, S>* file,\n                 const size_t max_workspace_size = 1L * 1024 * 1024,\n                 cudaStream_t stream = 0) const {\n    const size_type tuple_size{sizeof(key_type) + sizeof(score_type) +\n                               sizeof(value_type) * dim()};\n    MERLIN_CHECK(max_workspace_size >= tuple_size,\n                 \"[HierarchicalKV] max_workspace_size is smaller than a single \"\n                 \"`key + scoredata + value` tuple! Please set a larger value!\");\n\n    size_type shared_size;\n    size_type block_size;\n    std::tie(shared_size, block_size) =\n        dump_kernel_shared_memory_size<K, V, S>(shared_mem_size_);\n\n    // Request exclusive access (to make sure capacity won't change anymore).\n    std::unique_ptr<update_read_lock> lock_ptr;\n    if (options_.api_lock) {\n      lock_ptr = std::make_unique<update_read_lock>(mutex_, stream);\n    }\n\n    const size_type total_size{capacity()};\n    const size_type n{std::min(max_workspace_size / tuple_size, total_size)};\n    const size_type grid_size{SAFE_GET_GRID_SIZE(n, block_size)};\n\n    // Grab temporary device and host memory.\n    const size_type host_ws_size{n * tuple_size};\n    auto host_ws{host_mem_pool_->get_workspace<1>(host_ws_size, stream)};\n    auto h_keys{host_ws.get<key_type*>(0)};\n    auto h_scores{reinterpret_cast<score_type*>(h_keys + n)};\n    auto h_values{reinterpret_cast<value_type*>(h_scores + n)};\n\n    const size_type dev_ws_size{sizeof(size_type) + host_ws_size};\n    auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)};\n    auto d_count{dev_ws.get<size_type*>(0)};\n    auto d_keys{reinterpret_cast<key_type*>(d_count + 1)};\n    auto 
d_scores{reinterpret_cast<score_type*>(d_keys + n)};\n    auto d_values{reinterpret_cast<value_type*>(d_scores + n)};\n\n    // Step through table, dumping contents in batches.\n    size_type total_count{0};\n    for (size_type i{0}; i < total_size; i += n) {\n      // Dump the next batch to workspace, and then write it to the file.\n      CUDA_CHECK(cudaMemsetAsync(d_count, 0, sizeof(size_type), stream));\n\n      dump_kernel<key_type, value_type, score_type>\n          <<<grid_size, block_size, shared_size, stream>>>(\n              d_table_, table_->buckets, d_keys, d_values, d_scores, i,\n              std::min(total_size - i, n), d_count);\n\n      size_type count;\n      CUDA_CHECK(cudaMemcpyAsync(&count, d_count, sizeof(size_type),\n                                 cudaMemcpyDeviceToHost, stream));\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      if (count == n) {\n        CUDA_CHECK(cudaMemcpyAsync(h_keys, d_keys, host_ws_size,\n                                   cudaMemcpyDeviceToHost, stream));\n      } else {\n        CUDA_CHECK(cudaMemcpyAsync(h_keys, d_keys, sizeof(key_type) * count,\n                                   cudaMemcpyDeviceToHost, stream));\n        CUDA_CHECK(cudaMemcpyAsync(h_scores, d_scores,\n                                   sizeof(score_type) * count,\n                                   cudaMemcpyDeviceToHost, stream));\n        CUDA_CHECK(cudaMemcpyAsync(h_values, d_values,\n                                   sizeof(value_type) * dim() * count,\n                                   cudaMemcpyDeviceToHost, stream));\n      }\n\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      file->write(count, dim(), h_keys, h_values, h_scores);\n      total_count += count;\n    }\n\n    return total_count;\n  }\n\n  /**\n   * @brief Load keys, vectors, scores from file to table.\n   *\n   * @param file An BaseKVFile defined the file format within filesystem.\n   * @param max_workspace_size Loading is conducted in chunks. 
This value\n   * denotes the maximum size of such chunks. Larger values *can* lead to higher\n   * performance.\n   * @param stream The CUDA stream used to execute the operation.\n   *\n   * @return Number of keys loaded from file.\n   */\n  size_type load(BaseKVFile<K, V, S>* file,\n                 const size_t max_workspace_size = 1L * 1024 * 1024,\n                 cudaStream_t stream = 0) {\n    const size_type tuple_size{sizeof(key_type) + sizeof(score_type) +\n                               sizeof(value_type) * dim()};\n    MERLIN_CHECK(max_workspace_size >= tuple_size,\n                 \"[HierarchicalKV] max_workspace_size is smaller than a single \"\n                 \"`key + score + value` tuple! Please set a larger value!\");\n\n    const size_type n{max_workspace_size / tuple_size};\n    const size_type ws_size{n * tuple_size};\n\n    // Grab enough host memory to hold batch data.\n    auto host_ws{host_mem_pool_->get_workspace<1>(ws_size, stream)};\n    auto h_keys{host_ws.get<key_type*>(0)};\n    auto h_scores{reinterpret_cast<score_type*>(h_keys + n)};\n    auto h_values{reinterpret_cast<value_type*>(h_scores + n)};\n\n    // Attempt a first read.\n    size_type count{file->read(n, dim(), h_keys, h_values, h_scores)};\n    if (count == 0) {\n      return 0;\n    }\n\n    // Grab equal amount of device memory as temporary storage.\n    auto dev_ws{dev_mem_pool_->get_workspace<1>(ws_size, stream)};\n    auto d_keys{dev_ws.get<key_type*>(0)};\n    auto d_scores{reinterpret_cast<score_type*>(d_keys + n)};\n    auto d_values{reinterpret_cast<value_type*>(d_scores + n)};\n\n    size_type total_count{0};\n    do {\n      if (count == n) {\n        CUDA_CHECK(cudaMemcpyAsync(d_keys, h_keys, ws_size,\n                                   cudaMemcpyHostToDevice, stream));\n      } else {\n        CUDA_CHECK(cudaMemcpyAsync(d_keys, h_keys, sizeof(key_type) * count,\n                                   cudaMemcpyHostToDevice, stream));\n        
CUDA_CHECK(cudaMemcpyAsync(d_scores, h_scores,\n                                   sizeof(score_type) * count,\n                                   cudaMemcpyHostToDevice, stream));\n        CUDA_CHECK(cudaMemcpyAsync(d_values, h_values,\n                                   sizeof(value_type) * dim() * count,\n                                   cudaMemcpyHostToDevice, stream));\n      }\n\n      set_global_epoch(static_cast<S>(IGNORED_GLOBAL_EPOCH));\n      insert_or_assign(count, d_keys, d_values, d_scores, stream, true, true);\n      total_count += count;\n\n      // Read next batch.\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      count = file->read(n, dim(), h_keys, h_values, h_scores);\n    } while (count > 0);\n\n    return total_count;\n  }\n\n  void set_global_epoch(const uint64_t epoch) { global_epoch_ = epoch; }\n\n private:\n  bool is_power(size_t base, size_t n) {\n    if (base < 2) {\n      throw std::invalid_argument(\"is_power with zero base.\");\n    }\n    while (n > 1) {\n      if (n % base != 0) {\n        return false;\n      }\n      n /= base;\n    }\n    return true;\n  }\n\n private:\n  inline bool is_fast_mode() const noexcept { return table_->is_pure_hbm; }\n\n  inline bool is_memory_mode() const noexcept {\n    return options_.table_mode == TableMode::kMemory;\n  }\n\n  /**\n   * @brief Returns the load factor by sampling up to 1024 buckets.\n   *\n   * @note For performance consideration, the returned load factor is\n   * inaccurate but within an error in 1% empirically which is enough for\n   * capacity control. 
But it's not suitable for end-users.\n   *\n   * @param delta A hypothetical upcoming change on table size.\n   * @param stream The CUDA stream used to execute the operation.\n   * @param need_lock If lock is needed.\n   *\n   * @return The evaluated load factor\n   */\n  inline float fast_load_factor(const size_type delta = 0,\n                                cudaStream_t stream = 0,\n                                const bool need_lock = true) const {\n    std::unique_ptr<read_shared_lock> lock_ptr;\n    if (options_.api_lock) {\n      lock_ptr =\n          std::make_unique<read_shared_lock>(mutex_, std::defer_lock, stream);\n      if (need_lock) {\n        lock_ptr->lock();\n      }\n    }\n\n    size_t N = std::min(table_->buckets_num, 1024UL);\n\n    auto sumOp = SumOp<int, int64_t>();\n    auto d_sum_bytes = sumOp.get_storage_bytes(N, stream);\n\n    MultiVector<int64_t, uint8_t> mv(1, d_sum_bytes);\n    const size_type dev_ws_size = mv.total_size();\n    auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)};\n    auto temp_storage = dev_ws.get<uint8_t*>(0);\n    auto d_total_size = get_vector<0>(mv, temp_storage);\n    auto d_sum_storage = get_vector<1>(mv, temp_storage);\n    sumOp.set_storage(reinterpret_cast<void*>(d_sum_storage));\n    sumOp.sum(N, table_->buckets_size, d_total_size, stream);\n\n    int64_t h_total_size = 0;\n    CUDA_CHECK(cudaMemcpyAsync(&h_total_size, d_total_size, sizeof(int64_t),\n                               cudaMemcpyDeviceToHost, stream));\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    CudaCheckError();\n    return static_cast<float>((delta * 1.0) / (capacity() * 1.0) +\n                              (h_total_size * 1.0) /\n                                  (options_.max_bucket_size * N * 1.0));\n  }\n\n  inline void check_evict_strategy(const score_type* scores) {\n    if (evict_strategy == EvictStrategy::kLru ||\n        evict_strategy == EvictStrategy::kEpochLru) {\n      MERLIN_CHECK(scores == nullptr,\n   
                \"the scores should not be specified when running on \"\n                   \"LRU or Epoch LRU mode.\");\n    }\n\n    if (evict_strategy == EvictStrategy::kLfu ||\n        evict_strategy == EvictStrategy::kEpochLfu) {\n      MERLIN_CHECK(scores != nullptr,\n                   \"the scores should be specified when running on \"\n                   \"LFU or Epoch LFU mode.\");\n    }\n\n    if (evict_strategy == EvictStrategy::kCustomized) {\n      MERLIN_CHECK(scores != nullptr,\n                   \"the scores should be specified when running on \"\n                   \"customized mode.\");\n    }\n\n    if ((evict_strategy == EvictStrategy::kEpochLru ||\n         evict_strategy == EvictStrategy::kEpochLfu)) {\n      MERLIN_CHECK(\n          global_epoch_ != static_cast<S>(IGNORED_GLOBAL_EPOCH),\n          \"the global_epoch is invalid and should be assigned by calling \"\n          \"`set_global_epoch` when running on \"\n          \"Epoch LRU or Epoch LFU mode.\");\n    }\n  }\n\n  /**\n   * @brief Synchronize the TableCore struct to replicas.\n   *\n   * @note For performance consideration, synchronize the TableCore struct to\n   * its replicas in constant memory and device memory when it's changed.\n   */\n  inline void sync_table_configuration() {\n    CUDA_CHECK(\n        cudaMemcpy(d_table_, table_, sizeof(TableCore), cudaMemcpyDefault));\n  }\n\n private:\n  HashTableOptions options_;\n  TableCore* table_ = nullptr;\n  TableCore* d_table_ = nullptr;\n  size_t shared_mem_size_ = 0;\n  int sm_cnt_ = 0;\n  int max_threads_per_block_ = 0;\n  std::atomic_bool reach_max_capacity_{false};\n  bool initialized_ = false;\n  mutable group_shared_mutex mutex_;\n  const unsigned int kernel_select_interval_ = 7;\n  std::unique_ptr<DeviceMemoryPool> dev_mem_pool_;\n  std::unique_ptr<HostMemoryPool> host_mem_pool_;\n  allocator_type* allocator_;\n  ThrustAllocator<uint8_t> thrust_allocator_;\n  bool default_allocator_ = true;\n  std::atomic<uint64_t> 
global_epoch_{\n      static_cast<uint64_t>(IGNORED_GLOBAL_EPOCH)};\n};\n\n}  // namespace merlin\n}  // namespace nv\n"
  },
  {
    "path": "include/merlin_localfile.hpp",
    "content": "/*\n * Copyright (c) 2022, NVIDIA CORPORATION.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n#pragma once\n\n#include <stddef.h>\n#include <stdio.h>\n#include <string>\n#include \"merlin/types.cuh\"\n\nnamespace nv {\nnamespace merlin {\n\n/**\n * The KV file on local file system. It only save/load keys and vectors\n * between table and file. `scores` are ignored in it since absolute\n * values of scores are commonly time-variant, while the time interval\n * between save/load calling is not deterministic, in default case. If\n * other specified rules are required, the BaseKVFile could be inherited\n * to implement customized read/write rules. The LocalKVFile uses compact,\n * consecutive binary format, where keys, values, and scores are stored in\n * seperated paths.\n *\n * @tparam K The data type of the key.\n * @tparam V The data type of the vector's elements.\n *         The item data type should be a basic data type of C++/CUDA.\n * @tparam S The data type for `score`.\n *           The currently supported data type is only `uint64_t`.\n *\n */\ntemplate <class K, class V, class M>\nclass LocalKVFile : public BaseKVFile<K, V, M> {\n public:\n  LocalKVFile() : keys_fp_(nullptr), values_fp_(nullptr), scores_fp_(nullptr) {}\n\n  ~LocalKVFile() { close(); }\n\n  /**\n   * @brief Open the file from local path. 
A LocalKVFile can only be\n   * read or written when it stays opened.\n   *\n   * @param keys_path Path to file to store keys.\n   * @param values_path Path to file to store values.\n   * @param scores_path Path to file to store scores.\n   * @param mode The mode to the file. The mode follows glibc style\n   *             and behavior like fopen.\n   */\n  bool open(const std::string& keys_path, const std::string& values_path,\n            const std::string& scores_path, const char* mode) {\n    close();\n    keys_fp_ = fopen(keys_path.c_str(), mode);\n    if (!keys_fp_) {\n      return false;\n    }\n    values_fp_ = fopen(values_path.c_str(), mode);\n    if (!values_fp_) {\n      close();\n      return false;\n    }\n    scores_fp_ = fopen(scores_path.c_str(), mode);\n    if (!scores_fp_) {\n      close();\n      return false;\n    }\n    return true;\n  }\n\n  /**\n   * @brief Close the file from open status and release fd(s) on files\n   * of keys, values, and scores.\n   */\n  void close() noexcept {\n    if (keys_fp_) {\n      fclose(keys_fp_);\n      keys_fp_ = nullptr;\n    }\n    if (values_fp_) {\n      fclose(values_fp_);\n      values_fp_ = nullptr;\n    }\n    if (scores_fp_) {\n      fclose(scores_fp_);\n      scores_fp_ = nullptr;\n    }\n  }\n\n  /**\n   * Read from file and fill into the keys, values, and scores buffer.\n   * When calling save/load method from table, it can assume that the\n   * received buffer of keys, vectors, and scores are automatically\n   * pre-allocated.\n   *\n   * @param n The number of KV pairs expect to read. 
`int64_t` was used\n   *          here to adapt to various filesystems and formats.\n   * @param dim The dimension of the `vectors`.\n   * @param keys The pointer to received buffer for keys.\n   * @param vectors The pointer to received buffer for vectors.\n   * @param scores The pointer to received buffer for scores.\n   *\n   * @return Number of KV pairs that have been successfully read.\n   */\n  size_t read(const size_t n, const size_t dim, K* keys, V* vectors,\n              M* scores) override {\n    size_t nread_keys =\n        fread(keys, sizeof(K), static_cast<size_t>(n), keys_fp_);\n    size_t nread_vecs =\n        fread(vectors, sizeof(V) * dim, static_cast<size_t>(n), values_fp_);\n    size_t nread_scores =\n        fread(scores, sizeof(M), static_cast<size_t>(n), scores_fp_);\n    if (nread_keys != nread_vecs || nread_keys != nread_scores) {\n      return 0;\n    }\n    return nread_keys;\n  }\n\n  /**\n   * Write keys, values, scores from table to the file.\n   *\n   * @param n The number of KV pairs to be written. 
`int64_t` was used\n   *          here to adapt to various filesytem and formats.\n   * @param dim The dimension of the `vectors`.\n   * @param keys The keys will be written to file.\n   * @param vectors The vectors of values will be written to file.\n   * @param scores The scores will be written to file.\n   *\n   * @return Number of KV pairs have been successfully written.\n   */\n  size_t write(const size_t n, const size_t dim, const K* keys,\n               const V* vectors, const M* scores) override {\n    size_t nwritten_keys =\n        fwrite(keys, sizeof(K), static_cast<size_t>(n), keys_fp_);\n    size_t nwritten_vecs =\n        fwrite(vectors, sizeof(V) * dim, static_cast<size_t>(n), values_fp_);\n    size_t nwritten_scores =\n        fwrite(scores, sizeof(M), static_cast<size_t>(n), scores_fp_);\n    if (nwritten_keys != nwritten_vecs || nwritten_keys != nwritten_scores) {\n      return 0;\n    }\n    return nwritten_keys;\n  }\n\n private:\n  FILE* keys_fp_;\n  FILE* values_fp_;\n  FILE* scores_fp_;\n};\n\n}  // namespace merlin\n}  // namespace nv\n"
  },
  {
    "path": "run_all_tests.sh",
    "content": "#!/bin/bash\n\n# Usage : `bash run_all_tests.sh`\n\n# Search for all binary files that end with \"test\"\nfiles=$(find ./build/ -type f -name \"*_test\" -executable)\n\n# Execute each file found\nhas_fail=false\nfor file in $files\ndo\n    echo \"Executing $file ...\"\n    ./$file\n    if ! [ $? -eq 0 ]; then\n      has_fail=true\n    fi\ndone\n\nif [ \"$has_fail\" = true ] ; then\n    exit 1\nfi"
  },
  {
    "path": "tests/accum_or_assign_test.cc.cu",
    "content": "/*\n * Copyright (c) 2023, NVIDIA CORPORATION.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n#include <gtest/gtest.h>\n#include <math.h>\n#include <stdio.h>\n#include <stdlib.h>\n#include <algorithm>\n#include <array>\n#include <iostream>\n#include <thread>\n#include <unordered_map>\n#include <vector>\n#include \"merlin_hashtable.cuh\"\n#include \"test_util.cuh\"\n\nconstexpr size_t DIM = 16;\nusing K = uint64_t;\nusing V = float;\nusing S = uint64_t;\nusing TableOptions = nv::merlin::HashTableOptions;\nusing EvictStrategy = nv::merlin::EvictStrategy;\n\ntemplate <class K, class S>\nstruct EraseIfPredFunctor {\n  __forceinline__ __device__ bool operator()(const K& key, S& score,\n                                             const K& pattern,\n                                             const S& threshold) {\n    return (((key & 0x1u) == 0x1u) && (score > threshold));\n  }\n};\n\ntemplate <class K, class S>\nstruct ExportIfPredFunctor {\n  __forceinline__ __device__ bool operator()(const K& key, S& score,\n                                             const K& pattern,\n                                             const S& threshold) {\n    return score > threshold;\n  }\n};\n\nvoid test_basic_when_full(size_t max_hbm_for_vectors, int key_start) {\n  constexpr uint64_t INIT_CAPACITY = 1 * 1024 * 1024UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t KEY_NUM = 1 * 1024 * 1024UL;\n  constexpr 
uint64_t TEST_TIMES = 1;\n\n  K* h_keys;\n  S* h_scores;\n  V* h_vectors;\n  bool* h_accum_or_assigns;\n  bool* h_found;\n\n  TableOptions options;\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMallocHost(&h_accum_or_assigns, KEY_NUM * sizeof(bool)));\n  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));\n\n  CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n\n  test_util::create_random_keys<K, S, V, DIM>(h_keys, h_scores, nullptr,\n                                              KEY_NUM);\n\n  K* d_keys;\n  S* d_scores = nullptr;\n  V* d_vectors;\n  bool* d_accum_or_assigns;\n  V* d_def_val;\n  V** d_vectors_ptr;\n  bool* d_found;\n\n  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_accum_or_assigns, KEY_NUM * sizeof(bool)));\n  CUDA_CHECK(cudaMalloc(&d_def_val, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n\n  CUDA_CHECK(\n      cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),\n                        cudaMemcpyHostToDevice));\n\n  CUDA_CHECK(cudaMemset(d_vectors, 1, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMemset(d_def_val, 2, KEY_NUM * sizeof(V) * options.dim));\n  
CUDA_CHECK(cudaMemset(d_vectors_ptr, 0, KEY_NUM * sizeof(V*)));\n  CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  uint64_t total_size = 0;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    table->accum_or_assign(KEY_NUM, d_keys, d_vectors, d_accum_or_assigns,\n                           d_scores, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    uint64_t total_size_after_insert = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    table->erase(KEY_NUM, d_keys, stream);\n    size_t total_size_after_erase = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size_after_erase, 0);\n\n    table->accum_or_assign(KEY_NUM, d_keys, d_vectors, d_accum_or_assigns,\n                           d_scores, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    uint64_t total_size_after_reinsert = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size_after_insert, total_size_after_reinsert);\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,\n                        cudaMemcpyDeviceToHost));\n\n  CUDA_CHECK(cudaFreeHost(h_keys));\n  CUDA_CHECK(cudaFreeHost(h_scores));\n  CUDA_CHECK(cudaFreeHost(h_found));\n  CUDA_CHECK(cudaFreeHost(h_accum_or_assigns));\n\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_vectors));\n  CUDA_CHECK(cudaFree(d_accum_or_assigns));\n  CUDA_CHECK(cudaFree(d_def_val));\n  CUDA_CHECK(cudaFree(d_vectors_ptr));\n  CUDA_CHECK(cudaFree(d_found));\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid 
test_erase_if_pred(size_t max_hbm_for_vectors, int key_start) {\n  constexpr uint64_t INIT_CAPACITY = 256UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t KEY_NUM = 128UL;\n  constexpr uint64_t TEST_TIMES = 1;\n  constexpr float true_ratio = 0.5;\n\n  K* h_keys;\n  S* h_scores;\n  V* h_vectors;\n  bool* h_found;\n  bool* h_accum_or_assigns;\n\n  TableOptions options;\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  std::unique_ptr<Table> table = std::make_unique<Table>();\n  table->init(options);\n\n  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));\n  CUDA_CHECK(cudaMallocHost(&h_accum_or_assigns, KEY_NUM * sizeof(bool)));\n\n  K* d_keys;\n  S* d_scores = nullptr;\n  V* d_vectors;\n  bool* d_found;\n  bool* d_accum_or_assigns;\n\n  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n  CUDA_CHECK(cudaMalloc(&d_accum_or_assigns, KEY_NUM * sizeof(bool)));\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  uint64_t total_size = 0;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n        h_keys, h_scores, h_vectors, KEY_NUM, INIT_CAPACITY);\n\n    test_util::create_random_bools<K>(h_accum_or_assigns, KEY_NUM, true_ratio);\n    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),\n                          
cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_accum_or_assigns, h_accum_or_assigns,\n                          KEY_NUM * sizeof(bool), cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    table->accum_or_assign(KEY_NUM, d_keys, d_vectors, d_accum_or_assigns,\n                           d_scores, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    size_t expected_size = 0;\n    for (size_t i = 0; i < KEY_NUM; i++) {\n      if (!h_accum_or_assigns[i]) expected_size++;\n    }\n    ASSERT_EQ(total_size, expected_size);\n\n    K pattern = 100;\n    S threshold = 0;\n    size_t erase_num = table->template erase_if<EraseIfPredFunctor>(\n        pattern, threshold, stream);\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ((erase_num + total_size), expected_size);\n\n    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    table->find(KEY_NUM, d_keys, d_vectors, d_found, d_scores, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    int found_num = 0;\n\n    CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));\n    CUDA_CHECK(cudaMemset(h_scores, 0, KEY_NUM * sizeof(S)));\n    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * 
sizeof(S),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n    for (int i = 0; i < KEY_NUM; i++) {\n      if (h_found[i]) {\n        found_num++;\n        ASSERT_EQ(h_scores[i], h_keys[i]);\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors[i * options.dim + j],\n                    static_cast<float>(h_keys[i] * 0.00001));\n        }\n      }\n    }\n    ASSERT_EQ(found_num, (expected_size - erase_num));\n\n    table->clear(stream);\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,\n                        cudaMemcpyDeviceToHost));\n\n  CUDA_CHECK(cudaFreeHost(h_keys));\n  CUDA_CHECK(cudaFreeHost(h_scores));\n  CUDA_CHECK(cudaFreeHost(h_found));\n  CUDA_CHECK(cudaFreeHost(h_accum_or_assigns));\n\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_vectors));\n  CUDA_CHECK(cudaFree(d_found));\n  CUDA_CHECK(cudaFree(d_accum_or_assigns));\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_rehash(size_t max_hbm_for_vectors, int key_start) {\n  constexpr uint64_t BUCKET_MAX_SIZE = 128ul;\n  constexpr uint64_t INIT_CAPACITY = BUCKET_MAX_SIZE;\n  constexpr uint64_t MAX_CAPACITY = 4 * INIT_CAPACITY;\n  constexpr uint64_t KEY_NUM = BUCKET_MAX_SIZE * 2;\n  constexpr uint64_t TEST_TIMES = 100;\n  constexpr float true_ratio = 0.5;\n\n  K* h_keys;\n  S* h_scores;\n  V* h_vectors;\n  bool* h_found;\n  bool* h_accum_or_assigns;\n\n  TableOptions options;\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  
options.max_bucket_size = BUCKET_MAX_SIZE;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));\n  CUDA_CHECK(cudaMallocHost(&h_accum_or_assigns, KEY_NUM * sizeof(bool)));\n\n  K* d_keys;\n  S* d_scores = nullptr;\n  V* d_vectors;\n  bool* d_found;\n  bool* d_accum_or_assigns;\n  size_t dump_counter = 0;\n\n  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n  CUDA_CHECK(cudaMalloc(&d_accum_or_assigns, KEY_NUM * sizeof(bool)));\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  uint64_t total_size = 0;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n\n    test_util::create_random_bools<K>(h_accum_or_assigns, KEY_NUM, true_ratio);\n    test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n        h_keys, h_scores, h_vectors, KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE);\n    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_accum_or_assigns, h_accum_or_assigns,\n                          KEY_NUM * sizeof(bool), cudaMemcpyHostToDevice));\n    
CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    table->accum_or_assign(KEY_NUM, d_keys, d_vectors, d_accum_or_assigns,\n                           d_scores, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaDeviceSynchronize());\n\n    size_t expected_size = 0;\n    for (size_t i = 0; i < KEY_NUM; i++) {\n      if (!h_accum_or_assigns[i]) expected_size++;\n    }\n    ASSERT_EQ(total_size, expected_size);\n\n    dump_counter = table->export_batch(table->capacity(), 0, d_keys, d_vectors,\n                                       d_scores, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(dump_counter, expected_size);\n\n    table->reserve(MAX_CAPACITY, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(table->capacity(), MAX_CAPACITY);\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, expected_size);\n\n    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    table->find(KEY_NUM, d_keys, d_vectors, d_found, d_scores, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    int found_num = 0;\n\n    CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));\n    CUDA_CHECK(cudaMemset(h_scores, 0, KEY_NUM * sizeof(S)));\n    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),\n                  
        cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n    for (int i = 0; i < KEY_NUM; i++) {\n      if (h_found[i]) {\n        found_num++;\n        ASSERT_EQ(h_scores[i], h_keys[i]);\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors[i * options.dim + j],\n                    static_cast<float>(h_keys[i] * 0.00001));\n        }\n      }\n    }\n    ASSERT_EQ(found_num, expected_size);\n\n    table->clear(stream);\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,\n                        cudaMemcpyDeviceToHost));\n\n  CUDA_CHECK(cudaFreeHost(h_keys));\n  CUDA_CHECK(cudaFreeHost(h_scores));\n  CUDA_CHECK(cudaFreeHost(h_found));\n  CUDA_CHECK(cudaFreeHost(h_accum_or_assigns));\n\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_vectors));\n  CUDA_CHECK(cudaFree(d_found));\n  CUDA_CHECK(cudaFree(d_accum_or_assigns));\n\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_rehash_on_big_batch(size_t max_hbm_for_vectors, int key_start) {\n  constexpr uint64_t INIT_CAPACITY = 1024;\n  constexpr uint64_t MAX_CAPACITY = 16 * 1024;\n  constexpr uint64_t INIT_KEY_NUM = 1024;\n  constexpr uint64_t KEY_NUM = 2048;\n\n  std::unordered_map<K, float> expected_values;\n  K* h_keys;\n  S* h_scores;\n  V* h_vectors;\n  bool* h_found;\n  bool* h_accum_or_assigns;\n  bool* h_accum_or_assigns_init;\n  float true_ratio = 0.6f;\n\n  TableOptions options;\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_bucket_size = 128;\n  
options.max_load_factor = 0.6;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));\n  CUDA_CHECK(cudaMallocHost(&h_accum_or_assigns, KEY_NUM * sizeof(bool)));\n  CUDA_CHECK(cudaMallocHost(&h_accum_or_assigns_init, KEY_NUM * sizeof(bool)));\n\n  K* d_keys;\n  S* d_scores = nullptr;\n  V* d_vectors;\n  bool* d_accum_or_assigns;\n  bool* d_found;\n  size_t dump_counter = 0;\n\n  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_accum_or_assigns, KEY_NUM * sizeof(bool)));\n  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  uint64_t total_size = 0;\n  uint64_t expected_size = 0;\n  std::unique_ptr<Table> table = std::make_unique<Table>();\n  table->init(options);\n\n  test_util::create_random_keys<K, S, V, DIM>(h_keys, h_scores, h_vectors,\n                                              KEY_NUM);\n\n  CUDA_CHECK(\n      cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),\n                        cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors, KEY_NUM * sizeof(V) * options.dim,\n                        cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n  CUDA_CHECK(cudaMemset(d_accum_or_assigns, 0, KEY_NUM * sizeof(bool)));\n\n  total_size = table->size(stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  
ASSERT_EQ(total_size, 0);\n\n  test_util::create_random_bools<K>(h_accum_or_assigns, INIT_KEY_NUM,\n                                    true_ratio);\n  CUDA_CHECK(cudaMemcpy(d_accum_or_assigns, h_accum_or_assigns,\n                        INIT_KEY_NUM * sizeof(bool), cudaMemcpyHostToDevice));\n\n  table->accum_or_assign(INIT_KEY_NUM, d_keys, d_vectors, d_accum_or_assigns,\n                         d_scores, stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  ASSERT_GE(table->capacity(), INIT_CAPACITY * 2);\n\n  expected_size = 0;\n  for (int i = 0; i < INIT_KEY_NUM; i++)\n    expected_size += (h_accum_or_assigns[i] ? 0 : 1);\n\n  total_size = table->size(stream);\n  CUDA_CHECK(cudaDeviceSynchronize());\n  ASSERT_EQ(total_size, expected_size);\n  ASSERT_EQ(table->capacity(), (INIT_CAPACITY * 2));\n\n  CUDA_CHECK(cudaMemcpy(h_accum_or_assigns_init, h_accum_or_assigns,\n                        KEY_NUM * sizeof(bool), cudaMemcpyHostToHost));\n  test_util::create_random_bools<K>(h_accum_or_assigns, KEY_NUM, true_ratio);\n  CUDA_CHECK(cudaMemcpy(d_accum_or_assigns, h_accum_or_assigns,\n                        KEY_NUM * sizeof(bool), cudaMemcpyHostToDevice));\n\n  table->accum_or_assign(KEY_NUM, d_keys, d_vectors, d_accum_or_assigns,\n                         d_scores, stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n\n  expected_size = 0;\n  for (int i = 0; i < KEY_NUM; i++) {\n    if (i < INIT_KEY_NUM) {\n      if (h_accum_or_assigns_init[i]) {\n        if (h_accum_or_assigns[i]) {\n        } else {\n          expected_size++;\n          expected_values[h_keys[i]] = static_cast<float>(h_keys[i] * 0.00001);\n        }\n      } else {\n        expected_size++;\n        if (h_accum_or_assigns[i]) {\n          expected_values[h_keys[i]] = static_cast<float>(h_keys[i] * 0.00002);\n        } else {\n          expected_values[h_keys[i]] = static_cast<float>(h_keys[i] * 0.00001);\n        }\n      }\n    }\n    if (i >= INIT_KEY_NUM && (!h_accum_or_assigns[i])) {\n 
     expected_size++;\n      expected_values[h_keys[i]] = static_cast<float>(h_keys[i] * 0.00001);\n    }\n  }\n\n  total_size = table->size(stream);\n  CUDA_CHECK(cudaDeviceSynchronize());\n  ASSERT_EQ(total_size, expected_size);\n  ASSERT_GE(table->capacity(), KEY_NUM * 2);\n\n  dump_counter = table->export_batch(table->capacity(), 0, d_keys, d_vectors,\n                                     d_scores, stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  ASSERT_EQ(dump_counter, expected_size);\n\n  CUDA_CHECK(\n      cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMemset(d_scores, 0, KEY_NUM * sizeof(S)));\n  table->find(KEY_NUM, d_keys, d_vectors, d_found, d_scores, stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  int found_num = 0;\n\n  CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));\n  CUDA_CHECK(cudaMemset(h_scores, 0, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(\n      cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K), cudaMemcpyDeviceToHost));\n  CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                        cudaMemcpyDeviceToHost));\n  CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),\n                        cudaMemcpyDeviceToHost));\n  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,\n                        cudaMemcpyDeviceToHost));\n  for (int i = 0; i < KEY_NUM; i++) {\n    if (h_found[i]) {\n      found_num++;\n      ASSERT_EQ(h_scores[i], h_keys[i]);\n      for (int j = 0; j < options.dim; j++) {\n        ASSERT_EQ(h_vectors[i * options.dim + j], expected_values[h_keys[i]]);\n      }\n    }\n  }\n  ASSERT_EQ(found_num, expected_size);\n\n  table->clear(stream);\n  total_size = table->size(stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  ASSERT_EQ(total_size, 0);\n  
CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,\n                        cudaMemcpyDeviceToHost));\n\n  CUDA_CHECK(cudaFreeHost(h_keys));\n  CUDA_CHECK(cudaFreeHost(h_scores));\n  CUDA_CHECK(cudaFreeHost(h_accum_or_assigns));\n  CUDA_CHECK(cudaFreeHost(h_accum_or_assigns_init));\n  CUDA_CHECK(cudaFreeHost(h_found));\n\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_vectors));\n  CUDA_CHECK(cudaFree(d_accum_or_assigns));\n  CUDA_CHECK(cudaFree(d_found));\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n//\n// void test_dynamic_rehash_on_multi_threads(size_t max_hbm_for_vectors) {\n//  constexpr uint64_t BUCKET_MAX_SIZE = 128ul;\n//  constexpr uint64_t INIT_CAPACITY = 4 * 1024;\n//  constexpr uint64_t MAX_CAPACITY = 16 * 1024 * INIT_CAPACITY;\n//  constexpr uint64_t KEY_NUM = 256;\n//  constexpr uint64_t THREAD_N = 8;\n//\n//  std::vector<std::thread> threads;\n//\n//  TableOptions options;\n//\n//  options.init_capacity = INIT_CAPACITY;\n//  options.max_capacity = MAX_CAPACITY;\n//  options.dim = DIM;\n//  options.max_load_factor = 0.50f;\n//  options.max_bucket_size = BUCKET_MAX_SIZE;\n//  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n//  using Table = nv::merlin::HashTable<K, V, S,\n//  EvictStrategy::kLru>;\n//\n//  std::shared_ptr<Table> table = std::make_shared<Table>();\n//  table->init(options);\n//\n//  auto worker_function = [&table, KEY_NUM, options](int task_n) {\n//    K* h_keys;\n//    V* h_vectors;\n//    bool* h_found;\n//\n//    size_t current_capacity = table->capacity();\n//\n//    CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));\n//    CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));\n//    CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));\n//\n//    K* d_keys;\n//    V* d_vectors;\n//    bool* d_found;\n//\n//    CUDA_CHECK(cudaMalloc(&d_keys, 
KEY_NUM * sizeof(K)));\n//    CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));\n//    CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n//\n//    cudaStream_t stream;\n//    CUDA_CHECK(cudaStreamCreate(&stream));\n//\n//    while (table->capacity() < MAX_CAPACITY) {\n//      test_util::create_random_keys<K, S, V, DIM>(h_keys, nullptr, h_vectors,\n//                                                  KEY_NUM);\n//      CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),\n//                            cudaMemcpyHostToDevice));\n//      CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,\n//                            KEY_NUM * sizeof(V) * options.dim,\n//                            cudaMemcpyHostToDevice));\n//      CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n//\n//      table->accum_or_assign(KEY_NUM, d_keys, d_vectors, nullptr, stream);\n//      CUDA_CHECK(cudaStreamSynchronize(stream));\n//\n//      CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n//      table->find(KEY_NUM, d_keys, d_vectors, d_found, nullptr, stream);\n//\n//      CUDA_CHECK(cudaStreamSynchronize(stream));\n//      int found_num = 0;\n//\n//      CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));\n//      CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n//      CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),\n//                            cudaMemcpyDeviceToHost));\n//      CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n//                            cudaMemcpyDeviceToHost));\n//\n//      CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n//                            KEY_NUM * sizeof(V) * options.dim,\n//                            cudaMemcpyDeviceToHost));\n//      for (int i = 0; i < KEY_NUM; i++) {\n//        if (h_found[i]) {\n//          found_num++;\n//          for (int j = 0; j < options.dim; j++) {\n//            ASSERT_EQ(h_vectors[i * options.dim + j],\n//     
                 static_cast<float>(h_keys[i] * 0.00001));\n//          }\n//        }\n//      }\n//      ASSERT_EQ(found_num, KEY_NUM);\n//      if (task_n == 0 && current_capacity != table->capacity()) {\n//        std::cout << \"[test_dynamic_rehash_on_multi_threads] The capacity \"\n//                     \"changed from \"\n//                  << current_capacity << \" to \" << table->capacity()\n//                  << std::endl;\n//        current_capacity = table->capacity();\n//      }\n//      CUDA_CHECK(cudaStreamSynchronize(stream));\n//    }\n//    CUDA_CHECK(cudaStreamDestroy(stream));\n//\n//    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n//                          KEY_NUM * sizeof(V) * options.dim,\n//                          cudaMemcpyDeviceToHost));\n//\n//    CUDA_CHECK(cudaFreeHost(h_keys));\n//    CUDA_CHECK(cudaFreeHost(h_found));\n//    CUDA_CHECK(cudaFreeHost(h_vectors));\n//\n//    CUDA_CHECK(cudaFree(d_keys));\n//    CUDA_CHECK(cudaFree(d_vectors));\n//    CUDA_CHECK(cudaFree(d_found));\n//    CUDA_CHECK(cudaDeviceSynchronize());\n//\n//    CudaCheckError();\n//  };\n//\n//  for (int i = 0; i < THREAD_N; ++i)\n//    threads.emplace_back(std::thread(worker_function, i));\n//\n//  for (auto& th : threads) {\n//    th.join();\n//  }\n//  ASSERT_EQ(table->capacity(), MAX_CAPACITY);\n//}\n//\nvoid test_export_batch_if(size_t max_hbm_for_vectors, int key_start) {\n  constexpr uint64_t INIT_CAPACITY = 256UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t KEY_NUM = 128UL;\n  constexpr uint64_t TEST_TIMES = 1;\n  constexpr float true_ratio = 0.6;\n\n  K* h_keys;\n  S* h_scores;\n  V* h_vectors;\n  bool* h_accum_or_assigns;\n  size_t h_dump_counter = 0;\n\n  TableOptions options;\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = 
nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;\n\n  std::unique_ptr<Table> table = std::make_unique<Table>();\n  table->init(options);\n\n  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMallocHost(&h_accum_or_assigns, KEY_NUM * sizeof(bool)));\n  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));\n\n  K* d_keys;\n  S* d_scores = nullptr;\n  V* d_vectors;\n  bool* d_found;\n  bool* d_accum_or_assigns;\n  size_t* d_dump_counter;\n  int found_num = 0;\n  bool* h_found;\n\n  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n  CUDA_CHECK(cudaMalloc(&d_accum_or_assigns, KEY_NUM * sizeof(bool)));\n  CUDA_CHECK(cudaMalloc(&d_dump_counter, sizeof(size_t)));\n\n  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  uint64_t total_size = 0;\n  K pattern = 100;\n  S threshold = test_util::host_nano<S>(stream);\n\n  for (int i = 0; i < TEST_TIMES; i++) {\n    test_util::create_random_bools<K>(h_accum_or_assigns, KEY_NUM, true_ratio);\n    test_util::create_random_keys<K, S, V, DIM>(h_keys, h_scores, h_vectors,\n                                                KEY_NUM);\n\n    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_accum_or_assigns, h_accum_or_assigns,\n                          KEY_NUM * sizeof(bool), cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          
cudaMemcpyHostToDevice));\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    table->accum_or_assign(KEY_NUM, d_keys, d_vectors, d_accum_or_assigns,\n                           nullptr, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    size_t expected_size = 0;\n    for (size_t i = 0; i < KEY_NUM; i++) {\n      if (!h_accum_or_assigns[i]) expected_size++;\n    }\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, expected_size);\n\n    CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));\n    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n\n    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    table->find(KEY_NUM, d_keys, d_vectors, d_found, nullptr, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n    found_num = 0;\n    for (int i = 0; i < KEY_NUM; i++) {\n      if (h_found[i]) {\n        found_num++;\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors[i * options.dim + j],\n                    static_cast<float>(h_keys[i] * 0.00001));\n        }\n      }\n    }\n    ASSERT_EQ(found_num, expected_size);\n\n    table->template export_batch_if<ExportIfPredFunctor>(\n        pattern, threshold, table->capacity(), 0, d_dump_counter, d_keys,\n        d_vectors, d_scores, stream);\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    CUDA_CHECK(cudaMemcpy(&h_dump_counter, d_dump_counter, sizeof(size_t),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, h_dump_counter * sizeof(S),\n       
                   cudaMemcpyDeviceToHost));\n\n    size_t expected_export_count = 0;\n    for (int i = 0; i < h_dump_counter; i++) {\n      if (h_scores[i] > threshold) expected_export_count++;\n    }\n    ASSERT_EQ(expected_export_count, h_dump_counter);\n\n    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n\n    for (int i = 0; i < h_dump_counter; i++) {\n      ASSERT_GT(h_scores[i], threshold);\n      for (int j = 0; j < options.dim; j++) {\n        ASSERT_EQ(h_vectors[i * options.dim + j],\n                  static_cast<float>(h_keys[i] * 0.00001));\n      }\n    }\n\n    table->clear(stream);\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n  }\n  CUDA_CHECK(cudaDeviceSynchronize());\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFreeHost(h_keys));\n  CUDA_CHECK(cudaFreeHost(h_scores));\n  CUDA_CHECK(cudaFreeHost(h_found));\n  CUDA_CHECK(cudaFreeHost(h_accum_or_assigns));\n\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_vectors));\n  CUDA_CHECK(cudaFree(d_found));\n  CUDA_CHECK(cudaFree(d_accum_or_assigns));\n  CUDA_CHECK(cudaFree(d_dump_counter));\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_evict_strategy_lru_basic(size_t max_hbm_for_vectors) {\n  constexpr uint64_t BUCKET_NUM = 8UL;\n  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;\n  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;\n  constexpr uint64_t TEST_KEY_NUM = 
4;\n  constexpr uint64_t TEMP_KEY_NUM =\n      (BASE_KEY_NUM > TEST_KEY_NUM) ? BASE_KEY_NUM : TEST_KEY_NUM;\n  constexpr uint64_t TEST_TIMES = 128;\n  constexpr float true_ratio = 0.5;\n\n  TableOptions options;\n\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;\n\n  std::vector<K> h_keys_base(BASE_KEY_NUM);\n  std::vector<S> h_scores_base(BASE_KEY_NUM);\n  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);\n  std::vector<uint8_t> h_accum_or_assigns_base(BASE_KEY_NUM);\n\n  std::vector<K> h_keys_test(TEST_KEY_NUM);\n  std::vector<S> h_scores_test(TEST_KEY_NUM);\n  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);\n  std::vector<uint8_t> h_accum_or_assigns_test(TEST_KEY_NUM);\n\n  std::vector<K> h_keys_temp(TEMP_KEY_NUM);\n  std::vector<S> h_scores_temp(TEMP_KEY_NUM);\n  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);\n\n  K* d_keys_temp;\n  S* d_scores_temp = nullptr;\n  V* d_vectors_temp;\n  bool* d_accum_or_assigns_temp;\n\n  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));\n  CUDA_CHECK(\n      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));\n\n  CUDA_CHECK(cudaMalloc(&d_accum_or_assigns_temp, TEMP_KEY_NUM * sizeof(bool)));\n\n  test_util::create_random_bools<K>(\n      reinterpret_cast<bool*>(h_accum_or_assigns_base.data()), BASE_KEY_NUM,\n      true_ratio);\n  test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),\n      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF);\n\n  test_util::create_random_bools<K>(\n      reinterpret_cast<bool*>(h_accum_or_assigns_test.data()), TEST_KEY_NUM,\n      true_ratio);\n  test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n      
h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),\n      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,\n      0xFFFFFFFFFFFFFFFD);\n\n  h_accum_or_assigns_base[72] = false;\n  h_accum_or_assigns_base[73] = false;\n\n  h_keys_test[2] = h_keys_base[72];\n  h_keys_test[3] = h_keys_base[73];\n\n  h_accum_or_assigns_test[2] = true;\n  h_accum_or_assigns_test[3] = false;\n\n  for (int i = 0; i < options.dim; i++) {\n    h_vectors_test[2 * options.dim + i] = h_vectors_base[72 * options.dim + i];\n    h_vectors_test[3 * options.dim + i] = h_vectors_base[73 * options.dim + i];\n  }\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  size_t total_size = 0;\n  size_t dump_counter = 0;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      CUDA_CHECK(\n          cudaMemcpy(d_accum_or_assigns_temp, h_accum_or_assigns_base.data(),\n                     BASE_KEY_NUM * sizeof(uint8_t), cudaMemcpyHostToDevice));\n      S start_ts = test_util::host_nano<S>(stream);\n      table->accum_or_assign(BASE_KEY_NUM, d_keys_temp, d_vectors_temp,\n                             d_accum_or_assigns_temp, nullptr, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      S end_ts = test_util::host_nano<S>(stream);\n\n      size_t total_size = 
table->size(stream);\n      size_t expected_size = 0;\n      for (int i = 0; i < BASE_KEY_NUM; i++) {\n        if (!h_accum_or_assigns_base[i]) expected_size++;\n      }\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, expected_size);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, expected_size);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      for (int i = 0; i < dump_counter; i++) {\n        ASSERT_GE(h_scores_temp[i], start_ts);\n        ASSERT_LE(h_scores_temp[i], end_ts);\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),\n                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),\n                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),\n                            TEST_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      CUDA_CHECK(\n          cudaMemcpy(d_accum_or_assigns_temp, h_accum_or_assigns_test.data(),\n                     TEST_KEY_NUM * sizeof(bool), cudaMemcpyHostToDevice));\n      S start_ts = 
test_util::host_nano<S>(stream);\n      table->accum_or_assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp,\n                             d_accum_or_assigns_temp, nullptr, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      S end_ts = test_util::host_nano<S>(stream);\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t expected_size = 0;\n      for (int i = 0; i < BASE_KEY_NUM; i++) {\n        if (!h_accum_or_assigns_base[i]) expected_size++;\n      }\n\n      for (int i = 0; i < TEST_KEY_NUM; i++) {\n        if ((h_keys_base.end() == std::find(h_keys_base.begin(),\n                                            h_keys_base.end(),\n                                            h_keys_test[i])) &&\n            !h_accum_or_assigns_test[i])\n          expected_size++;\n      }\n      ASSERT_EQ(total_size, expected_size);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, expected_size);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            TEMP_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      for (int i = 0; i < dump_counter; i++) {\n        bool is_accum = (h_keys_temp[i] == h_keys_test[2]);\n        bool is_new_insert =\n            (h_keys_test.end() !=\n             std::find(h_keys_test.begin(), h_keys_test.end(), h_keys_temp[i]));\n        if (is_accum) {\n          for (int j = 0; j < options.dim; j++) {\n            ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                      
static_cast<float>(h_keys_temp[i] * 0.00002));\n          }\n        } else {\n          for (int j = 0; j < options.dim; j++) {\n            ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                      static_cast<float>(h_keys_temp[i] * 0.00001));\n          }\n        }\n        if (is_accum || (is_new_insert && (h_keys_temp[i] != h_keys_test[3]))) {\n          ASSERT_GE(h_scores_temp[i], start_ts);\n          ASSERT_LE(h_scores_temp[i], end_ts);\n        } else {\n          ASSERT_LE(h_scores_temp[i], start_ts);\n        }\n      }\n    }\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFree(d_keys_temp));\n  CUDA_CHECK(cudaFree(d_scores_temp));\n  CUDA_CHECK(cudaFree(d_vectors_temp));\n  CUDA_CHECK(cudaFree(d_accum_or_assigns_temp));\n\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_evict_strategy_lfu_basic(size_t max_hbm_for_vectors, int key_start) {\n  constexpr uint64_t BUCKET_NUM = 8UL;\n  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;\n  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;\n  constexpr uint64_t TEST_KEY_NUM = 4;\n  constexpr uint64_t TEMP_KEY_NUM =\n      (BASE_KEY_NUM > TEST_KEY_NUM) ? 
BASE_KEY_NUM : TEST_KEY_NUM;\n  constexpr uint64_t TEST_TIMES = 1024;\n  constexpr float true_ratio = 0.5;\n\n  TableOptions options;\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLfu>;\n\n  std::vector<K> h_keys_base(BASE_KEY_NUM);\n  std::vector<S> h_scores_base(BASE_KEY_NUM);\n  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);\n  std::vector<uint8_t> h_accum_or_assigns_base(BASE_KEY_NUM);\n\n  std::vector<K> h_keys_test(TEST_KEY_NUM);\n  std::vector<S> h_scores_test(TEST_KEY_NUM);\n  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);\n  std::vector<uint8_t> h_accum_or_assigns_test(TEST_KEY_NUM);\n\n  std::vector<K> h_keys_temp(TEMP_KEY_NUM);\n  std::vector<S> h_scores_temp(TEMP_KEY_NUM);\n  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);\n\n  K* d_keys_temp;\n  S* d_scores_temp = nullptr;\n  V* d_vectors_temp;\n  bool* d_accum_or_assigns_temp;\n\n  int freq_range = 1000;\n\n  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));\n  CUDA_CHECK(\n      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));\n\n  CUDA_CHECK(cudaMalloc(&d_accum_or_assigns_temp, TEMP_KEY_NUM * sizeof(bool)));\n\n  test_util::create_random_bools<K>(\n      reinterpret_cast<bool*>(h_accum_or_assigns_base.data()), BASE_KEY_NUM,\n      true_ratio);\n\n  test_util::create_random_bools<K>(\n      reinterpret_cast<bool*>(h_accum_or_assigns_test.data()), TEST_KEY_NUM,\n      true_ratio);\n\n  for (int i = 0; i < TEST_TIMES; i++) {\n    test_util::create_keys_in_one_buckets_lfu<K, S, V, DIM>(\n        h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),\n        BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF,\n        
freq_range);\n\n    test_util::create_keys_in_one_buckets_lfu<K, S, V, DIM>(\n        h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),\n        TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,\n        0xFFFFFFFFFFFFFFFD, freq_range);\n\n    h_accum_or_assigns_base[72] = false;\n    h_accum_or_assigns_base[73] = false;\n\n    h_keys_test[2] = h_keys_base[72];\n    h_keys_test[3] = h_keys_base[73];\n\n    h_accum_or_assigns_test[2] = true;\n    h_accum_or_assigns_test[3] = false;\n\n    h_scores_test[2] = h_keys_base[72] % freq_range;\n    h_scores_test[3] = h_keys_base[73] % freq_range;\n\n    for (int i = 0; i < options.dim; i++) {\n      h_vectors_test[2 * options.dim + i] =\n          h_vectors_base[72 * options.dim + i];\n      h_vectors_test[3 * options.dim + i] =\n          h_vectors_base[73 * options.dim + i];\n    }\n    cudaStream_t stream;\n    CUDA_CHECK(cudaStreamCreate(&stream));\n\n    size_t total_size = 0;\n    size_t dump_counter = 0;\n    S global_epoch = 1;\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      CUDA_CHECK(\n          cudaMemcpy(d_accum_or_assigns_temp, h_accum_or_assigns_base.data(),\n                     BASE_KEY_NUM * sizeof(uint8_t), cudaMemcpyHostToDevice));\n\n      table->set_global_epoch(global_epoch);\n      table->accum_or_assign(BASE_KEY_NUM, 
d_keys_temp, d_vectors_temp,\n                             d_accum_or_assigns_temp, d_scores_temp, stream);\n\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      size_t expected_size = 0;\n      for (int i = 0; i < BASE_KEY_NUM; i++) {\n        if (!h_accum_or_assigns_base[i]) expected_size++;\n      }\n\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, expected_size);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, expected_size);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      for (int i = 0; i < dump_counter; i++) {\n        ASSERT_EQ(h_scores_temp[i], h_keys_temp[i] % freq_range);\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n\n    {\n      global_epoch++;\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),\n                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),\n                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),\n                            TEST_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      CUDA_CHECK(\n          
cudaMemcpy(d_accum_or_assigns_temp, h_accum_or_assigns_test.data(),\n                     TEST_KEY_NUM * sizeof(bool), cudaMemcpyHostToDevice));\n\n      table->set_global_epoch(global_epoch);\n      table->accum_or_assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp,\n                             d_accum_or_assigns_temp, d_scores_temp, stream);\n\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      size_t expected_size = 0;\n      for (int i = 0; i < BASE_KEY_NUM; i++) {\n        if (!h_accum_or_assigns_base[i]) expected_size++;\n      }\n\n      for (int i = 0; i < TEST_KEY_NUM; i++) {\n        if ((h_keys_base.end() == std::find(h_keys_base.begin(),\n                                            h_keys_base.end(),\n                                            h_keys_test[i])) &&\n            !h_accum_or_assigns_test[i])\n          expected_size++;\n      }\n      ASSERT_EQ(total_size, expected_size);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, expected_size);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            TEMP_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      for (int i = 0; i < dump_counter; i++) {\n        bool is_accum = (h_keys_temp[i] == h_keys_test[2]);\n        bool is_new_insert =\n            (h_keys_test.end() !=\n             std::find(h_keys_test.begin(), h_keys_test.end(), h_keys_temp[i]));\n\n        if (is_accum) {\n          
ASSERT_EQ(h_scores_temp[i], (h_keys_temp[i] % freq_range) * 2);\n          for (int j = 0; j < options.dim; j++) {\n            ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                      static_cast<float>(h_keys_temp[i] * 0.00002));\n          }\n        } else {\n          ASSERT_EQ(h_scores_temp[i], (h_keys_temp[i] % freq_range));\n          for (int j = 0; j < options.dim; j++) {\n            ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                      static_cast<float>(h_keys_temp[i] * 0.00001));\n          }\n        }\n      }\n    }\n    CUDA_CHECK(cudaStreamDestroy(stream));\n  }\n\n  CUDA_CHECK(cudaFree(d_keys_temp));\n  CUDA_CHECK(cudaFree(d_scores_temp));\n  CUDA_CHECK(cudaFree(d_vectors_temp));\n  CUDA_CHECK(cudaFree(d_accum_or_assigns_temp));\n\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_evict_strategy_epochlru_basic(size_t max_hbm_for_vectors,\n                                        int key_start = 0) {\n  constexpr int RSHIFT_ON_NANO = 20;\n\n  constexpr uint64_t BUCKET_NUM = 8UL;\n  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;\n  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;\n  constexpr uint64_t TEST_KEY_NUM = 4;\n  constexpr uint64_t TEMP_KEY_NUM =\n      (BASE_KEY_NUM > TEST_KEY_NUM) ? 
BASE_KEY_NUM : TEST_KEY_NUM;\n  constexpr uint64_t TEST_TIMES = 128;\n  constexpr float true_ratio = 0.5;\n\n  TableOptions options;\n\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kEpochLru>;\n\n  std::vector<K> h_keys_base(BASE_KEY_NUM);\n  std::vector<S> h_scores_base(BASE_KEY_NUM);\n  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);\n  std::vector<uint8_t> h_accum_or_assigns_base(BASE_KEY_NUM);\n\n  std::vector<K> h_keys_test(TEST_KEY_NUM);\n  std::vector<S> h_scores_test(TEST_KEY_NUM);\n  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);\n  std::vector<uint8_t> h_accum_or_assigns_test(TEST_KEY_NUM);\n\n  std::vector<K> h_keys_temp(TEMP_KEY_NUM);\n  std::vector<S> h_scores_temp(TEMP_KEY_NUM);\n  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);\n\n  K* d_keys_temp;\n  S* d_scores_temp = nullptr;\n  V* d_vectors_temp;\n  bool* d_accum_or_assigns_temp;\n\n  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));\n  CUDA_CHECK(\n      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));\n\n  CUDA_CHECK(cudaMalloc(&d_accum_or_assigns_temp, TEMP_KEY_NUM * sizeof(bool)));\n\n  test_util::create_random_bools<K>(\n      reinterpret_cast<bool*>(h_accum_or_assigns_base.data()), BASE_KEY_NUM,\n      true_ratio);\n  test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),\n      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF);\n\n  test_util::create_random_bools<K>(\n      reinterpret_cast<bool*>(h_accum_or_assigns_test.data()), TEST_KEY_NUM,\n      true_ratio);\n  test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n      h_keys_test.data(), 
h_scores_test.data(), h_vectors_test.data(),\n      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,\n      0xFFFFFFFFFFFFFFFD);\n\n  h_accum_or_assigns_base[72] = false;\n  h_accum_or_assigns_base[73] = false;\n\n  h_keys_test[2] = h_keys_base[72];\n  h_keys_test[3] = h_keys_base[73];\n\n  h_accum_or_assigns_test[2] = true;\n  h_accum_or_assigns_test[3] = false;\n\n  for (int i = 0; i < options.dim; i++) {\n    h_vectors_test[2 * options.dim + i] = h_vectors_base[72 * options.dim + i];\n    h_vectors_test[3 * options.dim + i] = h_vectors_base[73 * options.dim + i];\n  }\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  size_t total_size = 0;\n  size_t dump_counter = 0;\n  S global_epoch = 1;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      CUDA_CHECK(\n          cudaMemcpy(d_accum_or_assigns_temp, h_accum_or_assigns_base.data(),\n                     BASE_KEY_NUM * sizeof(uint8_t), cudaMemcpyHostToDevice));\n      S start_ts =\n          (test_util::host_nano<S>(stream) >> RSHIFT_ON_NANO) & 0xFFFFFFFF;\n      table->set_global_epoch(global_epoch);\n      table->accum_or_assign(BASE_KEY_NUM, d_keys_temp, d_vectors_temp,\n                             d_accum_or_assigns_temp, nullptr, stream);\n      
CUDA_CHECK(cudaStreamSynchronize(stream));\n      S end_ts =\n          (test_util::host_nano<S>(stream) >> RSHIFT_ON_NANO) & 0xFFFFFFFF;\n\n      size_t total_size = table->size(stream);\n      size_t expected_size = 0;\n      for (int i = 0; i < BASE_KEY_NUM; i++) {\n        if (!h_accum_or_assigns_base[i]) expected_size++;\n      }\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, expected_size);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, expected_size);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      for (int i = 0; i < dump_counter; i++) {\n        ASSERT_GE(h_scores_temp[i] & 0xFFFFFFFF, start_ts);\n        ASSERT_LE(h_scores_temp[i] & 0xFFFFFFFF, end_ts);\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n\n    {\n      global_epoch++;\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),\n                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),\n                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),\n                            TEST_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n    
  CUDA_CHECK(\n          cudaMemcpy(d_accum_or_assigns_temp, h_accum_or_assigns_test.data(),\n                     TEST_KEY_NUM * sizeof(bool), cudaMemcpyHostToDevice));\n      S start_ts =\n          (test_util::host_nano<S>(stream) >> RSHIFT_ON_NANO) & 0xFFFFFFFF;\n      table->set_global_epoch(global_epoch);\n      table->accum_or_assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp,\n                             d_accum_or_assigns_temp, nullptr, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      S end_ts =\n          (test_util::host_nano<S>(stream) >> RSHIFT_ON_NANO) & 0xFFFFFFFF;\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t expected_size = 0;\n      for (int i = 0; i < BASE_KEY_NUM; i++) {\n        if (!h_accum_or_assigns_base[i]) expected_size++;\n      }\n\n      for (int i = 0; i < TEST_KEY_NUM; i++) {\n        if ((h_keys_base.end() == std::find(h_keys_base.begin(),\n                                            h_keys_base.end(),\n                                            h_keys_test[i])) &&\n            !h_accum_or_assigns_test[i])\n          expected_size++;\n      }\n      ASSERT_EQ(total_size, expected_size);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, expected_size);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            TEMP_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      for (int i = 0; i < dump_counter; i++) {\n        bool is_accum = (h_keys_temp[i] == 
h_keys_test[2]);\n        bool is_new_insert =\n            (h_keys_test.end() !=\n             std::find(h_keys_test.begin(), h_keys_test.end(), h_keys_temp[i]));\n        if (is_accum) {\n          for (int j = 0; j < options.dim; j++) {\n            ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                      static_cast<float>(h_keys_temp[i] * 0.00002));\n          }\n        } else {\n          for (int j = 0; j < options.dim; j++) {\n            ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                      static_cast<float>(h_keys_temp[i] * 0.00001));\n          }\n        }\n        if (is_accum || (is_new_insert && (h_keys_temp[i] != h_keys_test[3]))) {\n          ASSERT_GE(h_scores_temp[i] & 0xffffffff, start_ts);\n          ASSERT_LE(h_scores_temp[i] & 0xffffffff, end_ts);\n          ASSERT_EQ(h_scores_temp[i] >> 32 & 0xffffffff, global_epoch);\n        } else {\n          ASSERT_LE(h_scores_temp[i] & 0xffffffff, start_ts);\n          ASSERT_EQ(h_scores_temp[i] >> 32 & 0xffffffff, global_epoch - 1);\n        }\n      }\n    }\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFree(d_keys_temp));\n  CUDA_CHECK(cudaFree(d_scores_temp));\n  CUDA_CHECK(cudaFree(d_vectors_temp));\n  CUDA_CHECK(cudaFree(d_accum_or_assigns_temp));\n\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors,\n                                        int key_start = 0) {\n  constexpr uint64_t BUCKET_NUM = 8UL;\n  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;\n  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;\n  constexpr uint64_t TEST_KEY_NUM = 4;\n  constexpr uint64_t TEMP_KEY_NUM =\n      (BASE_KEY_NUM > TEST_KEY_NUM) ? 
BASE_KEY_NUM : TEST_KEY_NUM;\n  constexpr uint64_t TEST_TIMES = 1024;\n  constexpr float true_ratio = 0.5;\n\n  TableOptions options;\n\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kEpochLfu>;\n\n  std::vector<K> h_keys_base(BASE_KEY_NUM);\n  std::vector<S> h_scores_base(BASE_KEY_NUM);\n  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);\n  std::vector<uint8_t> h_accum_or_assigns_base(BASE_KEY_NUM);\n\n  std::vector<K> h_keys_test(TEST_KEY_NUM);\n  std::vector<S> h_scores_test(TEST_KEY_NUM);\n  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);\n  std::vector<uint8_t> h_accum_or_assigns_test(TEST_KEY_NUM);\n\n  std::vector<K> h_keys_temp(TEMP_KEY_NUM);\n  std::vector<S> h_scores_temp(TEMP_KEY_NUM);\n  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);\n\n  K* d_keys_temp;\n  S* d_scores_temp = nullptr;\n  V* d_vectors_temp;\n  bool* d_accum_or_assigns_temp;\n\n  int freq_range = 1000;\n\n  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));\n  CUDA_CHECK(\n      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));\n\n  CUDA_CHECK(cudaMalloc(&d_accum_or_assigns_temp, TEMP_KEY_NUM * sizeof(bool)));\n\n  test_util::create_random_bools<K>(\n      reinterpret_cast<bool*>(h_accum_or_assigns_base.data()), BASE_KEY_NUM,\n      true_ratio);\n\n  test_util::create_random_bools<K>(\n      reinterpret_cast<bool*>(h_accum_or_assigns_test.data()), TEST_KEY_NUM,\n      true_ratio);\n\n  test_util::create_keys_in_one_buckets_lfu<K, S, V, DIM>(\n      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),\n      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF,\n      freq_range);\n\n  
test_util::create_keys_in_one_buckets_lfu<K, S, V, DIM>(\n      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),\n      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,\n      0xFFFFFFFFFFFFFFFD, freq_range);\n\n  h_accum_or_assigns_base[71] = false;\n  h_accum_or_assigns_base[72] = false;\n  h_accum_or_assigns_base[73] = false;\n\n  // Simulate overflow of low 32bits.\n  h_scores_base[71] = static_cast<S>(std::numeric_limits<uint32_t>::max() -\n                                     static_cast<uint32_t>(1));\n\n  h_keys_test[1] = h_keys_base[71];\n  h_keys_test[2] = h_keys_base[72];\n  h_keys_test[3] = h_keys_base[73];\n\n  h_accum_or_assigns_test[1] = true;\n  h_accum_or_assigns_test[2] = true;\n  h_accum_or_assigns_test[3] = false;\n\n  h_scores_test[1] = h_scores_base[71];\n  h_scores_test[2] = h_keys_base[72] % freq_range;\n  h_scores_test[3] = h_keys_base[73] % freq_range;\n\n  for (int i = 0; i < options.dim; i++) {\n    h_vectors_test[1 * options.dim + i] = h_vectors_base[71 * options.dim + i];\n    h_vectors_test[2 * options.dim + i] = h_vectors_base[72 * options.dim + i];\n    h_vectors_test[3 * options.dim + i] = h_vectors_base[73 * options.dim + i];\n  }\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  size_t total_size = 0;\n  size_t dump_counter = 0;\n  S global_epoch = 1;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),\n               
             BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      CUDA_CHECK(\n          cudaMemcpy(d_accum_or_assigns_temp, h_accum_or_assigns_base.data(),\n                     BASE_KEY_NUM * sizeof(uint8_t), cudaMemcpyHostToDevice));\n\n      table->set_global_epoch(global_epoch);\n      table->accum_or_assign(BASE_KEY_NUM, d_keys_temp, d_vectors_temp,\n                             d_accum_or_assigns_temp, d_scores_temp, stream);\n\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      size_t expected_size = 0;\n      for (int i = 0; i < BASE_KEY_NUM; i++) {\n        if (!h_accum_or_assigns_base[i]) expected_size++;\n      }\n\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, expected_size);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, expected_size);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      for (int i = 0; i < dump_counter; i++) {\n        if (h_keys_temp[i] == h_keys_base[71]) {\n          S expected_score = test_util::make_expected_score_for_epochlfu<S>(\n              global_epoch, h_scores_base[71]);\n          ASSERT_EQ(h_scores_temp[i], expected_score);\n        } else {\n          S expected_score = test_util::make_expected_score_for_epochlfu<S>(\n              global_epoch, (h_keys_temp[i] % freq_range));\n          ASSERT_EQ(h_scores_temp[i], 
expected_score);\n        }\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n\n    {\n      global_epoch++;\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),\n                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),\n                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),\n                            TEST_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      CUDA_CHECK(\n          cudaMemcpy(d_accum_or_assigns_temp, h_accum_or_assigns_test.data(),\n                     TEST_KEY_NUM * sizeof(bool), cudaMemcpyHostToDevice));\n\n      table->set_global_epoch(global_epoch);\n      table->accum_or_assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp,\n                             d_accum_or_assigns_temp, d_scores_temp, stream);\n\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      size_t expected_size = 0;\n      for (int i = 0; i < BASE_KEY_NUM; i++) {\n        if (!h_accum_or_assigns_base[i]) expected_size++;\n      }\n\n      for (int i = 0; i < TEST_KEY_NUM; i++) {\n        if ((h_keys_base.end() == std::find(h_keys_base.begin(),\n                                            h_keys_base.end(),\n                                            h_keys_test[i])) &&\n            !h_accum_or_assigns_test[i])\n          expected_size++;\n      }\n      ASSERT_EQ(total_size, expected_size);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, 
expected_size);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            TEMP_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      ASSERT_TRUE(h_keys_temp.end() != std::find(h_keys_temp.begin(),\n                                                 h_keys_temp.end(),\n                                                 h_keys_base[71]));\n\n      for (int i = 0; i < dump_counter; i++) {\n        bool in_base =\n            h_keys_base.end() !=\n            std::find(h_keys_base.begin(), h_keys_base.end(), h_keys_temp[i]);\n\n        bool is_accum = (h_keys_temp[i] == h_keys_test[1] ||\n                         h_keys_temp[i] == h_keys_test[2]);\n        bool is_new_insert =\n            (h_keys_test.end() !=\n             std::find(h_keys_test.begin(), h_keys_test.end(), h_keys_temp[i]));\n\n        if (is_accum) {\n          if (h_keys_temp[i] == h_keys_base[71]) {\n            S expected_score = test_util::make_expected_score_for_epochlfu<S>(\n                global_epoch, h_scores_base[71] * 2);\n            ASSERT_EQ(h_scores_temp[i], expected_score);\n          } else {\n            S expected_score = test_util::make_expected_score_for_epochlfu<S>(\n                global_epoch, (h_keys_temp[i] % freq_range) * 2);\n            ASSERT_EQ(h_scores_temp[i], expected_score);\n          }\n        } else {\n          if (h_keys_temp[i] == h_keys_base[71]) {\n            S expected_score = test_util::make_expected_score_for_epochlfu<S>(\n                global_epoch, h_scores_base[71] * 2);\n            ASSERT_EQ(h_scores_temp[i], expected_score);\n          } else {\n            S expected_score = 
test_util::make_expected_score_for_epochlfu<S>(\n                global_epoch - static_cast<S>(in_base),\n                (h_keys_temp[i] % freq_range));\n\n            ASSERT_EQ(h_scores_temp[i], expected_score);\n          }\n        }\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] *\n                                       (is_accum ? 0.00002 : 0.00001)))\n              << \",i=\" << i << \",is_accum=\" << is_accum;\n        }\n      }\n    }\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFree(d_keys_temp));\n  CUDA_CHECK(cudaFree(d_scores_temp));\n  CUDA_CHECK(cudaFree(d_vectors_temp));\n  CUDA_CHECK(cudaFree(d_accum_or_assigns_temp));\n\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_evict_strategy_customized_basic(size_t max_hbm_for_vectors,\n                                          int key_start = 0) {\n  constexpr uint64_t BUCKET_NUM = 8UL;\n  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;\n  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;\n  constexpr uint64_t TEST_KEY_NUM = 128;\n  constexpr uint64_t TEMP_KEY_NUM =\n      (BASE_KEY_NUM > TEST_KEY_NUM) ? 
BASE_KEY_NUM : TEST_KEY_NUM;\n  constexpr uint64_t TEST_TIMES = 128;\n  constexpr float true_ratio = 0.3;\n\n  TableOptions options;\n\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  std::vector<K> h_keys_base(BASE_KEY_NUM);\n  std::vector<S> h_scores_base(BASE_KEY_NUM);\n  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);\n  std::vector<uint8_t> h_accum_or_assigns_base(BASE_KEY_NUM);\n\n  std::vector<K> h_keys_test(TEST_KEY_NUM);\n  std::vector<S> h_scores_test(TEST_KEY_NUM);\n  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);\n  std::vector<uint8_t> h_accum_or_assigns_test(TEST_KEY_NUM);\n\n  std::vector<K> h_keys_temp(TEMP_KEY_NUM);\n  std::vector<S> h_scores_temp(TEMP_KEY_NUM);\n  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);\n  std::vector<uint8_t> h_found_temp(TEMP_KEY_NUM);\n\n  K* d_keys_temp;\n  S* d_scores_temp = nullptr;\n  V* d_vectors_temp;\n  bool* d_accum_or_assigns_temp;\n  bool* d_found_temp;\n\n  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));\n  CUDA_CHECK(\n      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));\n\n  CUDA_CHECK(cudaMalloc(&d_accum_or_assigns_temp, TEMP_KEY_NUM * sizeof(bool)));\n  CUDA_CHECK(cudaMalloc(&d_found_temp, TEMP_KEY_NUM * sizeof(bool)));\n\n  test_util::create_random_bools<K>(\n      reinterpret_cast<bool*>(h_accum_or_assigns_base.data()), BASE_KEY_NUM,\n      true_ratio);\n  test_util::create_random_bools<K>(\n      reinterpret_cast<bool*>(h_accum_or_assigns_test.data()), TEST_KEY_NUM,\n      true_ratio);\n\n  test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),\n      BASE_KEY_NUM, 
INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF);\n\n  const S base_score_start = 1000;\n  for (int i = 0; i < BASE_KEY_NUM; i++) {\n    h_scores_base[i] = base_score_start + i;\n  }\n\n  test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),\n      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,\n      0xFFFFFFFFFFFFFFFD);\n  const S test_score_start = base_score_start + BASE_KEY_NUM;\n  for (int i = 0; i < TEST_KEY_NUM; i++) {\n    h_scores_test[i] = test_score_start + i;\n  }\n  for (int i = 64; i < TEST_KEY_NUM; i++) {\n    h_keys_test[i] = h_keys_base[i];\n    //    h_scores_test[i] = h_scores_base[i];\n    for (int j = 0; j < options.dim; j++) {\n      h_vectors_test[i * options.dim + j] = h_vectors_base[i * options.dim + j];\n    }\n  }\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  size_t total_size = 0;\n  size_t dump_counter = 0;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      CUDA_CHECK(\n          cudaMemcpy(d_accum_or_assigns_temp, h_accum_or_assigns_base.data(),\n                     BASE_KEY_NUM * sizeof(uint8_t), cudaMemcpyHostToDevice));\n      table->accum_or_assign(BASE_KEY_NUM, d_keys_temp, d_vectors_temp,\n                           
  d_accum_or_assigns_temp, d_scores_temp, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      size_t expected_size = 0;\n      for (int i = 0; i < BASE_KEY_NUM; i++) {\n        if (!h_accum_or_assigns_base[i]) expected_size++;\n      }\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, expected_size);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, expected_size);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      for (int i = 0; i < dump_counter; i++) {\n        S expected_score = 0ul;\n        bool is_accum = false;\n        for (int j = 0; j < BASE_KEY_NUM; j++) {\n          if (h_keys_base[j] == h_keys_temp[i]) {\n            expected_score = h_scores_base[j];\n            is_accum = h_accum_or_assigns_base[j];\n          }\n        }\n        ASSERT_FALSE(is_accum);\n        ASSERT_EQ(expected_score, h_scores_temp[i]);\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),\n                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),\n                            TEST_KEY_NUM * sizeof(S), 
cudaMemcpyHostToDevice));\n      CUDA_CHECK(\n          cudaMemcpy(d_accum_or_assigns_temp, h_accum_or_assigns_test.data(),\n                     TEST_KEY_NUM * sizeof(bool), cudaMemcpyHostToDevice));\n      table->find(TEST_KEY_NUM, d_keys_temp, d_vectors_temp, d_found_temp,\n                  nullptr, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      CUDA_CHECK(cudaMemcpy(h_found_temp.data(), d_found_temp,\n                            TEST_KEY_NUM * sizeof(bool),\n                            cudaMemcpyDeviceToHost));\n\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),\n                            TEST_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      table->accum_or_assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp,\n                             d_accum_or_assigns_temp, d_scores_temp, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t expected_size = 0;\n      for (int i = 0; i < BASE_KEY_NUM; i++) {\n        if (!h_accum_or_assigns_base[i]) expected_size++;\n      }\n\n      for (int i = 0; i < TEST_KEY_NUM; i++) {\n        if ((h_keys_base.end() == std::find(h_keys_base.begin(),\n                                            h_keys_base.end(),\n                                            h_keys_test[i])) &&\n            !h_accum_or_assigns_test[i])\n          expected_size++;\n      }\n      expected_size = std::min(expected_size, BUCKET_MAX_SIZE);\n\n      // Some base keys may be evicted during this operation, which allows\n      // the same key to be inserted again when the test batch carries it with\n      // the `assign` flag.\n      ASSERT_GE(total_size, expected_size);\n      ASSERT_LE(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      
ASSERT_EQ(dump_counter, total_size);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            TEMP_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      for (int i = 0; i < dump_counter; i++) {\n        S expected_score = 888ul;\n        S base_score = 888ul;\n        S test_score = 888ul;\n        bool is_accum_test = false;\n        bool is_found_on_base = false;\n        for (int j = 0; j < BASE_KEY_NUM; j++) {\n          if (h_keys_base[j] == h_keys_temp[i]) {\n            is_found_on_base = !h_accum_or_assigns_base[j];\n            base_score = h_scores_base[j];\n            break;\n          }\n        }\n        for (int j = 0; j < TEST_KEY_NUM; j++) {\n          if (h_keys_test[j] == h_keys_temp[i]) {\n            is_accum_test = h_accum_or_assigns_test[j];\n            test_score = h_scores_test[j];\n            break;\n          }\n        }\n        if (is_found_on_base && is_accum_test) expected_score = test_score;\n        if (is_found_on_base && !is_accum_test) expected_score = base_score;\n        if (!is_found_on_base && is_accum_test) assert(false);\n        if (!is_found_on_base && !is_accum_test) expected_score = test_score;\n\n        // Some base keys may be evicted during this operation, which allows\n        // the same key to be inserted again when the test batch carries it with\n        // the `assign` flag.\n        ASSERT_EQ(expected_score, h_scores_temp[i])\n            << \" \" << is_found_on_base << \" \" << is_accum_test << \" \"\n            << base_score << \" \" << test_score;\n        if (is_found_on_base && is_accum_test) {\n          for (int j = 0; j < options.dim; j++) {\n            ASSERT_EQ(h_vectors_temp[i * 
options.dim + j],\n                      static_cast<float>(h_keys_temp[i] * 0.00002));\n          }\n        } else {\n          for (int j = 0; j < options.dim; j++) {\n            ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                      static_cast<float>(h_keys_temp[i] * 0.00001));\n          }\n        }\n      }\n    }\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFree(d_keys_temp));\n  CUDA_CHECK(cudaFree(d_scores_temp));\n  CUDA_CHECK(cudaFree(d_vectors_temp));\n  CUDA_CHECK(cudaFree(d_accum_or_assigns_temp));\n  CUDA_CHECK(cudaFree(d_found_temp));\n\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_evict_strategy_customized_advanced(size_t max_hbm_for_vectors,\n                                             int key_start = 0) {\n  constexpr uint64_t BUCKET_NUM = 8UL;\n  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;\n  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;\n  constexpr uint64_t TEST_KEY_NUM = 8;\n  constexpr uint64_t TEMP_KEY_NUM =\n      (BASE_KEY_NUM > TEST_KEY_NUM) ? 
BASE_KEY_NUM : TEST_KEY_NUM;\n  constexpr uint64_t TEST_TIMES = 256;\n  constexpr float base_true_ratio = 0.0f;\n  constexpr float test_true_ratio = 0.5f;\n\n  TableOptions options;\n\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  std::vector<K> h_keys_base(BASE_KEY_NUM);\n  std::vector<S> h_scores_base(BASE_KEY_NUM);\n  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);\n  std::vector<uint8_t> h_accum_or_assigns_base(BASE_KEY_NUM);\n\n  std::vector<K> h_keys_test(TEST_KEY_NUM);\n  std::vector<S> h_scores_test(TEST_KEY_NUM);\n  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);\n  std::vector<uint8_t> h_accum_or_assigns_test(TEST_KEY_NUM);\n\n  std::vector<K> h_keys_temp(TEMP_KEY_NUM);\n  std::vector<S> h_scores_temp(TEMP_KEY_NUM);\n  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);\n\n  K* d_keys_temp;\n  S* d_scores_temp = nullptr;\n  V* d_vectors_temp;\n  bool* d_accum_or_assigns_temp;\n\n  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));\n  CUDA_CHECK(\n      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_accum_or_assigns_temp, TEMP_KEY_NUM * sizeof(bool)));\n\n  test_util::create_random_bools<K>(\n      reinterpret_cast<bool*>(h_accum_or_assigns_base.data()), BASE_KEY_NUM,\n      base_true_ratio);\n  test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),\n      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF);\n\n  const S base_score_start = 1000;\n  for (int i = 0; i < BASE_KEY_NUM; i++) {\n    h_scores_base[i] = base_score_start + i;\n  }\n\n  test_util::create_random_bools<K>(\n   
   reinterpret_cast<bool*>(h_accum_or_assigns_test.data()), TEST_KEY_NUM,\n      test_true_ratio);\n  test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),\n      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,\n      0xFFFFFFFFFFFFFFFD);\n\n  h_keys_test[4] = h_keys_base[72];\n  h_keys_test[5] = h_keys_base[73];\n  h_keys_test[6] = h_keys_base[74];\n  h_keys_test[7] = h_keys_base[75];\n\n  h_accum_or_assigns_base[72] = false;\n  h_accum_or_assigns_base[73] = false;\n  h_accum_or_assigns_base[74] = false;\n  h_accum_or_assigns_base[75] = false;\n  // Give the four new keys scores lower than every base score, so they\n  // will not be inserted.\n  h_scores_test[0] = 20;\n  h_scores_test[1] = 78;\n  h_scores_test[2] = 97;\n  h_scores_test[3] = 98;\n\n  // Give existing keys new scores. Keys 4-6 carry the accum flag, so their\n  // scores are refreshed; key 7 carries the assign flag and is expected not\n  // to take its new score.\n  h_scores_test[4] = 99;\n  h_scores_test[5] = 1010;\n  h_scores_test[6] = 1020;\n  h_scores_test[7] = 1035;\n\n  h_accum_or_assigns_test[0] = false;\n  h_accum_or_assigns_test[1] = false;\n  h_accum_or_assigns_test[2] = false;\n  h_accum_or_assigns_test[3] = false;\n\n  h_accum_or_assigns_test[4] = true;\n  h_accum_or_assigns_test[5] = true;\n  h_accum_or_assigns_test[6] = true;\n  h_accum_or_assigns_test[7] = false;\n\n  for (int i = 4; i < TEST_KEY_NUM; i++) {\n    for (int j = 0; j < options.dim; j++) {\n      h_vectors_test[i * options.dim + j] =\n          static_cast<V>(h_keys_test[i] * 0.00001);\n    }\n  }\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  size_t total_size = 0;\n  size_t dump_counter = 0;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),\n                            
BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      CUDA_CHECK(\n          cudaMemcpy(d_accum_or_assigns_temp, h_accum_or_assigns_base.data(),\n                     BASE_KEY_NUM * sizeof(uint8_t), cudaMemcpyHostToDevice));\n      table->accum_or_assign(BASE_KEY_NUM, d_keys_temp, d_vectors_temp,\n                             d_accum_or_assigns_temp, d_scores_temp, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t expected_size = 0;\n      for (const auto accum : h_accum_or_assigns_base) {\n        if (!accum) expected_size++;\n      }\n\n      ASSERT_EQ(total_size, expected_size);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, expected_size);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      std::vector<S> h_scores_temp_sorted(h_scores_temp);\n      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());\n\n      for (int i = 0; i < dump_counter; i++) {\n        S expected_score = 0ul;\n        for (int j = 0; j < BASE_KEY_NUM; 
j++) {\n          if (h_keys_temp[i] == h_keys_base[j]) {\n            expected_score = h_scores_base[j];\n            break;\n          }\n        }\n        ASSERT_EQ(h_scores_temp[i], expected_score);\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n\n    {\n      std::unordered_map<K, bool> base_found_map;\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),\n                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),\n                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(\n          cudaMemcpy(d_accum_or_assigns_temp, h_accum_or_assigns_test.data(),\n                     TEST_KEY_NUM * sizeof(bool), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),\n                            TEST_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n\n      table->accum_or_assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp,\n                             d_accum_or_assigns_temp, d_scores_temp, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t expected_size = 0;\n      for (const auto accum : h_accum_or_assigns_base) {\n        if (!accum) expected_size++;\n      }\n      expected_size = std::max(expected_size, BUCKET_MAX_SIZE);\n      ASSERT_EQ(total_size, expected_size);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, expected_size);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            TEMP_KEY_NUM * 
sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            TEMP_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      for (int i = 0; i < TEST_KEY_NUM; i++) {\n        if (i < 4) {\n          ASSERT_EQ(h_keys_temp.end(),\n                    std::find(h_keys_temp.begin(), h_keys_temp.end(),\n                              h_keys_test[i]));\n        } else {\n          ASSERT_NE(h_keys_temp.end(),\n                    std::find(h_keys_temp.begin(), h_keys_temp.end(),\n                              h_keys_test[i]));\n        }\n      }\n      for (int i = 0; i < TEMP_KEY_NUM; i++) {\n        if (h_keys_temp[i] == h_keys_test[4])\n          ASSERT_EQ(h_scores_temp[i], h_scores_test[4]);\n        if (h_keys_temp[i] == h_keys_test[5])\n          ASSERT_EQ(h_scores_temp[i], h_scores_test[5]);\n        if (h_keys_temp[i] == h_keys_test[6])\n          ASSERT_EQ(h_scores_temp[i], h_scores_test[6]);\n        if (h_keys_temp[i] == h_keys_test[7])\n          ASSERT_NE(h_scores_temp[i], h_scores_test[7]);\n        bool is_accum =\n            (h_keys_temp[i] != h_keys_test[7]) &&\n            (h_keys_test.end() != std::find(h_keys_test.begin() + 4,\n                                            h_keys_test.end(), h_keys_temp[i]));\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] *\n                                       (is_accum ? 
0.00002 : 0.00001)));\n        }\n      }\n    }\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFree(d_keys_temp));\n  CUDA_CHECK(cudaFree(d_scores_temp));\n  CUDA_CHECK(cudaFree(d_vectors_temp));\n  CUDA_CHECK(cudaFree(d_accum_or_assigns_temp));\n\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_evict_strategy_customized_correct_rate(size_t max_hbm_for_vectors,\n                                                 int key_start = 0) {\n  constexpr uint64_t BATCH_SIZE = 1024 * 1024ul;\n  constexpr uint64_t STEPS = 128;\n  constexpr uint64_t MAX_BUCKET_SIZE = 128;\n  constexpr uint64_t INIT_CAPACITY = BATCH_SIZE * STEPS;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t TEST_TIMES = 1;\n  float expected_correct_rate = 0.964;\n  const int rounds = 3;\n  constexpr float true_ratio = 0.0;\n\n  TableOptions options;\n\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_bucket_size = MAX_BUCKET_SIZE;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  K* h_keys_base = test_util::HostBuffer<K>(BATCH_SIZE).ptr();\n  S* h_scores_base = test_util::HostBuffer<S>(BATCH_SIZE).ptr();\n  V* h_vectors_base = test_util::HostBuffer<V>(BATCH_SIZE * options.dim).ptr();\n  bool* h_accum_or_assigns_base = test_util::HostBuffer<bool>(BATCH_SIZE).ptr();\n\n  K* h_keys_temp = test_util::HostBuffer<K>(MAX_CAPACITY).ptr();\n  S* h_scores_temp = test_util::HostBuffer<S>(MAX_CAPACITY).ptr();\n  V* h_vectors_temp =\n      test_util::HostBuffer<V>(MAX_CAPACITY * options.dim).ptr();\n\n  K* d_keys_temp;\n  S* d_scores_temp = nullptr;\n  V* d_vectors_temp;\n  bool* d_accum_or_assigns_temp;\n\n  CUDA_CHECK(cudaMalloc(&d_keys_temp, MAX_CAPACITY * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores_temp, MAX_CAPACITY * 
sizeof(S)));\n  CUDA_CHECK(\n      cudaMalloc(&d_vectors_temp, MAX_CAPACITY * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_accum_or_assigns_temp, MAX_CAPACITY * sizeof(bool)));\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  size_t total_size = 0;\n  size_t global_start_key = 100000;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n    size_t start_key = global_start_key;\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    for (int r = 0; r < rounds; r++) {\n      size_t expected_min_key = global_start_key + INIT_CAPACITY * r;\n      size_t expected_max_key = global_start_key + INIT_CAPACITY * (r + 1) - 1;\n      size_t expected_table_size =\n          (r == 0) ? size_t(expected_correct_rate * INIT_CAPACITY)\n                   : INIT_CAPACITY;\n\n      for (int s = 0; s < STEPS; s++) {\n        test_util::create_random_bools<K>(h_accum_or_assigns_base, BATCH_SIZE,\n                                          true_ratio);\n        test_util::create_continuous_keys<K, S, V, DIM>(\n            h_keys_base, h_scores_base, h_vectors_base, BATCH_SIZE, start_key);\n        start_key += BATCH_SIZE;\n\n        CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base, BATCH_SIZE * sizeof(K),\n                              cudaMemcpyHostToDevice));\n        CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base,\n                              BATCH_SIZE * sizeof(S), cudaMemcpyHostToDevice));\n        CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base,\n                              BATCH_SIZE * sizeof(V) * options.dim,\n                              cudaMemcpyHostToDevice));\n        CUDA_CHECK(cudaMemcpy(d_accum_or_assigns_temp, h_accum_or_assigns_base,\n                              BATCH_SIZE * sizeof(bool),\n                              cudaMemcpyHostToDevice));\n        
table->accum_or_assign(BATCH_SIZE, d_keys_temp, d_vectors_temp,\n                               d_accum_or_assigns_temp, d_scores_temp, stream);\n        CUDA_CHECK(cudaStreamSynchronize(stream));\n      }\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_GE(total_size, expected_table_size);\n      ASSERT_EQ(MAX_CAPACITY, table->capacity());\n\n      size_t dump_counter = table->export_batch(\n          MAX_CAPACITY, 0, d_keys_temp, d_vectors_temp, d_scores_temp, stream);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp, d_keys_temp, MAX_CAPACITY * sizeof(K),\n                            cudaMemcpyDeviceToHost));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp, d_scores_temp,\n                            MAX_CAPACITY * sizeof(S), cudaMemcpyDeviceToHost));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp, d_vectors_temp,\n                            MAX_CAPACITY * sizeof(V) * options.dim,\n                            cudaMemcpyDeviceToHost));\n\n      size_t bigger_score_counter = 0;\n      K max_key = 0;\n      size_t values_error_counter = 0;\n      for (int i = 0; i < dump_counter; i++) {\n        ASSERT_EQ(h_keys_temp[i], h_scores_temp[i]);\n        max_key = std::max(max_key, h_keys_temp[i]);\n        if (h_scores_temp[i] >= expected_min_key) bigger_score_counter++;\n        for (int j = 0; j < options.dim; j++) {\n          if (h_vectors_temp[i * options.dim + j] !=\n              static_cast<float>(h_keys_temp[i] * 0.00001)) {\n            values_error_counter++;\n          }\n        }\n      }\n\n      ASSERT_EQ(values_error_counter, 0);\n      float correct_rate = (bigger_score_counter * 1.0) / MAX_CAPACITY;\n      std::cout << std::setprecision(3) << \"[Round \" << r << \"]\"\n                << \"correct_rate=\" << correct_rate << std::endl;\n      ASSERT_GE(max_key, expected_max_key);\n      ASSERT_GE(correct_rate, expected_correct_rate);\n    }\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  
CUDA_CHECK(cudaFree(d_keys_temp));\n  CUDA_CHECK(cudaFree(d_scores_temp));\n  CUDA_CHECK(cudaFree(d_vectors_temp));\n  CUDA_CHECK(cudaFree(d_accum_or_assigns_temp));\n\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_dynamic_rehash_on_multi_threads(size_t max_hbm_for_vectors,\n                                          int key_start = 0) {\n  constexpr uint64_t BUCKET_MAX_SIZE = 128ul;\n  constexpr uint64_t INIT_CAPACITY = 4 * 1024 - BUCKET_MAX_SIZE - 1;\n  constexpr uint64_t MAX_CAPACITY = 16 * 1024 * INIT_CAPACITY;\n  constexpr uint64_t KEY_NUM = 256;\n  constexpr uint64_t THREAD_N = 8;\n\n  std::vector<std::thread> threads;\n\n  TableOptions options;\n\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_load_factor = 0.50f;\n  options.max_bucket_size = BUCKET_MAX_SIZE;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;\n\n  std::shared_ptr<Table> table = std::make_shared<Table>();\n  table->init(options);\n  ASSERT_EQ(table->bucket_count(), 32);\n\n  auto worker_function = [&table, KEY_NUM, options](int task_n) {\n    constexpr float true_ratio = 0.5;\n    K* h_keys;\n    V* h_vectors;\n    bool* h_found;\n    bool* h_accum_or_assigns;\n    K* h_keys_temp;\n    V* h_vectors_temp;\n    bool* h_found_temp;\n\n    size_t current_capacity = table->capacity();\n\n    CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));\n    CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMallocHost(&h_keys_temp, KEY_NUM * sizeof(K)));\n    CUDA_CHECK(\n        cudaMallocHost(&h_vectors_temp, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMallocHost(&h_found_temp, KEY_NUM * sizeof(bool)));\n    CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));\n    
CUDA_CHECK(cudaMallocHost(&h_accum_or_assigns, KEY_NUM * sizeof(bool)));\n\n    K* d_keys;\n    V* d_vectors;\n    bool* d_found;\n    bool* d_accum_or_assigns;\n\n    CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));\n    CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n    CUDA_CHECK(cudaMalloc(&d_accum_or_assigns, KEY_NUM * sizeof(bool)));\n\n    cudaStream_t stream;\n    CUDA_CHECK(cudaStreamCreate(&stream));\n\n    while (table->capacity() * 2 < MAX_CAPACITY) {\n      test_util::create_random_bools<K>(h_accum_or_assigns, KEY_NUM,\n                                        true_ratio);\n\n      test_util::create_random_keys<K, S, V, DIM>(h_keys, nullptr, h_vectors,\n                                                  KEY_NUM);\n      CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),\n                            cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,\n                            KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_accum_or_assigns, h_accum_or_assigns,\n                            KEY_NUM * sizeof(bool), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n\n      table->find(KEY_NUM, d_keys, d_vectors, d_found, nullptr, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp, d_keys, KEY_NUM * sizeof(K),\n                            cudaMemcpyDeviceToHost));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp, d_vectors,\n                            KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDeviceToHost));\n      CUDA_CHECK(cudaMemcpy(h_found_temp, d_found, KEY_NUM * sizeof(bool),\n                            cudaMemcpyDeviceToHost));\n\n      table->accum_or_assign(KEY_NUM, d_keys, d_vectors, d_accum_or_assigns,\n                          
   nullptr, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n      table->find(KEY_NUM, d_keys, d_vectors, d_found, nullptr, stream);\n\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      int found_num = 0;\n      size_t expected_size = 0;\n      for (int i = 0; i < KEY_NUM; i++) {\n        if (h_found_temp[i] || !h_accum_or_assigns[i]) expected_size++;\n      }\n\n      CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));\n      CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n      CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),\n                            cudaMemcpyDeviceToHost));\n      CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                            cudaMemcpyDeviceToHost));\n\n      CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                            KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDeviceToHost));\n      for (int i = 0; i < KEY_NUM; i++) {\n        if (h_found[i]) {\n          found_num++;\n          for (int j = 0; j < options.dim; j++) {\n            if (h_found_temp[i] && h_accum_or_assigns[i]) {\n              ASSERT_EQ(h_vectors[i * options.dim + j],\n                        h_vectors_temp[i * options.dim + j] +\n                            static_cast<float>(h_keys[i] * 0.00001));\n            } else {\n              ASSERT_EQ(h_vectors[i * options.dim + j],\n                        static_cast<float>(h_keys[i] * 0.00001));\n            }\n          }\n        }\n      }\n      ASSERT_EQ(found_num, expected_size);\n      if (task_n == 0 && current_capacity != table->capacity()) {\n        std::cout << \"[test_dynamic_rehash_on_multi_threads] The capacity \"\n                     \"changed from \"\n                  << current_capacity << \" to \" << table->capacity()\n                  << std::endl;\n        current_capacity = 
table->capacity();\n      }\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n    }\n    CUDA_CHECK(cudaStreamDestroy(stream));\n\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n\n    CUDA_CHECK(cudaFreeHost(h_keys));\n    CUDA_CHECK(cudaFreeHost(h_found));\n    CUDA_CHECK(cudaFreeHost(h_vectors));\n    CUDA_CHECK(cudaFreeHost(h_keys_temp));\n    CUDA_CHECK(cudaFreeHost(h_found_temp));\n    CUDA_CHECK(cudaFreeHost(h_vectors_temp));\n    CUDA_CHECK(cudaFreeHost(h_accum_or_assigns));\n\n    CUDA_CHECK(cudaFree(d_keys));\n    CUDA_CHECK(cudaFree(d_vectors));\n    CUDA_CHECK(cudaFree(d_found));\n    CUDA_CHECK(cudaFree(d_accum_or_assigns));\n    CUDA_CHECK(cudaDeviceSynchronize());\n\n    CudaCheckError();\n  };\n\n  for (int i = 0; i < THREAD_N; ++i)\n    threads.emplace_back(std::thread(worker_function, i));\n\n  for (auto& th : threads) {\n    th.join();\n  }\n  ASSERT_GE(table->capacity() * 2, MAX_CAPACITY);\n}\n\ntemplate <typename K, typename V, typename S, typename Table, size_t dim = 64>\nvoid CheckAccumOrAssignValues(Table* table,\n                              test_util::KVMSBuffer<K, V, S>& data_buffer,\n                              size_t len, cudaStream_t stream) {\n  std::map<K, test_util::ValueArray<V, dim>> map_before_insert;\n  std::map<K, test_util::ValueArray<V, dim>> map_after_insert;\n  std::unordered_set<K> map_current_batch;\n\n  K* keys = data_buffer.keys_ptr();\n  V* values = data_buffer.values_ptr();\n  S* scores = data_buffer.scores_ptr();\n\n  for (int i = 0; i < len; i++) {\n    map_current_batch.insert(data_buffer.keys_ptr(false)[i]);\n  }\n\n  K* h_tmp_keys = nullptr;\n  V* h_tmp_values = nullptr;\n  S* h_tmp_scores = nullptr;\n  bool* h_tmp_accum_or_assigns = nullptr;\n\n  K* d_tmp_keys = nullptr;\n  V* d_tmp_values = nullptr;\n  S* d_tmp_scores = nullptr;\n  bool* d_tmp_accum_or_assigns = nullptr;\n\n  size_t 
table_size_before = table->size(stream);\n  size_t cap = table_size_before + len;\n\n  h_tmp_keys = (K*)malloc(cap * sizeof(K));\n  h_tmp_values = (V*)malloc(cap * dim * sizeof(V));\n  h_tmp_scores = (S*)malloc(cap * sizeof(S));\n  h_tmp_accum_or_assigns = (bool*)malloc(cap * sizeof(bool));\n\n  CUDA_CHECK(cudaMallocAsync(&d_tmp_keys, cap * sizeof(K), stream));\n  CUDA_CHECK(cudaMemsetAsync(d_tmp_keys, 0, cap * sizeof(K), stream));\n  CUDA_CHECK(cudaMallocAsync(&d_tmp_values, cap * dim * sizeof(V), stream));\n  CUDA_CHECK(cudaMemsetAsync(d_tmp_values, 0, cap * dim * sizeof(V), stream));\n  CUDA_CHECK(cudaMallocAsync(&d_tmp_scores, cap * sizeof(S), stream));\n  CUDA_CHECK(cudaMemsetAsync(d_tmp_scores, 0, cap * sizeof(S), stream));\n  CUDA_CHECK(\n      cudaMallocAsync(&d_tmp_accum_or_assigns, cap * sizeof(bool), stream));\n  CUDA_CHECK(\n      cudaMemsetAsync(d_tmp_accum_or_assigns, 0, cap * sizeof(bool), stream));\n\n  table->find(len, keys, d_tmp_values, d_tmp_accum_or_assigns, nullptr, stream);\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_accum_or_assigns, d_tmp_accum_or_assigns,\n                             len * sizeof(bool), cudaMemcpyDeviceToHost,\n                             stream));\n\n  CUDA_CHECK(cudaMemsetAsync(d_tmp_keys, 0, cap * sizeof(K), stream));\n  CUDA_CHECK(cudaMemsetAsync(d_tmp_values, 0, cap * dim * sizeof(V), stream));\n  CUDA_CHECK(cudaMemsetAsync(d_tmp_scores, 0, cap * sizeof(S), stream));\n\n  size_t table_size_verify0 = table->export_batch(\n      table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);\n  ASSERT_EQ(table_size_before, table_size_verify0);\n\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,\n                             table_size_before * sizeof(K),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,\n                             table_size_before * dim * sizeof(V),\n                             cudaMemcpyDeviceToHost, stream));\n  
CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,\n                             table_size_before * sizeof(S),\n                             cudaMemcpyDeviceToHost, stream));\n\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n\n  for (size_t i = 0; i < table_size_verify0; i++) {\n    test_util::ValueArray<V, dim>* vec =\n        reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +\n                                                         i * dim);\n    map_before_insert[h_tmp_keys[i]] = *vec;\n  }\n\n  auto start = std::chrono::steady_clock::now();\n  table->accum_or_assign(len, keys, values, d_tmp_accum_or_assigns, nullptr,\n                         stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  auto end = std::chrono::steady_clock::now();\n  auto diff = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);\n\n  float dur = diff.count();\n\n  size_t table_size_after = table->size(stream);\n  size_t table_size_verify1 = table->export_batch(\n      table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);\n\n  ASSERT_EQ(table_size_verify1, table_size_after);\n\n  size_t new_cap = table_size_after;\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,\n                             table_size_after * sizeof(K),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,\n                             table_size_after * dim * sizeof(V),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,\n                             table_size_after * sizeof(S),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_accum_or_assigns, d_tmp_accum_or_assigns,\n                             table_size_after * sizeof(bool),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  int64_t new_cap_K = 
(int64_t)new_cap;\n  for (int64_t i = new_cap_K - 1; i >= 0; i--) {\n    test_util::ValueArray<V, dim>* vec =\n        reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +\n                                                         i * dim);\n    map_after_insert[h_tmp_keys[i]] = *vec;\n  }\n\n  size_t value_diff_cnt = 0;\n  for (auto& it : map_after_insert) {\n    test_util::ValueArray<V, dim>& vec = map_after_insert.at(it.first);\n    bool existed =\n        (map_before_insert.end() != map_before_insert.find(it.first));\n    bool required =\n        (map_current_batch.end() != map_current_batch.find(it.first));\n    V expected_value = 0;\n\n    if (existed) {\n      if (required) {\n        expected_value = (map_before_insert.at(it.first)[0] +\n                          static_cast<V>(it.first * 0.00001));\n      } else {\n        expected_value = map_before_insert.at(it.first)[0];\n      }\n    } else {\n      if (required) {\n        expected_value = static_cast<V>(it.first * 0.00001);\n      }\n    }\n    for (size_t j = 0; j < dim; j++) {\n      if (vec[j] != expected_value) {\n        ++value_diff_cnt;\n        break;\n      }\n    }\n  }\n  ASSERT_EQ(value_diff_cnt, 0);\n  std::cout << \"Check accum_or_assign behavior got \"\n            << \"value_diff_cnt: \" << value_diff_cnt\n            << \", while table_size_before: \" << table_size_before\n            << \", while table_size_after: \" << table_size_after\n            << \", while len: \" << len << std::endl;\n\n  CUDA_CHECK(cudaFreeAsync(d_tmp_keys, stream));\n  CUDA_CHECK(cudaFreeAsync(d_tmp_values, stream));\n  CUDA_CHECK(cudaFreeAsync(d_tmp_scores, stream));\n  CUDA_CHECK(cudaFreeAsync(d_tmp_accum_or_assigns, stream));\n  free(h_tmp_keys);\n  free(h_tmp_values);\n  free(h_tmp_scores);\n  free(h_tmp_accum_or_assigns);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n}\n\nvoid test_accum_or_assign_values_check(size_t max_hbm_for_vectors) {\n  const size_t U = 524288;\n  const size_t 
init_capacity = 1024;\n  const size_t B = 524288 + 13;\n  constexpr size_t dim = 64;\n\n  TableOptions opt;\n\n  opt.max_capacity = U;\n  opt.init_capacity = init_capacity;\n  opt.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;\n  opt.dim = dim;\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  std::unique_ptr<Table> table = std::make_unique<Table>();\n  table->init(opt);\n\n  test_util::KVMSBuffer<K, V, S> data_buffer;\n  data_buffer.Reserve(B, dim, stream);\n\n  size_t offset = 0;\n  S score = 0;\n  for (int i = 0; i < 20; i++) {\n    test_util::create_random_keys<K, S, V, dim>(\n        data_buffer.keys_ptr(false), data_buffer.scores_ptr(false),\n        data_buffer.values_ptr(false), (int)B, B * 16);\n    data_buffer.SyncData(true, stream);\n\n    CheckAccumOrAssignValues<K, V, S, Table, dim>(table.get(), data_buffer, B,\n                                                  stream);\n\n    offset += B;\n    score += 1;\n  }\n}\n\nTEST(AccumOrAssignTest, test_export_batch_if) {\n  test_export_batch_if(16, 22);\n  test_export_batch_if(0, 0);\n}\nTEST(AccumOrAssignTest, test_basic_when_full) {\n  test_basic_when_full(16, 2);\n  test_basic_when_full(0, 0);\n}\nTEST(AccumOrAssignTest, test_erase_if_pred) {\n  test_erase_if_pred(16, 0);\n  test_erase_if_pred(0, 5);\n}\nTEST(AccumOrAssignTest, test_rehash) {\n  test_rehash(16, 7);\n  test_rehash(0, 0);\n}\nTEST(AccumOrAssignTest, test_rehash_on_big_batch) {\n  test_rehash_on_big_batch(16, 9);\n  test_rehash_on_big_batch(0, 0);\n}\nTEST(AccumOrAssignTest, test_dynamic_rehash_on_multi_threads) {\n  test_dynamic_rehash_on_multi_threads(16, 56);\n  test_dynamic_rehash_on_multi_threads(0);\n}\nTEST(AccumOrAssignTest, test_evict_strategy_lru_basic) {\n  test_evict_strategy_lru_basic(16);\n  test_evict_strategy_lru_basic(0);\n}\nTEST(AccumOrAssignTest, test_evict_strategy_lfu_basic) {\n  test_evict_strategy_lfu_basic(16, 
3);\n  // TODO(rhdong): Add back when diff error issue fixed in hybrid mode.\n  // test_evict_strategy_lfu_basic(0);\n}\n\nTEST(AccumOrAssignTest, test_evict_strategy_epochlru_basic) {\n  test_evict_strategy_epochlru_basic(16, 33);\n  test_evict_strategy_epochlru_basic(0);\n}\n\nTEST(AccumOrAssignTest, test_evict_strategy_epochlfu_basic) {\n  test_evict_strategy_epochlfu_basic(16);\n  test_evict_strategy_epochlfu_basic(0, 44);\n}\nTEST(AccumOrAssignTest, test_evict_strategy_customized_basic) {\n  test_evict_strategy_customized_basic(16);\n  test_evict_strategy_customized_basic(0, 23);\n}\nTEST(AccumOrAssignTest, test_evict_strategy_customized_advanced) {\n  test_evict_strategy_customized_advanced(16, 16);\n  test_evict_strategy_customized_advanced(0);\n}\nTEST(AccumOrAssignTest, test_evict_strategy_customized_correct_rate) {\n  // TODO(rhdong): remove the skip logic once the blossom CI issue is resolved.\n  const bool skip_hmem_check = (nullptr != std::getenv(\"IS_BLOSSOM_CI\"));\n  test_evict_strategy_customized_correct_rate(16, 61);\n  if (!skip_hmem_check) {\n    test_evict_strategy_customized_correct_rate(0);\n  } else {\n    std::cout << \"The HMEM check is skipped in blossom CI!\" << std::endl;\n  }\n}\n\nTEST(AccumOrAssignTest, test_accum_or_assign_values_check) {\n  test_accum_or_assign_values_check(16);\n  // TODO(rhdong): Add back when diff error issue fixed in hybrid mode.\n  // test_accum_or_assign_values_check(0);\n}"
  },
  {
    "path": "tests/assign_score_test.cc.cu",
    "content": "/*\n * Copyright (c) 2023, NVIDIA CORPORATION.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n/*\n * test APIs: find_or_insert and assign,\n * move insert operation from `insert_or_assign` to `find`.\n */\n\n#include <gtest/gtest.h>\n#include <math.h>\n#include <stdio.h>\n#include <stdlib.h>\n#include <algorithm>\n#include <array>\n#include <iostream>\n#include <thread>\n#include <vector>\n#include \"merlin_hashtable.cuh\"\n#include \"test_util.cuh\"\n\nconstexpr size_t DIM = 16;\nusing K = uint64_t;\nusing V = float;\nusing S = uint64_t;\nusing EvictStrategy = nv::merlin::EvictStrategy;\nusing TableOptions = nv::merlin::HashTableOptions;\n\nvoid test_evict_strategy_lru_basic(size_t max_hbm_for_vectors,\n                                   int key_start = 0) {\n  constexpr uint64_t BUCKET_NUM = 8UL;\n  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;\n  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;\n  constexpr uint64_t TEST_KEY_NUM = 4;\n  constexpr uint64_t TEMP_KEY_NUM =\n      (BASE_KEY_NUM > TEST_KEY_NUM) ? 
BASE_KEY_NUM : TEST_KEY_NUM;\n  constexpr uint64_t TEST_TIMES = 128;\n\n  TableOptions options;\n\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;\n\n  std::vector<K> h_keys_base(BASE_KEY_NUM);\n  std::vector<S> h_scores_base(BASE_KEY_NUM);\n  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_test(TEST_KEY_NUM);\n  std::vector<S> h_scores_test(TEST_KEY_NUM);\n  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_temp(TEMP_KEY_NUM);\n  std::vector<S> h_scores_temp(TEMP_KEY_NUM);\n  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);\n\n  K* d_keys_temp;\n  S* d_scores_temp = nullptr;\n  V* d_vectors_temp;\n\n  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));\n  CUDA_CHECK(\n      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));\n\n  test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),\n      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF);\n\n  test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),\n      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,\n      0xFFFFFFFFFFFFFFFD);\n\n  h_keys_test[2] = h_keys_base[72];\n  h_keys_test[3] = h_keys_base[73];\n\n  for (int i = 0; i < options.dim; i++) {\n    h_vectors_test[2 * options.dim + i] = h_vectors_base[72 * options.dim + i];\n    h_vectors_test[3 * options.dim + i] = h_vectors_base[73 * options.dim + i];\n  }\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  size_t total_size = 0;\n  size_t dump_counter = 0;\n  for 
(int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      S start_ts = test_util::host_nano<S>(stream);\n      table->find_or_insert(BASE_KEY_NUM, d_keys_temp, d_vectors_temp, nullptr,\n                            stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      S end_ts = test_util::host_nano<S>(stream);\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      std::vector<S> h_scores_temp_sorted(h_scores_temp);\n      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());\n\n      
ASSERT_GE(h_scores_temp_sorted[0], start_ts);\n      ASSERT_LE(h_scores_temp_sorted[TEST_KEY_NUM - 1], end_ts);\n      for (int i = 0; i < dump_counter; i++) {\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),\n                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),\n                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),\n                            TEST_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      table->find_or_insert(TEST_KEY_NUM, d_keys_temp, d_vectors_temp, nullptr,\n                            stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      S start_ts = test_util::host_nano<S>(stream);\n      table->assign(TEST_KEY_NUM, d_keys_temp, nullptr, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      S end_ts = test_util::host_nano<S>(stream);\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            TEMP_KEY_NUM * 
sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      std::vector<S> h_scores_temp_sorted(TEST_KEY_NUM);\n      int ctr = 0;\n      for (int i = 0; i < TEMP_KEY_NUM; i++) {\n        if (h_keys_test.end() !=\n            std::find(h_keys_test.begin(), h_keys_test.end(), h_keys_temp[i])) {\n          ASSERT_GT(h_scores_temp[i], BUCKET_MAX_SIZE);\n          h_scores_temp_sorted[ctr++] = h_scores_temp[i];\n        } else {\n          ASSERT_LE(h_scores_temp[i], start_ts);\n        }\n      }\n      std::sort(h_scores_temp_sorted.begin(),\n                h_scores_temp_sorted.begin() + ctr);\n\n      ASSERT_GE(h_scores_temp_sorted[0], start_ts);\n      ASSERT_LE(h_scores_temp_sorted[ctr - 1], end_ts);\n      for (int i = 0; i < dump_counter; i++) {\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFree(d_keys_temp));\n  CUDA_CHECK(cudaFree(d_scores_temp));\n  CUDA_CHECK(cudaFree(d_vectors_temp));\n\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_evict_strategy_lfu_basic(size_t max_hbm_for_vectors,\n                                   int key_start = 0) {\n  constexpr uint64_t BUCKET_NUM = 8UL;\n  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;\n  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;\n  constexpr uint64_t TEST_KEY_NUM = 4;\n  constexpr uint64_t TEMP_KEY_NUM =\n      (BASE_KEY_NUM > TEST_KEY_NUM) ? 
BASE_KEY_NUM : TEST_KEY_NUM;\n  constexpr uint64_t TEST_TIMES = 128;\n\n  TableOptions options;\n\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLfu>;\n\n  std::vector<K> h_keys_base(BASE_KEY_NUM);\n  std::vector<S> h_scores_base(BASE_KEY_NUM);\n  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_test(TEST_KEY_NUM);\n  std::vector<S> h_scores_test(TEST_KEY_NUM);\n  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_temp(TEMP_KEY_NUM);\n  std::vector<S> h_scores_temp(TEMP_KEY_NUM);\n  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);\n\n  K* d_keys_temp;\n  S* d_scores_temp = nullptr;\n  V* d_vectors_temp;\n\n  int freq_range = 1000;\n\n  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));\n  CUDA_CHECK(\n      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));\n\n  test_util::create_keys_in_one_buckets_lfu<K, S, V, DIM>(\n      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),\n      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF,\n      freq_range);\n\n  test_util::create_keys_in_one_buckets_lfu<K, S, V, DIM>(\n      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),\n      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,\n      0xFFFFFFFFFFFFFFFD, freq_range);\n\n  h_keys_test[2] = h_keys_base[72];\n  h_keys_test[3] = h_keys_base[73];\n\n  h_scores_test[2] = h_keys_base[72] % freq_range;\n  h_scores_test[3] = h_keys_base[73] % freq_range;\n\n  for (int i = 0; i < options.dim; i++) {\n    h_vectors_test[2 * options.dim + i] = h_vectors_base[72 * options.dim + i];\n    h_vectors_test[3 * options.dim + i] = 
h_vectors_base[73 * options.dim + i];\n  }\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  size_t total_size = 0;\n  size_t dump_counter = 0;\n  S global_epoch = 1;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      table->find_or_insert(BASE_KEY_NUM, d_keys_temp, d_vectors_temp,\n                            d_scores_temp, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      for (int i = 0; i < dump_counter; i++) {\n        
ASSERT_EQ(h_scores_temp[i], h_keys_temp[i] % freq_range);\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n\n    {\n      global_epoch++;\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),\n                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),\n                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),\n                            TEST_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n\n      table->assign(TEST_KEY_NUM, d_keys_temp, d_scores_temp, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      table->find_or_insert(TEST_KEY_NUM, d_keys_temp, d_vectors_temp,\n                            d_scores_temp, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            TEMP_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      for (int i = 0; i < dump_counter; i++) {\n        bool in_base =\n            
h_keys_base.end() !=\n            std::find(h_keys_base.begin(), h_keys_base.end(), h_keys_temp[i]);\n        bool in_test =\n            h_keys_test.end() !=\n            std::find(h_keys_test.begin(), h_keys_test.end(), h_keys_temp[i]);\n        if (in_base && in_test) {\n          ASSERT_EQ(h_scores_temp[i], (h_keys_temp[i] % freq_range) *\n                                          3);  // will update score when found.\n        } else {\n          ASSERT_EQ(h_scores_temp[i], (h_keys_temp[i] % freq_range));\n        }\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFree(d_keys_temp));\n  CUDA_CHECK(cudaFree(d_scores_temp));\n  CUDA_CHECK(cudaFree(d_vectors_temp));\n\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_evict_strategy_epochlru_basic(size_t max_hbm_for_vectors,\n                                        int key_start = 0) {\n  constexpr int RSHIFT_ON_NANO = 20;\n\n  constexpr uint64_t BUCKET_NUM = 8UL;\n  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;\n  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;\n  constexpr uint64_t TEST_KEY_NUM = 4;\n  constexpr uint64_t TEMP_KEY_NUM =\n      (BASE_KEY_NUM > TEST_KEY_NUM) ? 
BASE_KEY_NUM : TEST_KEY_NUM;\n  constexpr uint64_t TEST_TIMES = 128;\n\n  TableOptions options;\n\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kEpochLru>;\n\n  std::vector<K> h_keys_base(BASE_KEY_NUM);\n  std::vector<S> h_scores_base(BASE_KEY_NUM);\n  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_test(TEST_KEY_NUM);\n  std::vector<S> h_scores_test(TEST_KEY_NUM);\n  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_temp(TEMP_KEY_NUM);\n  std::vector<S> h_scores_temp(TEMP_KEY_NUM);\n  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);\n\n  K* d_keys_temp;\n  S* d_scores_temp = nullptr;\n  V* d_vectors_temp;\n\n  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));\n  CUDA_CHECK(\n      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));\n\n  test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),\n      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF);\n\n  test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),\n      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,\n      0xFFFFFFFFFFFFFFFD);\n\n  h_keys_test[2] = h_keys_base[72];\n  h_keys_test[3] = h_keys_base[73];\n\n  for (int i = 0; i < options.dim; i++) {\n    h_vectors_test[2 * options.dim + i] = h_vectors_base[72 * options.dim + i];\n    h_vectors_test[3 * options.dim + i] = h_vectors_base[73 * options.dim + i];\n  }\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  size_t total_size = 0;\n  size_t dump_counter = 0;\n  
S global_epoch = 1;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      S start_ts =\n          (test_util::host_nano<S>(stream) >> RSHIFT_ON_NANO) & 0xFFFFFFFF;\n      table->set_global_epoch(global_epoch);\n      table->find_or_insert(BASE_KEY_NUM, d_keys_temp, d_vectors_temp, nullptr,\n                            stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      S end_ts =\n          (test_util::host_nano<S>(stream) >> RSHIFT_ON_NANO) & 0xFFFFFFFF;\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n 
     std::vector<S> h_scores_temp_sorted(h_scores_temp);\n      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());\n\n      ASSERT_GE(h_scores_temp_sorted[0], (global_epoch << 32 | start_ts));\n      ASSERT_LE(h_scores_temp_sorted[BASE_KEY_NUM - 1],\n                (global_epoch << 32 | end_ts));\n      for (int i = 0; i < dump_counter; i++) {\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n\n    {\n      global_epoch++;\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),\n                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),\n                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),\n                            TEST_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      S start_ts =\n          (test_util::host_nano<S>(stream) >> RSHIFT_ON_NANO) & 0xFFFFFFFF;\n      table->set_global_epoch(global_epoch);\n      table->assign(TEST_KEY_NUM, d_keys_temp, nullptr, stream);\n      table->find_or_insert(TEST_KEY_NUM, d_keys_temp, d_vectors_temp, nullptr,\n                            stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      S end_ts =\n          (test_util::host_nano<S>(stream) >> RSHIFT_ON_NANO) & 0xFFFFFFFF;\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                        
    TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            TEMP_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      std::vector<S> h_scores_temp_sorted(TEST_KEY_NUM);\n      int ctr = 0;\n      for (int i = 0; i < TEMP_KEY_NUM; i++) {\n        if (h_keys_test.end() !=\n            std::find(h_keys_test.begin(), h_keys_test.end(), h_keys_temp[i])) {\n          ASSERT_GE(h_scores_temp[i], (global_epoch << 32 | start_ts));\n          h_scores_temp_sorted[ctr++] = h_scores_temp[i];\n        } else {\n          ASSERT_LE(h_scores_temp[i], (global_epoch << 32 | start_ts));\n        }\n      }\n      std::sort(h_scores_temp_sorted.begin(),\n                h_scores_temp_sorted.begin() + ctr);\n\n      ASSERT_GE(h_scores_temp_sorted[0], (global_epoch << 32 | start_ts));\n      ASSERT_LE(h_scores_temp_sorted[ctr - 1], (global_epoch << 32 | end_ts));\n      for (int i = 0; i < dump_counter; i++) {\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFree(d_keys_temp));\n  CUDA_CHECK(cudaFree(d_scores_temp));\n  CUDA_CHECK(cudaFree(d_vectors_temp));\n\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors,\n                                        int key_start = 0) {\n  constexpr uint64_t BUCKET_NUM = 8UL;\n  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;\n  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t BASE_KEY_NUM 
= BUCKET_MAX_SIZE;\n  constexpr uint64_t TEST_KEY_NUM = 4;\n  constexpr uint64_t TEMP_KEY_NUM =\n      (BASE_KEY_NUM > TEST_KEY_NUM) ? BASE_KEY_NUM : TEST_KEY_NUM;\n  constexpr uint64_t TEST_TIMES = 128;\n\n  TableOptions options;\n\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kEpochLfu>;\n\n  std::vector<K> h_keys_base(BASE_KEY_NUM);\n  std::vector<S> h_scores_base(BASE_KEY_NUM);\n  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_test(TEST_KEY_NUM);\n  std::vector<S> h_scores_test(TEST_KEY_NUM);\n  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_temp(TEMP_KEY_NUM);\n  std::vector<S> h_scores_temp(TEMP_KEY_NUM);\n  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);\n\n  K* d_keys_temp;\n  S* d_scores_temp = nullptr;\n  V* d_vectors_temp;\n\n  int freq_range = 1000;\n\n  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));\n  CUDA_CHECK(\n      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));\n\n  test_util::create_keys_in_one_buckets_lfu<K, S, V, DIM>(\n      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),\n      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF,\n      freq_range);\n\n  test_util::create_keys_in_one_buckets_lfu<K, S, V, DIM>(\n      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),\n      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,\n      0xFFFFFFFFFFFFFFFD, freq_range);\n\n  // Simulate overflow of low 32bits.\n  h_scores_base[71] = static_cast<S>(std::numeric_limits<uint32_t>::max() -\n                                     static_cast<uint32_t>(1));\n\n  h_keys_test[1] = 
h_keys_base[71];\n  h_keys_test[2] = h_keys_base[72];\n  h_keys_test[3] = h_keys_base[73];\n\n  h_scores_test[1] = h_scores_base[71];\n  h_scores_test[2] = h_keys_base[72] % freq_range;\n  h_scores_test[3] = h_keys_base[73] % freq_range;\n\n  for (int i = 0; i < options.dim; i++) {\n    h_vectors_test[1 * options.dim + i] = h_vectors_base[71 * options.dim + i];\n    h_vectors_test[2 * options.dim + i] = h_vectors_base[72 * options.dim + i];\n    h_vectors_test[3 * options.dim + i] = h_vectors_base[73 * options.dim + i];\n  }\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  size_t total_size = 0;\n  size_t dump_counter = 0;\n  S global_epoch = 1;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      table->set_global_epoch(global_epoch);\n      table->find_or_insert(BASE_KEY_NUM, d_keys_temp, d_vectors_temp,\n                            d_scores_temp, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, 
BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      for (int i = 0; i < dump_counter; i++) {\n        if (h_keys_temp[i] == h_keys_base[71]) {\n          S expected_score = test_util::make_expected_score_for_epochlfu<S>(\n              global_epoch, h_scores_base[71]);\n          ASSERT_EQ(h_scores_temp[i], expected_score);\n        } else {\n          S expected_score = test_util::make_expected_score_for_epochlfu<S>(\n              global_epoch, (h_keys_temp[i] % freq_range));\n          ASSERT_EQ(h_scores_temp[i], expected_score);\n        }\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n\n    {\n      global_epoch++;\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),\n                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),\n                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),\n                            TEST_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      table->set_global_epoch(global_epoch);\n      table->assign(TEST_KEY_NUM, d_keys_temp, d_scores_temp, stream);\n      table->find_or_insert(TEST_KEY_NUM, d_keys_temp, d_vectors_temp,\n                            d_scores_temp, stream);\n      
CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            TEMP_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      ASSERT_TRUE(h_keys_temp.end() != std::find(h_keys_temp.begin(),\n                                                 h_keys_temp.end(),\n                                                 h_keys_base[71]));\n\n      for (int i = 0; i < dump_counter; i++) {\n        bool in_base =\n            h_keys_base.end() !=\n            std::find(h_keys_base.begin(), h_keys_base.end(), h_keys_temp[i]);\n        bool in_test =\n            h_keys_test.end() !=\n            std::find(h_keys_test.begin(), h_keys_test.end(), h_keys_temp[i]);\n\n        if (in_base && in_test) {\n          if (h_keys_temp[i] == h_keys_base[71]) {\n            S expected_score = test_util::make_expected_score_for_epochlfu<S>(\n                global_epoch, h_scores_base[71] * 2);\n            ASSERT_EQ(h_scores_temp[i], expected_score);\n          } else {\n            S expected_score = test_util::make_expected_score_for_epochlfu<S>(\n                global_epoch, (h_keys_temp[i] % freq_range) *\n                                  3);  // will update score when found.\n            ASSERT_EQ(h_scores_temp[i], expected_score);\n          }\n    
    } else {\n          if (h_keys_temp[i] == h_keys_base[71]) {\n            S expected_score = test_util::make_expected_score_for_epochlfu<S>(\n                global_epoch - static_cast<S>(in_base), h_scores_base[71]);\n            ASSERT_EQ(h_scores_temp[i], expected_score);\n          } else {\n            S expected_score = test_util::make_expected_score_for_epochlfu<S>(\n                global_epoch - static_cast<S>(in_base),\n                (h_keys_temp[i] % freq_range));\n\n            ASSERT_EQ(h_scores_temp[i], expected_score);\n          }\n        }\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFree(d_keys_temp));\n  CUDA_CHECK(cudaFree(d_scores_temp));\n  CUDA_CHECK(cudaFree(d_vectors_temp));\n\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_evict_strategy_customized_basic(size_t max_hbm_for_vectors,\n                                          int key_start = 0) {\n  constexpr uint64_t BUCKET_NUM = 8UL;\n  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;\n  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;\n  constexpr uint64_t TEST_KEY_NUM = 128;\n  constexpr uint64_t TEMP_KEY_NUM =\n      (BASE_KEY_NUM > TEST_KEY_NUM) ? 
BASE_KEY_NUM : TEST_KEY_NUM;\n  constexpr uint64_t TEST_TIMES = 128;\n\n  TableOptions options;\n\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  std::vector<K> h_keys_base(BASE_KEY_NUM);\n  std::vector<S> h_scores_base(BASE_KEY_NUM);\n  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_test(TEST_KEY_NUM);\n  std::vector<S> h_scores_test(TEST_KEY_NUM);\n  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_temp(TEMP_KEY_NUM);\n  std::vector<S> h_scores_temp(TEMP_KEY_NUM);\n  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);\n\n  K* d_keys_temp;\n  S* d_scores_temp = nullptr;\n  V* d_vectors_temp;\n\n  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));\n  CUDA_CHECK(\n      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));\n\n  test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),\n      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF);\n\n  const S base_score_start = 1000;\n  for (int i = 0; i < BASE_KEY_NUM; i++) {\n    h_scores_base[i] = base_score_start + i;\n  }\n\n  test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),\n      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,\n      0xFFFFFFFFFFFFFFFD);\n  const S test_score_start = base_score_start + BASE_KEY_NUM;\n  for (int i = 0; i < TEST_KEY_NUM; i++) {\n    h_scores_test[i] = test_score_start + i;\n  }\n  for (int i = 64; i < TEST_KEY_NUM; i++) {\n    h_keys_test[i] = h_keys_base[i];\n    for (int j = 0; j < options.dim; 
j++) {\n      h_vectors_test[i * options.dim + j] = h_vectors_base[i * options.dim + j];\n    }\n  }\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  size_t total_size = 0;\n  size_t dump_counter = 0;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      table->find_or_insert(BASE_KEY_NUM, d_keys_temp, d_vectors_temp,\n                            d_scores_temp, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      std::vector<S> 
h_scores_temp_sorted(h_scores_temp);\n      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());\n\n      auto expected_range = test_util::range<S, TEMP_KEY_NUM>(base_score_start);\n      ASSERT_TRUE(std::equal(h_scores_temp_sorted.begin(),\n                             h_scores_temp_sorted.end(),\n                             expected_range.begin()));\n      for (int i = 0; i < dump_counter; i++) {\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),\n                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),\n                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),\n                            TEST_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      table->assign(TEST_KEY_NUM, d_keys_temp, d_scores_temp, stream);\n      table->find_or_insert(TEST_KEY_NUM, d_keys_temp, d_vectors_temp,\n                            d_scores_temp, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n 
     CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            TEMP_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      std::vector<S> h_scores_temp_sorted(h_scores_temp);\n      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());\n\n      auto expected_range_test =\n          test_util::range<S, TEST_KEY_NUM>(test_score_start);\n      ASSERT_TRUE(std::equal(h_scores_temp_sorted.begin(),\n                             h_scores_temp_sorted.end(),\n                             expected_range_test.begin()));\n      for (int i = 0; i < dump_counter; i++) {\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFree(d_keys_temp));\n  CUDA_CHECK(cudaFree(d_scores_temp));\n  CUDA_CHECK(cudaFree(d_vectors_temp));\n\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_evict_strategy_customized_advanced(size_t max_hbm_for_vectors,\n                                             int key_start = 0) {\n  constexpr uint64_t BUCKET_NUM = 8UL;\n  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;\n  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;\n  constexpr uint64_t TEST_KEY_NUM = 8;\n  constexpr uint64_t TEMP_KEY_NUM =\n      (BASE_KEY_NUM > TEST_KEY_NUM) ? 
BASE_KEY_NUM : TEST_KEY_NUM;\n  constexpr uint64_t TEST_TIMES = 256;\n\n  TableOptions options;\n\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  std::vector<K> h_keys_base(BASE_KEY_NUM);\n  std::vector<S> h_scores_base(BASE_KEY_NUM);\n  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_test(TEST_KEY_NUM);\n  std::vector<S> h_scores_test(TEST_KEY_NUM);\n  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_temp(TEMP_KEY_NUM);\n  std::vector<S> h_scores_temp(TEMP_KEY_NUM);\n  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);\n\n  K* d_keys_temp;\n  S* d_scores_temp = nullptr;\n  V* d_vectors_temp;\n\n  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));\n  CUDA_CHECK(\n      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));\n\n  test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),\n      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF);\n\n  const S base_score_start = 1000;\n  for (int i = 0; i < BASE_KEY_NUM; i++) {\n    h_scores_base[i] = base_score_start + i;\n  }\n\n  test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),\n      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,\n      0xFFFFFFFFFFFFFFFD);\n\n  h_keys_test[4] = h_keys_base[72];\n  h_keys_test[5] = h_keys_base[73];\n  h_keys_test[6] = h_keys_base[74];\n  h_keys_test[7] = h_keys_base[75];\n\n  // replace four new keys to lower scores, would not be inserted.\n  h_scores_test[0] = 20;\n  h_scores_test[1] = 78;\n  
h_scores_test[2] = 97;\n  h_scores_test[3] = 98;\n\n  // replace three exist keys to new scores, just refresh the score for them.\n  h_scores_test[4] = 99;\n  h_scores_test[5] = 1010;\n  h_scores_test[6] = 1020;\n  h_scores_test[7] = 1035;\n\n  for (int i = 4; i < TEST_KEY_NUM; i++) {\n    for (int j = 0; j < options.dim; j++) {\n      h_vectors_test[i * options.dim + j] =\n          static_cast<V>(h_keys_test[i] * 0.00001);\n    }\n  }\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  size_t total_size = 0;\n  size_t dump_counter = 0;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      table->find_or_insert(BASE_KEY_NUM, d_keys_temp, d_vectors_temp,\n                            d_scores_temp, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      
CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      std::vector<S> h_scores_temp_sorted(h_scores_temp);\n      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());\n\n      auto expected_range = test_util::range<S, TEMP_KEY_NUM>(base_score_start);\n      ASSERT_TRUE(std::equal(h_scores_temp_sorted.begin(),\n                             h_scores_temp_sorted.end(),\n                             expected_range.begin()));\n      for (int i = 0; i < dump_counter; i++) {\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),\n                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),\n                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),\n                            TEST_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      table->assign(TEST_KEY_NUM, d_keys_temp, d_scores_temp, stream);\n      table->find_or_insert(TEST_KEY_NUM, d_keys_temp, d_vectors_temp,\n                            d_scores_temp, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, 
d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            TEMP_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      for (int i = 0; i < TEST_KEY_NUM; i++) {\n        if (i < 4) {\n          ASSERT_EQ(h_keys_temp.end(),\n                    std::find(h_keys_temp.begin(), h_keys_temp.end(),\n                              h_keys_test[i]));\n        } else {\n          ASSERT_NE(h_keys_temp.end(),\n                    std::find(h_keys_temp.begin(), h_keys_temp.end(),\n                              h_keys_test[i]));\n        }\n      }\n      for (int i = 0; i < TEMP_KEY_NUM; i++) {\n        if (h_keys_temp[i] == h_keys_test[4])\n          ASSERT_EQ(h_scores_temp[i], h_scores_test[4]);\n        if (h_keys_temp[i] == h_keys_test[5])\n          ASSERT_EQ(h_scores_temp[i], h_scores_test[5]);\n        if (h_keys_temp[i] == h_keys_test[6])\n          ASSERT_EQ(h_scores_temp[i], h_scores_test[6]);\n        if (h_keys_temp[i] == h_keys_test[7])\n          ASSERT_EQ(h_scores_temp[i], h_scores_test[7]);\n\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFree(d_keys_temp));\n  CUDA_CHECK(cudaFree(d_scores_temp));\n  CUDA_CHECK(cudaFree(d_vectors_temp));\n\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\ntemplate <typename K, typename V, typename S, typename Table, size_t dim = 64>\nvoid CheckAssignOnEpochLfu(Table* 
table,\n                           test_util::KVMSBuffer<K, V, S>* data_buffer,\n                           test_util::KVMSBuffer<K, V, S>* evict_buffer,\n                           test_util::KVMSBuffer<K, V, S>* pre_data_buffer,\n                           size_t len, cudaStream_t stream, TableOptions& opt,\n                           unsigned int global_epoch) {\n  std::map<K, test_util::ValueArray<V, dim>> values_map_before_insert;\n  std::map<K, test_util::ValueArray<V, dim>> values_map_after_insert;\n\n  std::unordered_map<K, S> scores_map_before_insert;\n  std::map<K, S> scores_map_after_insert;\n\n  std::map<K, S> scores_map_current_batch;\n  std::map<K, S> scores_map_current_evict;\n\n  K* keys = data_buffer->keys_ptr();\n  V* values = data_buffer->values_ptr();\n  S* scores = data_buffer->scores_ptr();\n\n  K* evicted_keys = evict_buffer->keys_ptr();\n  V* evicted_values = evict_buffer->values_ptr();\n  S* evicted_scores = evict_buffer->scores_ptr();\n\n  for (size_t i = 0; i < len; i++) {\n    scores_map_current_batch[data_buffer->keys_ptr(false)[i]] =\n        data_buffer->scores_ptr(false)[i];\n  }\n\n  K* h_tmp_keys = nullptr;\n  V* h_tmp_values = nullptr;\n  S* h_tmp_scores = nullptr;\n  bool* h_tmp_founds = nullptr;\n\n  K* d_tmp_keys = nullptr;\n  V* d_tmp_values = nullptr;\n  S* d_tmp_scores = nullptr;\n  bool* d_tmp_founds = nullptr;\n\n  size_t table_size_before = table->size(stream);\n  size_t cap = table_size_before + len;\n\n  CUDA_CHECK(cudaMallocAsync(&d_tmp_keys, cap * sizeof(K), stream));\n  CUDA_CHECK(cudaMemsetAsync(d_tmp_keys, 0, cap * sizeof(K), stream));\n  CUDA_CHECK(cudaMallocAsync(&d_tmp_values, cap * dim * sizeof(V), stream));\n  CUDA_CHECK(cudaMemsetAsync(d_tmp_values, 0, cap * dim * sizeof(V), stream));\n  CUDA_CHECK(cudaMallocAsync(&d_tmp_scores, cap * sizeof(S), stream));\n  CUDA_CHECK(cudaMemsetAsync(d_tmp_scores, 0, cap * sizeof(S), stream));\n  CUDA_CHECK(cudaMallocAsync(&d_tmp_founds, cap * sizeof(bool), stream));\n  
CUDA_CHECK(cudaMemsetAsync(d_tmp_founds, 0, cap * sizeof(bool), stream));\n  h_tmp_keys = (K*)malloc(cap * sizeof(K));\n  h_tmp_values = (V*)malloc(cap * dim * sizeof(V));\n  h_tmp_scores = (S*)malloc(cap * sizeof(S));\n  h_tmp_founds = (bool*)malloc(cap * sizeof(bool));\n\n  size_t table_size_verify0 = table->export_batch(\n      table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);\n  ASSERT_EQ(table_size_before, table_size_verify0);\n\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,\n                             table_size_before * sizeof(K),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,\n                             table_size_before * dim * sizeof(V),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,\n                             table_size_before * sizeof(S),\n                             cudaMemcpyDeviceToHost, stream));\n\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys + table_size_before, keys,\n                             len * sizeof(K), cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values + table_size_before * dim, values,\n                             len * dim * sizeof(V), cudaMemcpyDeviceToHost,\n                             stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores + table_size_before, scores,\n                             len * sizeof(S), cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n\n  for (size_t i = 0; i < cap; i++) {\n    test_util::ValueArray<V, dim>* vec =\n        reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +\n                                                         i * dim);\n    values_map_before_insert[h_tmp_keys[i]] = *vec;\n  }\n\n  for (size_t i = 0; i < table_size_before; i++) {\n    scores_map_before_insert[h_tmp_keys[i]] = h_tmp_scores[i];\n  }\n\n  
table->set_global_epoch(global_epoch);\n  table->assign(len, keys, scores, stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n\n  {\n    size_t table_size_verify1 = table->export_batch(\n        table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);\n\n    CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,\n                               table_size_before * sizeof(K),\n                               cudaMemcpyDeviceToHost, stream));\n    CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,\n                               table_size_before * dim * sizeof(V),\n                               cudaMemcpyDeviceToHost, stream));\n    CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,\n                               table_size_before * sizeof(S),\n                               cudaMemcpyDeviceToHost, stream));\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(table_size_verify1, table_size_before);\n\n    size_t score_error_cnt = 0;\n\n    for (int64_t i = table_size_before - 1; i >= 0; i--) {\n      test_util::ValueArray<V, dim>* vec =\n          reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +\n                                                           i * dim);\n      values_map_after_insert[h_tmp_keys[i]] = *vec;\n      scores_map_after_insert[h_tmp_keys[i]] = h_tmp_scores[i];\n    }\n\n    for (auto it : scores_map_current_batch) {\n      const K key = it.first;\n      const K score = it.second;\n      S current_score = scores_map_after_insert[key];\n      S score_before_insert = 0;\n      if (scores_map_before_insert.find(key) !=\n          scores_map_before_insert.end()) {\n        score_before_insert = scores_map_before_insert[key];\n        bool valid =\n            ((current_score >> 32) == global_epoch) &&\n            ((current_score & 0xFFFFFFFF) ==\n             ((0xFFFFFFFF & score_before_insert) + (0xFFFFFFFF & score)));\n\n        if (!valid) {\n          score_error_cnt++;\n        }\n      }\n    
}\n    std::cout << \"Check assign behavior got \"\n              << \", score_error_cnt: \" << score_error_cnt\n              << \", while len: \" << len << std::endl;\n    ASSERT_EQ(score_error_cnt, 0);\n  }\n\n  for (int64_t i = 0; i < table_size_before; i++) {\n    values_map_before_insert[h_tmp_keys[i]] =\n        values_map_after_insert[h_tmp_keys[i]];\n    scores_map_before_insert[h_tmp_keys[i]] =\n        scores_map_after_insert[h_tmp_keys[i]];\n  }\n  values_map_after_insert.clear();\n  scores_map_after_insert.clear();\n\n  auto start = std::chrono::steady_clock::now();\n  size_t filtered_len = table->insert_and_evict(\n      len, keys, values,\n      (Table::evict_strategy == EvictStrategy::kLru ||\n       Table::evict_strategy == EvictStrategy::kEpochLru)\n          ? nullptr\n          : scores,\n      evicted_keys, evicted_values, evicted_scores, stream);\n  evict_buffer->SyncData(false, stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  auto end = std::chrono::steady_clock::now();\n  auto diff = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);\n\n  for (size_t i = 0; i < filtered_len; i++) {\n    scores_map_current_evict[evict_buffer->keys_ptr(false)[i]] =\n        evict_buffer->scores_ptr(false)[i];\n  }\n\n  float dur = diff.count();\n\n  size_t table_size_after = table->size(stream);\n  size_t table_size_verify1 = table->export_batch(\n      table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);\n\n  ASSERT_EQ(table_size_verify1, table_size_after);\n\n  size_t new_cap = table_size_after + filtered_len;\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,\n                             table_size_after * sizeof(K),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,\n                             table_size_after * dim * sizeof(V),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, 
d_tmp_scores,\n                             table_size_after * sizeof(S),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys + table_size_after, evicted_keys,\n                             filtered_len * sizeof(K), cudaMemcpyDeviceToHost,\n                             stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values + table_size_after * dim,\n                             evicted_values, filtered_len * dim * sizeof(V),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores + table_size_after, evicted_scores,\n                             filtered_len * sizeof(S), cudaMemcpyDeviceToHost,\n                             stream));\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n\n  size_t key_miss_cnt = 0;\n  size_t value_diff_cnt = 0;\n  size_t score_error_cnt1 = 0;\n  size_t score_error_cnt2 = 0;\n\n  for (int64_t i = new_cap - 1; i >= 0; i--) {\n    test_util::ValueArray<V, dim>* vec =\n        reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +\n                                                         i * dim);\n    values_map_after_insert[h_tmp_keys[i]] = *vec;\n    scores_map_after_insert[h_tmp_keys[i]] = h_tmp_scores[i];\n    if (i >= (new_cap - filtered_len)) {\n      bool valid = ((h_tmp_scores[i] >> 32) < (global_epoch - 2));\n      if (!valid) {\n        score_error_cnt1++;\n      }\n    }\n  }\n\n  for (auto it : scores_map_current_batch) {\n    const K key = it.first;\n    const K score = it.second;\n    S current_score = scores_map_after_insert[key];\n    S score_before_insert = 0;\n    if (values_map_after_insert.find(key) != values_map_after_insert.end() &&\n        scores_map_current_evict.find(key) == scores_map_current_evict.end()) {\n      score_before_insert = scores_map_before_insert[key];\n    }\n    bool valid = ((current_score >> 32) == global_epoch) &&\n                 ((current_score & 0xFFFFFFFF) ==\n                  
((0xFFFFFFFF & score_before_insert) + (0xFFFFFFFF & score)));\n\n    if (!valid) {\n      score_error_cnt2++;\n    }\n  }\n\n  for (auto& it : values_map_before_insert) {\n    if (values_map_after_insert.find(it.first) ==\n        values_map_after_insert.end()) {\n      ++key_miss_cnt;\n      continue;\n    }\n    test_util::ValueArray<V, dim>& vec0 = it.second;\n    test_util::ValueArray<V, dim>& vec1 = values_map_after_insert.at(it.first);\n    for (size_t j = 0; j < dim; j++) {\n      if (vec0[j] != vec1[j]) {\n        ++value_diff_cnt;\n        break;\n      }\n    }\n  }\n\n  std::cout << \"Check insert_and_evict behavior got \"\n            << \"key_miss_cnt: \" << key_miss_cnt\n            << \", value_diff_cnt: \" << value_diff_cnt\n            << \", score_error_cnt1: \" << score_error_cnt1\n            << \", score_error_cnt2: \" << score_error_cnt2\n            << \", while table_size_before: \" << table_size_before\n            << \", while table_size_after: \" << table_size_after\n            << \", while len: \" << len << std::endl;\n\n  ASSERT_EQ(key_miss_cnt, 0);\n  ASSERT_EQ(value_diff_cnt, 0);\n  ASSERT_EQ(score_error_cnt1, 0);\n  ASSERT_EQ(score_error_cnt2, 0);\n\n  CUDA_CHECK(cudaFreeAsync(d_tmp_keys, stream));\n  CUDA_CHECK(cudaFreeAsync(d_tmp_values, stream));\n  CUDA_CHECK(cudaFreeAsync(d_tmp_scores, stream));\n  CUDA_CHECK(cudaFreeAsync(d_tmp_founds, stream));\n  free(h_tmp_keys);\n  free(h_tmp_values);\n  free(h_tmp_scores);\n  free(h_tmp_founds);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n}\n\nvoid test_assign_advanced_on_epochlfu(size_t max_hbm_for_vectors) {\n  const size_t U = 1024 * 1024;\n  const size_t B = 100000;\n  constexpr size_t dim = 16;\n\n  TableOptions opt;\n\n  opt.max_capacity = U;\n  opt.init_capacity = U;\n  opt.max_hbm_for_vectors = U * dim * sizeof(V);\n  opt.max_bucket_size = 128;\n  opt.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, 
EvictStrategy::kEpochLfu>;\n  opt.dim = dim;\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  std::unique_ptr<Table> table = std::make_unique<Table>();\n  table->init(opt);\n\n  test_util::KVMSBuffer<K, V, S> evict_buffer;\n  evict_buffer.Reserve(B, dim, stream);\n  evict_buffer.ToZeros(stream);\n\n  test_util::KVMSBuffer<K, V, S> data_buffer;\n  test_util::KVMSBuffer<K, V, S> pre_data_buffer;\n  data_buffer.Reserve(B, dim, stream);\n  pre_data_buffer.Reserve(B, dim, stream);\n\n  size_t offset = 0;\n  int freq_range = 100;\n  float repeat_rate = 0.9;\n  for (unsigned int global_epoch = 1; global_epoch <= 20; global_epoch++) {\n    repeat_rate = global_epoch <= 1 ? 0.0 : 0.1;\n    if (global_epoch <= 1) {\n      test_util::create_random_keys_advanced<K, S, V>(\n          dim, data_buffer.keys_ptr(false), data_buffer.scores_ptr(false),\n          data_buffer.values_ptr(false), (int)B, B * 32, freq_range);\n    } else {\n      test_util::create_random_keys_advanced<K, S, V>(\n          dim, data_buffer.keys_ptr(false), pre_data_buffer.keys_ptr(false),\n          data_buffer.scores_ptr(false), data_buffer.values_ptr(false), (int)B,\n          B * 32, freq_range, repeat_rate);\n    }\n    data_buffer.SyncData(true, stream);\n    if (global_epoch <= 1) {\n      pre_data_buffer.CopyFrom(data_buffer, stream);\n    }\n\n    CheckAssignOnEpochLfu<K, V, S, Table, dim>(table.get(), &data_buffer,\n                                               &evict_buffer, &pre_data_buffer,\n                                               B, stream, opt, global_epoch);\n\n    pre_data_buffer.CopyFrom(data_buffer, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    offset += B;\n  }\n}\n\nvoid test_evict_strategy_customized_correct_rate(size_t max_hbm_for_vectors,\n                                                 int key_start = 0) {\n  constexpr uint64_t BATCH_SIZE = 1024 * 1024ul;\n  constexpr uint64_t STEPS = 128;\n  constexpr uint64_t MAX_BUCKET_SIZE = 
128;\n  constexpr uint64_t INIT_CAPACITY = BATCH_SIZE * STEPS;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t TEST_TIMES = 1;\n  float expected_correct_rate = 0.964;\n  const int rounds = 12;\n\n  TableOptions options;\n\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_bucket_size = MAX_BUCKET_SIZE;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  K* h_keys_base = test_util::HostBuffer<K>(BATCH_SIZE).ptr();\n  S* h_scores_base = test_util::HostBuffer<S>(BATCH_SIZE).ptr();\n  V* h_vectors_base = test_util::HostBuffer<V>(BATCH_SIZE * options.dim).ptr();\n\n  K* h_keys_temp = test_util::HostBuffer<K>(MAX_CAPACITY).ptr();\n  S* h_scores_temp = test_util::HostBuffer<S>(MAX_CAPACITY).ptr();\n  V* h_vectors_temp =\n      test_util::HostBuffer<V>(MAX_CAPACITY * options.dim).ptr();\n\n  K* d_keys_temp;\n  S* d_scores_temp = nullptr;\n  V* d_vectors_temp;\n\n  CUDA_CHECK(cudaMalloc(&d_keys_temp, MAX_CAPACITY * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores_temp, MAX_CAPACITY * sizeof(S)));\n  CUDA_CHECK(\n      cudaMalloc(&d_vectors_temp, MAX_CAPACITY * sizeof(V) * options.dim));\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  size_t total_size = 0;\n  size_t global_start_key = 100000;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n    size_t start_key = global_start_key;\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    for (int r = 0; r < rounds; r++) {\n      size_t expected_min_key = global_start_key + INIT_CAPACITY * r;\n      size_t expected_max_key = global_start_key + INIT_CAPACITY * (r + 1) - 1;\n      size_t expected_table_size =\n       
   (r == 0) ? size_t(expected_correct_rate * INIT_CAPACITY)\n                   : INIT_CAPACITY;\n\n      for (int s = 0; s < STEPS; s++) {\n        test_util::create_continuous_keys<K, S, V, DIM>(\n            h_keys_base, h_scores_base, h_vectors_base, BATCH_SIZE, start_key);\n        start_key += BATCH_SIZE;\n\n        CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base, BATCH_SIZE * sizeof(K),\n                              cudaMemcpyHostToDevice));\n        CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base,\n                              BATCH_SIZE * sizeof(S), cudaMemcpyHostToDevice));\n        CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base,\n                              BATCH_SIZE * sizeof(V) * options.dim,\n                              cudaMemcpyHostToDevice));\n        table->assign(BATCH_SIZE, d_keys_temp, d_scores_temp, stream);\n        table->find_or_insert(BATCH_SIZE, d_keys_temp, d_vectors_temp,\n                              d_scores_temp, stream);\n        CUDA_CHECK(cudaStreamSynchronize(stream));\n      }\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_GE(total_size, expected_table_size);\n      ASSERT_EQ(MAX_CAPACITY, table->capacity());\n\n      size_t dump_counter = table->export_batch(\n          MAX_CAPACITY, 0, d_keys_temp, d_vectors_temp, d_scores_temp, stream);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp, d_keys_temp, MAX_CAPACITY * sizeof(K),\n                            cudaMemcpyDeviceToHost));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp, d_scores_temp,\n                            MAX_CAPACITY * sizeof(S), cudaMemcpyDeviceToHost));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp, d_vectors_temp,\n                            MAX_CAPACITY * sizeof(V) * options.dim,\n                            cudaMemcpyDeviceToHost));\n\n      size_t bigger_score_counter = 0;\n      K max_key = 0;\n      size_t values_error_counter = 0;\n      for (int i = 0; i < dump_counter; i++) {\n     
   ASSERT_EQ(h_keys_temp[i], h_scores_temp[i]);\n        max_key = std::max(max_key, h_keys_temp[i]);\n        if (h_scores_temp[i] >= expected_min_key) bigger_score_counter++;\n        for (int j = 0; j < options.dim; j++) {\n          if (h_vectors_temp[i * options.dim + j] !=\n              static_cast<float>(h_keys_temp[i] * 0.00001)) {\n            values_error_counter++;\n          }\n        }\n      }\n\n      ASSERT_EQ(values_error_counter, 0);\n      float correct_rate = (bigger_score_counter * 1.0) / MAX_CAPACITY;\n      std::cout << std::setprecision(3) << \"[Round \" << r << \"]\"\n                << \"correct_rate=\" << correct_rate << std::endl;\n      ASSERT_GE(max_key, expected_max_key);\n      ASSERT_GE(correct_rate, expected_correct_rate);\n    }\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFree(d_keys_temp));\n  CUDA_CHECK(cudaFree(d_scores_temp));\n  CUDA_CHECK(cudaFree(d_vectors_temp));\n\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\ntemplate <typename K, typename V, typename S, typename Table, size_t dim = 64>\nvoid CheckFindOrInsertValues(Table* table, K* keys, V* values, S* scores,\n                             size_t len, cudaStream_t stream) {\n  std::map<K, test_util::ValueArray<V, dim>> map_before_insert;\n  std::map<K, test_util::ValueArray<V, dim>> map_after_insert;\n  K* h_tmp_keys = nullptr;\n  V* h_tmp_values = nullptr;\n  S* h_tmp_scores = nullptr;\n\n  K* d_tmp_keys = nullptr;\n  V* d_tmp_values = nullptr;\n  S* d_tmp_scores = nullptr;\n\n  size_t table_size_before = table->size(stream);\n  size_t cap = table_size_before + len;\n\n  CUDA_CHECK(cudaMallocAsync(&d_tmp_keys, cap * sizeof(K), stream));\n  CUDA_CHECK(cudaMemsetAsync(d_tmp_keys, 0, cap * sizeof(K), stream));\n  CUDA_CHECK(cudaMallocAsync(&d_tmp_values, cap * dim * sizeof(V), stream));\n  CUDA_CHECK(cudaMemsetAsync(d_tmp_values, 0, cap * dim * sizeof(V), stream));\n  CUDA_CHECK(cudaMallocAsync(&d_tmp_scores, cap * sizeof(S), 
stream));\n  CUDA_CHECK(cudaMemsetAsync(d_tmp_scores, 0, cap * sizeof(S), stream));\n  h_tmp_keys = (K*)malloc(cap * sizeof(K));\n  h_tmp_values = (V*)malloc(cap * dim * sizeof(V));\n  h_tmp_scores = (S*)malloc(cap * sizeof(S));\n\n  size_t table_size_verify0 = table->export_batch(\n      table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);\n  ASSERT_EQ(table_size_before, table_size_verify0);\n\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,\n                             table_size_before * sizeof(K),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,\n                             table_size_before * dim * sizeof(V),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,\n                             table_size_before * sizeof(S),\n                             cudaMemcpyDeviceToHost, stream));\n\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n\n  for (size_t i = 0; i < table_size_verify0; i++) {\n    test_util::ValueArray<V, dim>* vec =\n        reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +\n                                                         i * dim);\n    map_before_insert[h_tmp_keys[i]] = *vec;\n  }\n\n  auto start = std::chrono::steady_clock::now();\n  table->find_or_insert(len, keys, values, nullptr, stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  auto end = std::chrono::steady_clock::now();\n  auto diff = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);\n\n  float dur = diff.count();\n\n  size_t table_size_after = table->size(stream);\n  size_t table_size_verify1 = table->export_batch(\n      table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);\n\n  ASSERT_EQ(table_size_verify1, table_size_after);\n\n  size_t new_cap = table_size_after;\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,\n                             
table_size_after * sizeof(K),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,\n                             table_size_after * dim * sizeof(V),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,\n                             table_size_after * sizeof(S),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  int64_t new_cap_K = (int64_t)new_cap;\n  for (int64_t i = new_cap_K - 1; i >= 0; i--) {\n    test_util::ValueArray<V, dim>* vec =\n        reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +\n                                                         i * dim);\n    map_after_insert[h_tmp_keys[i]] = *vec;\n  }\n\n  size_t value_diff_cnt = 0;\n  for (auto& it : map_after_insert) {\n    test_util::ValueArray<V, dim>& vec = map_after_insert.at(it.first);\n    for (size_t j = 0; j < dim; j++) {\n      if (vec[j] != static_cast<float>(it.first * 0.00001)) {\n        ++value_diff_cnt;\n        break;\n      }\n    }\n  }\n  ASSERT_EQ(value_diff_cnt, 0);\n  std::cout << \"Check find_or_insert behavior got \"\n            << \"value_diff_cnt: \" << value_diff_cnt\n            << \", while table_size_before: \" << table_size_before\n            << \", while table_size_after: \" << table_size_after\n            << \", while len: \" << len << std::endl;\n\n  CUDA_CHECK(cudaFreeAsync(d_tmp_keys, stream));\n  CUDA_CHECK(cudaFreeAsync(d_tmp_values, stream));\n  CUDA_CHECK(cudaFreeAsync(d_tmp_scores, stream));\n  free(h_tmp_keys);\n  free(h_tmp_values);\n  free(h_tmp_scores);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n}\n\nvoid test_find_or_insert_values_check(size_t max_hbm_for_vectors) {\n  const size_t U = 524288;\n  const size_t init_capacity = 1024;\n  const size_t B = 524288 + 13;\n  constexpr size_t dim = 64;\n\n  TableOptions opt;\n\n  opt.max_capacity = 
U;\n  opt.init_capacity = init_capacity;\n  opt.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;\n  opt.dim = 64;\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  std::unique_ptr<Table> table = std::make_unique<Table>();\n  table->init(opt);\n\n  test_util::KVMSBuffer<K, V, S> data_buffer;\n  data_buffer.Reserve(B, dim, stream);\n\n  size_t offset = 0;\n  S score = 0;\n  for (int i = 0; i < 20; i++) {\n    test_util::create_random_keys<K, S, V, dim>(\n        data_buffer.keys_ptr(false), data_buffer.scores_ptr(false),\n        data_buffer.values_ptr(false), (int)B, B * 16);\n    data_buffer.SyncData(true, stream);\n\n    CheckFindOrInsertValues<K, V, S, Table, dim>(\n        table.get(), data_buffer.keys_ptr(), data_buffer.values_ptr(),\n        data_buffer.scores_ptr(), B, stream);\n\n    offset += B;\n    score += 1;\n  }\n}\n\nTEST(AssignScoreTest, test_evict_strategy_lru_basic) {\n  test_evict_strategy_lru_basic(16);\n  test_evict_strategy_lru_basic(0, 34);\n}\nTEST(AssignScoreTest, test_evict_strategy_lfu_basic) {\n  test_evict_strategy_lfu_basic(16);\n  test_evict_strategy_lfu_basic(0, 2);\n}\nTEST(AssignScoreTest, test_evict_strategy_epochlru_basic) {\n  test_evict_strategy_epochlru_basic(16, 51);\n  test_evict_strategy_epochlru_basic(0);\n}\nTEST(AssignScoreTest, test_evict_strategy_epochlfu_basic) {\n  test_evict_strategy_epochlfu_basic(16, 4);\n  test_evict_strategy_epochlfu_basic(0);\n}\nTEST(AssignScoreTest, test_evict_strategy_customized_basic) {\n  test_evict_strategy_customized_basic(16);\n  test_evict_strategy_customized_basic(0, 11);\n}\nTEST(AssignScoreTest, test_evict_strategy_customized_advanced) {\n  test_evict_strategy_customized_advanced(16, 33);\n  test_evict_strategy_customized_advanced(0);\n}\nTEST(AssignScoreTest, test_assign_advanced_on_epochlfu) {\n  test_assign_advanced_on_epochlfu(16);\n}\nTEST(AssignScoreTest, 
test_evict_strategy_customized_correct_rate) {\n  // TODO(rhdong): after blossom CI issue is resolved, remove the skip logic.\n  const bool skip_hmem_check = (nullptr != std::getenv(\"IS_BLOSSOM_CI\"));\n  test_evict_strategy_customized_correct_rate(16, 44);\n  if (!skip_hmem_check) {\n    test_evict_strategy_customized_correct_rate(0);\n  } else {\n    std::cout << \"The HMEM check is skipped in blossom CI!\" << std::endl;\n  }\n}"
  },
  {
    "path": "tests/assign_values_test.cc.cu",
    "content": "/*\n * Copyright (c) 2023, NVIDIA CORPORATION.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n/*\n * test API: assign_values\n */\n\n#include <gtest/gtest.h>\n#include <math.h>\n#include <stdio.h>\n#include <stdlib.h>\n#include <algorithm>\n#include <array>\n#include <iostream>\n#include <thread>\n#include <vector>\n#include \"merlin_hashtable.cuh\"\n#include \"test_util.cuh\"\n\nconstexpr size_t DIM = 16;\nusing K = uint64_t;\nusing V = float;\nusing S = uint64_t;\nusing EvictStrategy = nv::merlin::EvictStrategy;\nusing TableOptions = nv::merlin::HashTableOptions;\n\nvoid test_evict_strategy_lru_basic(size_t max_hbm_for_vectors,\n                                   int key_start = 0) {\n  constexpr uint64_t BUCKET_NUM = 8UL;\n  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;\n  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;\n  constexpr uint64_t TEST_KEY_NUM = 4;\n  constexpr uint64_t TEMP_KEY_NUM =\n      (BASE_KEY_NUM > TEST_KEY_NUM) ? 
BASE_KEY_NUM : TEST_KEY_NUM;\n  constexpr uint64_t TEST_TIMES = 128;\n\n  TableOptions options;\n\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;\n\n  std::vector<K> h_keys_base(BASE_KEY_NUM);\n  std::vector<S> h_scores_base(BASE_KEY_NUM);\n  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_test(TEST_KEY_NUM);\n  std::vector<S> h_scores_test(TEST_KEY_NUM);\n  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_temp(TEMP_KEY_NUM);\n  std::vector<S> h_scores_temp(TEMP_KEY_NUM);\n  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);\n\n  K* d_keys_temp;\n  S* d_scores_temp = nullptr;\n  V* d_vectors_temp;\n\n  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));\n  CUDA_CHECK(\n      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));\n\n  test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),\n      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF);\n\n  test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),\n      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,\n      0xFFFFFFFFFFFFFFFD);\n\n  h_keys_test[2] = h_keys_base[72];\n  h_keys_test[3] = h_keys_base[73];\n\n  for (int i = 0; i < options.dim; i++) {\n    h_vectors_test[2 * options.dim + i] =\n        static_cast<float>(h_keys_base[72] * 0.00002);\n    h_vectors_test[3 * options.dim + i] =\n        static_cast<float>(h_keys_base[73] * 0.00002);\n  }\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  size_t total_size = 
0;\n  size_t dump_counter = 0;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      S start_ts = test_util::host_nano<S>(stream);\n      table->find_or_insert(BASE_KEY_NUM, d_keys_temp, d_vectors_temp, nullptr,\n                            stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      S end_ts = test_util::host_nano<S>(stream);\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      std::vector<S> h_scores_temp_sorted(h_scores_temp);\n      std::sort(h_scores_temp_sorted.begin(), 
h_scores_temp_sorted.end());\n\n      ASSERT_GE(h_scores_temp_sorted[0], start_ts);\n      ASSERT_LE(h_scores_temp_sorted[TEST_KEY_NUM - 1], end_ts);\n      for (int i = 0; i < dump_counter; i++) {\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),\n                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),\n                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),\n                            TEST_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n\n      S start_ts = test_util::host_nano<S>(stream);\n      table->assign_values(TEST_KEY_NUM, d_keys_temp, d_vectors_temp, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            TEMP_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      for (int i = 0; i < TEMP_KEY_NUM; i++) {\n        V expected_v = (h_keys_temp[i] == 
h_keys_test[2] ||\n                        h_keys_temp[i] == h_keys_test[3])\n                           ? static_cast<V>(h_keys_temp[i] * 0.00002)\n                           : static_cast<V>(h_keys_temp[i] * 0.00001);\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j], expected_v);\n        }\n        ASSERT_LE(h_scores_temp[i], start_ts);\n      }\n    }\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFree(d_keys_temp));\n  CUDA_CHECK(cudaFree(d_scores_temp));\n  CUDA_CHECK(cudaFree(d_vectors_temp));\n\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors,\n                                        int key_start = 0) {\n  constexpr uint64_t BUCKET_NUM = 8UL;\n  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;\n  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;\n  constexpr uint64_t TEST_KEY_NUM = 4;\n  constexpr uint64_t TEMP_KEY_NUM =\n      (BASE_KEY_NUM > TEST_KEY_NUM) ? 
BASE_KEY_NUM : TEST_KEY_NUM;\n  constexpr uint64_t TEST_TIMES = 128;\n\n  TableOptions options;\n\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kEpochLfu>;\n\n  std::vector<K> h_keys_base(BASE_KEY_NUM);\n  std::vector<S> h_scores_base(BASE_KEY_NUM);\n  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_test(TEST_KEY_NUM);\n  std::vector<S> h_scores_test(TEST_KEY_NUM);\n  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_temp(TEMP_KEY_NUM);\n  std::vector<S> h_scores_temp(TEMP_KEY_NUM);\n  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);\n\n  K* d_keys_temp;\n  S* d_scores_temp = nullptr;\n  V* d_vectors_temp;\n\n  int freq_range = 1000;\n\n  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));\n  CUDA_CHECK(\n      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));\n\n  test_util::create_keys_in_one_buckets_lfu<K, S, V, DIM>(\n      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),\n      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF,\n      freq_range);\n\n  test_util::create_keys_in_one_buckets_lfu<K, S, V, DIM>(\n      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),\n      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,\n      0xFFFFFFFFFFFFFFFD, freq_range);\n\n  // Simulate overflow of low 32bits.\n  h_scores_base[71] = static_cast<S>(std::numeric_limits<uint32_t>::max() -\n                                     static_cast<uint32_t>(1));\n\n  h_keys_test[1] = h_keys_base[71];\n  h_keys_test[2] = h_keys_base[72];\n  h_keys_test[3] = h_keys_base[73];\n\n  h_scores_test[1] = h_scores_base[71];\n  
h_scores_test[2] = h_keys_base[72] % freq_range;\n  h_scores_test[3] = h_keys_base[73] % freq_range;\n\n  for (int i = 0; i < options.dim; i++) {\n    h_vectors_test[1 * options.dim + i] =\n        static_cast<float>(h_keys_base[71] * 0.00002);\n    h_vectors_test[2 * options.dim + i] =\n        static_cast<float>(h_keys_base[72] * 0.00002);\n    h_vectors_test[3 * options.dim + i] =\n        static_cast<float>(h_keys_base[73] * 0.00002);\n  }\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  size_t total_size = 0;\n  size_t dump_counter = 0;\n  S global_epoch = 1;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      table->set_global_epoch(global_epoch);\n      table->find_or_insert(BASE_KEY_NUM, d_keys_temp, d_vectors_temp,\n                            d_scores_temp, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                     
       BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      for (int i = 0; i < dump_counter; i++) {\n        if (h_keys_temp[i] == h_keys_base[71]) {\n          S expected_score = test_util::make_expected_score_for_epochlfu<S>(\n              global_epoch, h_scores_base[71]);\n          ASSERT_EQ(h_scores_temp[i], expected_score);\n        } else {\n          S expected_score = test_util::make_expected_score_for_epochlfu<S>(\n              global_epoch, (h_keys_temp[i] % freq_range));\n          ASSERT_EQ(h_scores_temp[i], expected_score);\n        }\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),\n                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),\n                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),\n                            TEST_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      table->assign_values(TEST_KEY_NUM, d_keys_temp, d_vectors_temp, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, 
d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            TEMP_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      ASSERT_TRUE(h_keys_temp.end() != std::find(h_keys_temp.begin(),\n                                                 h_keys_temp.end(),\n                                                 h_keys_base[71]));\n\n      for (int i = 0; i < dump_counter; i++) {\n        if (h_keys_temp[i] == h_keys_base[71]) {\n          S expected_score = test_util::make_expected_score_for_epochlfu<S>(\n              global_epoch, h_scores_base[71]);\n          ASSERT_EQ(h_scores_temp[i], expected_score);\n        } else {\n          S expected_score = test_util::make_expected_score_for_epochlfu<S>(\n              global_epoch, (h_keys_temp[i] % freq_range));\n          ASSERT_EQ(h_scores_temp[i], expected_score);\n        }\n        for (int j = 0; j < options.dim; j++) {\n          V expected_v = (h_keys_temp[i] == h_keys_test[1] ||\n                          h_keys_temp[i] == h_keys_test[2] ||\n                          h_keys_temp[i] == h_keys_test[3])\n                             ? 
static_cast<V>(h_keys_temp[i] * 0.00002)\n                             : static_cast<V>(h_keys_temp[i] * 0.00001);\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j], expected_v);\n        }\n      }\n    }\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFree(d_keys_temp));\n  CUDA_CHECK(cudaFree(d_scores_temp));\n  CUDA_CHECK(cudaFree(d_vectors_temp));\n\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\ntemplate <typename K, typename V, typename S, typename Table, size_t dim = 64>\nvoid CheckAssignOnEpochLfu(Table* table,\n                           test_util::KVMSBuffer<K, V, S>* data_buffer,\n                           test_util::KVMSBuffer<K, V, S>* evict_buffer,\n                           test_util::KVMSBuffer<K, V, S>* pre_data_buffer,\n                           size_t len, cudaStream_t stream, TableOptions& opt,\n                           unsigned int global_epoch) {\n  std::map<K, test_util::ValueArray<V, dim>> values_map_before_insert;\n  std::map<K, test_util::ValueArray<V, dim>> values_map_after_insert;\n\n  std::unordered_map<K, S> scores_map_before_insert;\n  std::map<K, S> scores_map_after_insert;\n\n  std::map<K, S> scores_map_current_batch;\n  std::map<K, S> scores_map_current_evict;\n\n  K* keys = data_buffer->keys_ptr();\n  V* values = data_buffer->values_ptr();\n  S* scores = data_buffer->scores_ptr();\n\n  K* evicted_keys = evict_buffer->keys_ptr();\n  V* evicted_values = evict_buffer->values_ptr();\n  S* evicted_scores = evict_buffer->scores_ptr();\n\n  for (size_t i = 0; i < len; i++) {\n    scores_map_current_batch[data_buffer->keys_ptr(false)[i]] =\n        data_buffer->scores_ptr(false)[i];\n  }\n\n  K* h_tmp_keys = nullptr;\n  V* h_tmp_values = nullptr;\n  S* h_tmp_scores = nullptr;\n  bool* h_tmp_founds = nullptr;\n\n  K* d_tmp_keys = nullptr;\n  V* d_tmp_values = nullptr;\n  S* d_tmp_scores = nullptr;\n  bool* d_tmp_founds = nullptr;\n\n  size_t table_size_before = table->size(stream);\n  
size_t cap = table_size_before + len;\n\n  CUDA_CHECK(cudaMallocAsync(&d_tmp_keys, cap * sizeof(K), stream));\n  CUDA_CHECK(cudaMemsetAsync(d_tmp_keys, 0, cap * sizeof(K), stream));\n  CUDA_CHECK(cudaMallocAsync(&d_tmp_values, cap * dim * sizeof(V), stream));\n  CUDA_CHECK(cudaMemsetAsync(d_tmp_values, 0, cap * dim * sizeof(V), stream));\n  CUDA_CHECK(cudaMallocAsync(&d_tmp_scores, cap * sizeof(S), stream));\n  CUDA_CHECK(cudaMemsetAsync(d_tmp_scores, 0, cap * sizeof(S), stream));\n  CUDA_CHECK(cudaMallocAsync(&d_tmp_founds, cap * sizeof(bool), stream));\n  CUDA_CHECK(cudaMemsetAsync(d_tmp_founds, 0, cap * sizeof(bool), stream));\n  h_tmp_keys = (K*)malloc(cap * sizeof(K));\n  h_tmp_values = (V*)malloc(cap * dim * sizeof(V));\n  h_tmp_scores = (S*)malloc(cap * sizeof(S));\n  h_tmp_founds = (bool*)malloc(cap * sizeof(bool));\n\n  size_t table_size_verify0 = table->export_batch(\n      table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);\n  ASSERT_EQ(table_size_before, table_size_verify0);\n\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,\n                             table_size_before * sizeof(K),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,\n                             table_size_before * dim * sizeof(V),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,\n                             table_size_before * sizeof(S),\n                             cudaMemcpyDeviceToHost, stream));\n\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys + table_size_before, keys,\n                             len * sizeof(K), cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values + table_size_before * dim, values,\n                             len * dim * sizeof(V), cudaMemcpyDeviceToHost,\n                             stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores + table_size_before, scores,\n       
                      len * sizeof(S), cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n\n  for (size_t i = 0; i < cap; i++) {\n    test_util::ValueArray<V, dim>* vec =\n        reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +\n                                                         i * dim);\n    values_map_before_insert[h_tmp_keys[i]] = *vec;\n  }\n\n  for (size_t i = 0; i < table_size_before; i++) {\n    scores_map_before_insert[h_tmp_keys[i]] = h_tmp_scores[i];\n  }\n\n  table->assign_values(len, keys, values, stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n\n  {\n    size_t table_size_verify1 = table->export_batch(\n        table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);\n\n    CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,\n                               table_size_before * sizeof(K),\n                               cudaMemcpyDeviceToHost, stream));\n    CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,\n                               table_size_before * dim * sizeof(V),\n                               cudaMemcpyDeviceToHost, stream));\n    CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,\n                               table_size_before * sizeof(S),\n                               cudaMemcpyDeviceToHost, stream));\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(table_size_verify1, table_size_before);\n\n    size_t score_error_cnt = 0;\n\n    for (int64_t i = table_size_before - 1; i >= 0; i--) {\n      test_util::ValueArray<V, dim>* vec =\n          reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +\n                                                           i * dim);\n      values_map_after_insert[h_tmp_keys[i]] = *vec;\n      scores_map_after_insert[h_tmp_keys[i]] = h_tmp_scores[i];\n    }\n\n    for (auto it : scores_map_current_batch) {\n      const K key = it.first;\n      const K score = it.second;\n      S current_score = 
scores_map_after_insert[key];\n      S score_before_insert = 0;\n      if (scores_map_before_insert.find(key) !=\n          scores_map_before_insert.end()) {\n        score_before_insert = scores_map_before_insert[key];\n        bool valid = ((current_score >> 32) < global_epoch) &&\n                     ((current_score & 0xFFFFFFFF) ==\n                      (0xFFFFFFFF & score_before_insert));\n\n        if (!valid) {\n          score_error_cnt++;\n        }\n      }\n    }\n    std::cout << \"Check assign behavior got \"\n              << \", score_error_cnt: \" << score_error_cnt\n              << \", while len: \" << len << std::endl;\n    ASSERT_EQ(score_error_cnt, 0);\n  }\n\n  for (int64_t i = 0; i < table_size_before; i++) {\n    values_map_before_insert[h_tmp_keys[i]] =\n        values_map_after_insert[h_tmp_keys[i]];\n    scores_map_before_insert[h_tmp_keys[i]] =\n        scores_map_after_insert[h_tmp_keys[i]];\n  }\n  values_map_after_insert.clear();\n  scores_map_after_insert.clear();\n\n  table->set_global_epoch(global_epoch);\n  auto start = std::chrono::steady_clock::now();\n  size_t filtered_len = table->insert_and_evict(\n      len, keys, values,\n      (Table::evict_strategy == EvictStrategy::kLru ||\n       Table::evict_strategy == EvictStrategy::kEpochLru)\n          ? 
nullptr\n          : scores,\n      evicted_keys, evicted_values, evicted_scores, stream);\n  evict_buffer->SyncData(false, stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  auto end = std::chrono::steady_clock::now();\n  auto diff = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);\n\n  for (size_t i = 0; i < filtered_len; i++) {\n    scores_map_current_evict[evict_buffer->keys_ptr(false)[i]] =\n        evict_buffer->scores_ptr(false)[i];\n  }\n\n  float dur = diff.count();\n\n  size_t table_size_after = table->size(stream);\n  size_t table_size_verify1 = table->export_batch(\n      table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);\n\n  ASSERT_EQ(table_size_verify1, table_size_after);\n\n  size_t new_cap = table_size_after + filtered_len;\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,\n                             table_size_after * sizeof(K),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,\n                             table_size_after * dim * sizeof(V),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,\n                             table_size_after * sizeof(S),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys + table_size_after, evicted_keys,\n                             filtered_len * sizeof(K), cudaMemcpyDeviceToHost,\n                             stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values + table_size_after * dim,\n                             evicted_values, filtered_len * dim * sizeof(V),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores + table_size_after, evicted_scores,\n                             filtered_len * sizeof(S), cudaMemcpyDeviceToHost,\n                             stream));\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n\n  
size_t key_miss_cnt = 0;\n  size_t value_diff_cnt = 0;\n  size_t score_error_cnt1 = 0;\n  size_t score_error_cnt2 = 0;\n\n  for (int64_t i = new_cap - 1; i >= 0; i--) {\n    test_util::ValueArray<V, dim>* vec =\n        reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +\n                                                         i * dim);\n    values_map_after_insert[h_tmp_keys[i]] = *vec;\n    scores_map_after_insert[h_tmp_keys[i]] = h_tmp_scores[i];\n    if (i >= (new_cap - filtered_len)) {\n      bool valid = ((h_tmp_scores[i] >> 32) < (global_epoch - 2));\n      if (!valid) {\n        score_error_cnt1++;\n      }\n    }\n  }\n\n  for (auto it : scores_map_current_batch) {\n    const K key = it.first;\n    const K score = it.second;\n    S current_score = scores_map_after_insert[key];\n    S score_before_insert = 0;\n    if (values_map_after_insert.find(key) != values_map_after_insert.end() &&\n        scores_map_current_evict.find(key) == scores_map_current_evict.end()) {\n      score_before_insert = scores_map_before_insert[key];\n    }\n    bool valid = ((current_score >> 32) == global_epoch) &&\n                 ((current_score & 0xFFFFFFFF) ==\n                  ((0xFFFFFFFF & score_before_insert) + (0xFFFFFFFF & score)));\n\n    if (!valid) {\n      score_error_cnt2++;\n    }\n  }\n\n  for (auto& it : values_map_before_insert) {\n    if (values_map_after_insert.find(it.first) ==\n        values_map_after_insert.end()) {\n      ++key_miss_cnt;\n      continue;\n    }\n    test_util::ValueArray<V, dim>& vec0 = it.second;\n    test_util::ValueArray<V, dim>& vec1 = values_map_after_insert.at(it.first);\n    for (size_t j = 0; j < dim; j++) {\n      if (vec0[j] != vec1[j]) {\n        ++value_diff_cnt;\n        break;\n      }\n    }\n  }\n\n  std::cout << \"Check insert_and_evict behavior got \"\n            << \"key_miss_cnt: \" << key_miss_cnt\n            << \", value_diff_cnt: \" << value_diff_cnt\n            << \", score_error_cnt1: \" << 
score_error_cnt1\n            << \", score_error_cnt2: \" << score_error_cnt2\n            << \", while table_size_before: \" << table_size_before\n            << \", while table_size_after: \" << table_size_after\n            << \", while len: \" << len << std::endl;\n\n  ASSERT_EQ(key_miss_cnt, 0);\n  ASSERT_EQ(value_diff_cnt, 0);\n  ASSERT_EQ(score_error_cnt1, 0);\n  ASSERT_EQ(score_error_cnt2, 0);\n\n  CUDA_CHECK(cudaFreeAsync(d_tmp_keys, stream));\n  CUDA_CHECK(cudaFreeAsync(d_tmp_values, stream));\n  CUDA_CHECK(cudaFreeAsync(d_tmp_scores, stream));\n  CUDA_CHECK(cudaFreeAsync(d_tmp_founds, stream));\n  free(h_tmp_keys);\n  free(h_tmp_values);\n  free(h_tmp_scores);\n  free(h_tmp_founds);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n}\n\nvoid test_assign_advanced_on_epochlfu(size_t max_hbm_for_vectors) {\n  const size_t U = 1024 * 1024;\n  const size_t B = 100000;\n  constexpr size_t dim = 16;\n\n  TableOptions opt;\n\n  opt.max_capacity = U;\n  opt.init_capacity = U;\n  opt.max_hbm_for_vectors = U * dim * sizeof(V);\n  opt.max_bucket_size = 128;\n  opt.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kEpochLfu>;\n  opt.dim = dim;\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  std::unique_ptr<Table> table = std::make_unique<Table>();\n  table->init(opt);\n\n  test_util::KVMSBuffer<K, V, S> evict_buffer;\n  evict_buffer.Reserve(B, dim, stream);\n  evict_buffer.ToZeros(stream);\n\n  test_util::KVMSBuffer<K, V, S> data_buffer;\n  test_util::KVMSBuffer<K, V, S> pre_data_buffer;\n  data_buffer.Reserve(B, dim, stream);\n  pre_data_buffer.Reserve(B, dim, stream);\n\n  size_t offset = 0;\n  int freq_range = 100;\n  float repeat_rate = 0.9;\n  for (unsigned int global_epoch = 1; global_epoch <= 20; global_epoch++) {\n    repeat_rate = global_epoch <= 1 ? 
0.0 : 0.1;\n    if (global_epoch <= 1) {\n      test_util::create_random_keys_advanced<K, S, V>(\n          dim, data_buffer.keys_ptr(false), data_buffer.scores_ptr(false),\n          data_buffer.values_ptr(false), (int)B, B * 32, freq_range);\n    } else {\n      test_util::create_random_keys_advanced<K, S, V>(\n          dim, data_buffer.keys_ptr(false), pre_data_buffer.keys_ptr(false),\n          data_buffer.scores_ptr(false), data_buffer.values_ptr(false), (int)B,\n          B * 32, freq_range, repeat_rate);\n    }\n    data_buffer.SyncData(true, stream);\n    if (global_epoch <= 1) {\n      pre_data_buffer.CopyFrom(data_buffer, stream);\n    }\n\n    CheckAssignOnEpochLfu<K, V, S, Table, dim>(table.get(), &data_buffer,\n                                               &evict_buffer, &pre_data_buffer,\n                                               B, stream, opt, global_epoch);\n\n    pre_data_buffer.CopyFrom(data_buffer, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    offset += B;\n  }\n}\n\nTEST(AssignValuesTest, test_evict_strategy_lru_basic) {\n  test_evict_strategy_lru_basic(16, 21);\n  test_evict_strategy_lru_basic(0);\n}\nTEST(AssignValuesTest, test_evict_strategy_epochlfu_basic) {\n  test_evict_strategy_epochlfu_basic(16);\n  test_evict_strategy_epochlfu_basic(0, 8);\n}\nTEST(AssignValuesTest, test_assign_advanced_on_epochlfu) {\n  test_assign_advanced_on_epochlfu(16);\n}"
  },
  {
    "path": "tests/dual_bucket_test.cc.cu",
    "content": "/*\n * Copyright (c) 2024, NVIDIA CORPORATION.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n#include <gtest/gtest.h>\n#include <math.h>\n#include <stdio.h>\n#include <stdlib.h>\n#include <algorithm>\n#include <iostream>\n#include <numeric>\n#include <random>\n#include <unordered_set>\n#include <vector>\n#include \"merlin_hashtable.cuh\"\n#include \"test_util.cuh\"\n\nconstexpr size_t DIM = 16;\nusing K = uint64_t;\nusing V = float;\nusing S = uint64_t;\nusing TableOptions = nv::merlin::HashTableOptions;\nusing TableMode = nv::merlin::TableMode;\nusing EvictStrategy = nv::merlin::EvictStrategy;\n\n/*\n * Helper: create a MEMORY_MODE table with fixed capacity.\n */\ntemplate <typename Table>\nvoid create_memory_mode_table(Table& table, size_t capacity, size_t dim = DIM) {\n  TableOptions options;\n  options.init_capacity = capacity;\n  options.max_capacity = capacity;\n  options.max_hbm_for_vectors = 0;\n  options.dim = dim;\n  options.max_bucket_size = 128;\n  options.table_mode = TableMode::kMemory;\n  table.init(options);\n}\n\n/*\n * Helper: create a THROUGHPUT_MODE table with fixed capacity.\n */\ntemplate <typename Table>\nvoid create_throughput_mode_table(Table& table, size_t capacity,\n                                  size_t dim = DIM) {\n  TableOptions options;\n  options.init_capacity = capacity;\n  options.max_capacity = capacity;\n  options.max_hbm_for_vectors = 0;\n  options.dim = dim;\n  
options.max_bucket_size = 128;\n  options.table_mode = TableMode::kThroughput;\n  table.init(options);\n}\n\n// ==============================\n// TestGroup 1: Basic Correctness\n// ==============================\n\n// T1.1: MEMORY_MODE insert_or_assign + find basic functionality.\nTEST(DualBucketTest, BasicInsertAndFind) {\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  constexpr size_t CAPACITY = 128 * 1024;  // 131072 slots (1024 buckets)\n  constexpr size_t N = static_cast<size_t>(CAPACITY * 0.5);\n\n  Table table;\n  create_memory_mode_table(table, CAPACITY);\n\n  // Allocate host data.\n  std::vector<K> h_keys(N);\n  std::vector<V> h_values(N * DIM);\n  std::vector<S> h_scores(N);\n\n  std::iota(h_keys.begin(), h_keys.end(), 1);\n  for (size_t i = 0; i < N; i++) {\n    h_scores[i] = i + 1;\n    for (size_t j = 0; j < DIM; j++) {\n      h_values[i * DIM + j] = static_cast<V>(h_keys[i] * 0.00001f);\n    }\n  }\n\n  // Allocate device data.\n  K* d_keys;\n  V* d_values;\n  S* d_scores;\n  bool* d_founds;\n  V* d_found_values;\n\n  CUDA_CHECK(cudaMalloc(&d_keys, N * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_values, N * DIM * sizeof(V)));\n  CUDA_CHECK(cudaMalloc(&d_scores, N * sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_founds, N * sizeof(bool)));\n  CUDA_CHECK(cudaMalloc(&d_found_values, N * DIM * sizeof(V)));\n\n  CUDA_CHECK(\n      cudaMemcpy(d_keys, h_keys.data(), N * sizeof(K), cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_values, h_values.data(), N * DIM * sizeof(V),\n                        cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_scores, h_scores.data(), N * sizeof(S),\n                        cudaMemcpyHostToDevice));\n\n  // Insert.\n  table.insert_or_assign(N, d_keys, d_values, d_scores, /*stream=*/0,\n                         /*unique_key=*/true);\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  // Verify size.\n  size_t table_size = table.size(/*stream=*/0);\n  CUDA_CHECK(cudaDeviceSynchronize());\n  
EXPECT_EQ(table_size, N);\n\n  // Find.\n  table.find(N, d_keys, d_found_values, d_founds, /*scores=*/nullptr,\n             /*stream=*/0);\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  // Check all found.\n  bool* h_founds = new bool[N];\n  CUDA_CHECK(\n      cudaMemcpy(h_founds, d_founds, N * sizeof(bool), cudaMemcpyDeviceToHost));\n  for (size_t i = 0; i < N; i++) {\n    EXPECT_TRUE(h_founds[i]) << \"Key \" << h_keys[i] << \" not found\";\n  }\n\n  // Check values correct.\n  std::vector<V> h_found_values(N * DIM);\n  CUDA_CHECK(cudaMemcpy(h_found_values.data(), d_found_values,\n                        N * DIM * sizeof(V), cudaMemcpyDeviceToHost));\n  for (size_t i = 0; i < N; i++) {\n    for (size_t j = 0; j < DIM; j++) {\n      EXPECT_FLOAT_EQ(h_found_values[i * DIM + j],\n                      static_cast<V>(h_keys[i] * 0.00001f))\n          << \"Value mismatch for key \" << h_keys[i] << \" dim \" << j;\n    }\n  }\n\n  delete[] h_founds;\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_values));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_founds));\n  CUDA_CHECK(cudaFree(d_found_values));\n}\n\n// T1.2: MEMORY_MODE assign (update) - key already exists.\nTEST(DualBucketTest, UpdateExistingKey) {\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  constexpr size_t CAPACITY = 128 * 1024;\n  constexpr size_t N = 1024;\n\n  Table table;\n  create_memory_mode_table(table, CAPACITY);\n\n  std::vector<K> h_keys(N);\n  std::vector<V> h_values_v1(N * DIM);\n  std::vector<V> h_values_v2(N * DIM);\n  std::vector<S> h_scores(N);\n\n  std::iota(h_keys.begin(), h_keys.end(), 1);\n  for (size_t i = 0; i < N; i++) {\n    h_scores[i] = i + 1;\n    for (size_t j = 0; j < DIM; j++) {\n      h_values_v1[i * DIM + j] = 1.0f;\n      h_values_v2[i * DIM + j] = 2.0f;\n    }\n  }\n\n  K* d_keys;\n  V* d_values;\n  S* d_scores;\n  bool* d_founds;\n  V* d_found_values;\n\n  CUDA_CHECK(cudaMalloc(&d_keys, N * sizeof(K)));\n  
CUDA_CHECK(cudaMalloc(&d_values, N * DIM * sizeof(V)));\n  CUDA_CHECK(cudaMalloc(&d_scores, N * sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_founds, N * sizeof(bool)));\n  CUDA_CHECK(cudaMalloc(&d_found_values, N * DIM * sizeof(V)));\n\n  CUDA_CHECK(\n      cudaMemcpy(d_keys, h_keys.data(), N * sizeof(K), cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_scores, h_scores.data(), N * sizeof(S),\n                        cudaMemcpyHostToDevice));\n\n  // Insert V1.\n  CUDA_CHECK(cudaMemcpy(d_values, h_values_v1.data(), N * DIM * sizeof(V),\n                        cudaMemcpyHostToDevice));\n  table.insert_or_assign(N, d_keys, d_values, d_scores, 0, true);\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  // Update with V2.\n  CUDA_CHECK(cudaMemcpy(d_values, h_values_v2.data(), N * DIM * sizeof(V),\n                        cudaMemcpyHostToDevice));\n  table.insert_or_assign(N, d_keys, d_values, d_scores, 0, true);\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  // Size should still be N (no duplicates).\n  size_t table_size = table.size(0);\n  CUDA_CHECK(cudaDeviceSynchronize());\n  EXPECT_EQ(table_size, N);\n\n  // Find and verify V2 values.\n  table.find(N, d_keys, d_found_values, d_founds, nullptr, 0);\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  std::vector<V> h_found_values(N * DIM);\n  CUDA_CHECK(cudaMemcpy(h_found_values.data(), d_found_values,\n                        N * DIM * sizeof(V), cudaMemcpyDeviceToHost));\n  for (size_t i = 0; i < N; i++) {\n    for (size_t j = 0; j < DIM; j++) {\n      EXPECT_FLOAT_EQ(h_found_values[i * DIM + j], 2.0f)\n          << \"Expected V2 value for key \" << h_keys[i] << \" dim \" << j;\n    }\n  }\n\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_values));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_founds));\n  CUDA_CHECK(cudaFree(d_found_values));\n}\n\n// T1.3: MEMORY_MODE score-eviction correctness.\nTEST(DualBucketTest, ScoreEviction) {\n  using Table = nv::merlin::HashTable<K, V, S, 
EvictStrategy::kCustomized>;\n\n  // Small capacity to force eviction quickly.\n  constexpr size_t CAPACITY = 128 * 128;  // 128 buckets * 128 slots = 16384\n  constexpr size_t N_FILL = CAPACITY;     // Fill completely\n  constexpr size_t N_NEW = 1024;          // Insert high-score keys\n\n  Table table;\n  create_memory_mode_table(table, CAPACITY);\n\n  // Phase 1: Fill table with low-score keys.\n  std::vector<K> h_keys_fill(N_FILL);\n  std::vector<V> h_values_fill(N_FILL * DIM, 1.0f);\n  std::vector<S> h_scores_fill(N_FILL);\n\n  std::iota(h_keys_fill.begin(), h_keys_fill.end(), 1);\n  for (size_t i = 0; i < N_FILL; i++) {\n    h_scores_fill[i] = i + 1;  // Low scores: 1..N_FILL\n  }\n\n  K* d_keys;\n  V* d_values;\n  S* d_scores;\n  CUDA_CHECK(cudaMalloc(&d_keys, N_FILL * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_values, N_FILL * DIM * sizeof(V)));\n  CUDA_CHECK(cudaMalloc(&d_scores, N_FILL * sizeof(S)));\n\n  CUDA_CHECK(cudaMemcpy(d_keys, h_keys_fill.data(), N_FILL * sizeof(K),\n                        cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_values, h_values_fill.data(),\n                        N_FILL * DIM * sizeof(V), cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_scores, h_scores_fill.data(), N_FILL * sizeof(S),\n                        cudaMemcpyHostToDevice));\n\n  table.insert_or_assign(N_FILL, d_keys, d_values, d_scores, 0, true);\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  // Phase 2: Insert high-score keys (should evict low-score keys).\n  std::vector<K> h_keys_new(N_NEW);\n  std::vector<V> h_values_new(N_NEW * DIM, 2.0f);\n  std::vector<S> h_scores_new(N_NEW);\n\n  for (size_t i = 0; i < N_NEW; i++) {\n    h_keys_new[i] = N_FILL + 1 + i;       // New keys\n    h_scores_new[i] = N_FILL + 1000 + i;  // High scores\n  }\n\n  CUDA_CHECK(cudaMemcpy(d_keys, h_keys_new.data(), N_NEW * sizeof(K),\n                        cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_values, h_values_new.data(), N_NEW * DIM * sizeof(V),\n            
            cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_scores, h_scores_new.data(), N_NEW * sizeof(S),\n                        cudaMemcpyHostToDevice));\n\n  table.insert_or_assign(N_NEW, d_keys, d_values, d_scores, 0, true);\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  // Phase 3: Verify high-score keys are present.\n  bool* d_founds;\n  V* d_found_values;\n  CUDA_CHECK(cudaMalloc(&d_founds, N_NEW * sizeof(bool)));\n  CUDA_CHECK(cudaMalloc(&d_found_values, N_NEW * DIM * sizeof(V)));\n\n  CUDA_CHECK(cudaMemcpy(d_keys, h_keys_new.data(), N_NEW * sizeof(K),\n                        cudaMemcpyHostToDevice));\n  table.find(N_NEW, d_keys, d_found_values, d_founds, nullptr, 0);\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  bool* h_founds = new bool[N_NEW];\n  CUDA_CHECK(cudaMemcpy(h_founds, d_founds, N_NEW * sizeof(bool),\n                        cudaMemcpyDeviceToHost));\n\n  int found_count = 0;\n  for (size_t i = 0; i < N_NEW; i++) {\n    if (h_founds[i]) found_count++;\n  }\n  std::cout << \"[ScoreEviction] High-score keys accuracy: \" << found_count\n            << \"/\" << N_NEW << \" (\" << (100.0 * found_count / N_NEW) << \"%)\"\n            << std::endl;\n  // Most high-score keys should be found.  
Require >= 80%.\n  EXPECT_GT(found_count, static_cast<int>(N_NEW * 0.8))\n      << \"Expected >= 80% of high-score keys to survive eviction\";\n\n  delete[] h_founds;\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_values));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_founds));\n  CUDA_CHECK(cudaFree(d_found_values));\n}\n\n// T1.4: THROUGHPUT_MODE regression test (not affected by dual-bucket changes).\nTEST(DualBucketTest, ThroughputModeRegression) {\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  constexpr size_t CAPACITY = 128 * 1024;\n  constexpr size_t N = 4096;\n\n  Table table;\n  create_throughput_mode_table(table, CAPACITY);\n\n  std::vector<K> h_keys(N);\n  std::vector<V> h_values(N * DIM);\n  std::vector<S> h_scores(N);\n\n  std::iota(h_keys.begin(), h_keys.end(), 1);\n  for (size_t i = 0; i < N; i++) {\n    h_scores[i] = i + 1;\n    for (size_t j = 0; j < DIM; j++) {\n      h_values[i * DIM + j] = static_cast<V>(h_keys[i] * 0.001f);\n    }\n  }\n\n  K* d_keys;\n  V* d_values;\n  S* d_scores;\n  bool* d_founds;\n  V* d_found_values;\n\n  CUDA_CHECK(cudaMalloc(&d_keys, N * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_values, N * DIM * sizeof(V)));\n  CUDA_CHECK(cudaMalloc(&d_scores, N * sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_founds, N * sizeof(bool)));\n  CUDA_CHECK(cudaMalloc(&d_found_values, N * DIM * sizeof(V)));\n\n  CUDA_CHECK(\n      cudaMemcpy(d_keys, h_keys.data(), N * sizeof(K), cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_values, h_values.data(), N * DIM * sizeof(V),\n                        cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_scores, h_scores.data(), N * sizeof(S),\n                        cudaMemcpyHostToDevice));\n\n  table.insert_or_assign(N, d_keys, d_values, d_scores, 0, true);\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  table.find(N, d_keys, d_found_values, d_founds, nullptr, 0);\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  bool* h_founds = new bool[N];\n  
CUDA_CHECK(\n      cudaMemcpy(h_founds, d_founds, N * sizeof(bool), cudaMemcpyDeviceToHost));\n  for (size_t i = 0; i < N; i++) {\n    EXPECT_TRUE(h_founds[i])\n        << \"THROUGHPUT_MODE: Key \" << h_keys[i] << \" not found\";\n  }\n\n  delete[] h_founds;\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_values));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_founds));\n  CUDA_CHECK(cudaFree(d_found_values));\n}\n\n// ==========================================\n// TestGroup 2: Dual-bucket Feature Verify\n// ==========================================\n\n// T2.2: First eviction load factor comparison.\nTEST(DualBucketTest, FirstEvictionLoadFactor) {\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  constexpr size_t CAPACITY = 10 * 1024 * 1024;  // ~10M slots\n\n  // Run for MEMORY_MODE.\n  {\n    Table table;\n    create_memory_mode_table(table, CAPACITY);\n\n    constexpr size_t BATCH = 128;\n    std::vector<K> h_keys(BATCH);\n    std::vector<V> h_values(BATCH * DIM, 1.0f);\n    std::vector<S> h_scores(BATCH);\n\n    K* d_keys;\n    V* d_values;\n    S* d_scores;\n    CUDA_CHECK(cudaMalloc(&d_keys, BATCH * sizeof(K)));\n    CUDA_CHECK(cudaMalloc(&d_values, BATCH * DIM * sizeof(V)));\n    CUDA_CHECK(cudaMalloc(&d_scores, BATCH * sizeof(S)));\n\n    K next_key = 1;\n    size_t total_inserted = 0;\n    float first_eviction_lf = 0.0f;\n\n    // Insert in batches until table is nearly full.\n    while (total_inserted < CAPACITY) {\n      for (size_t i = 0; i < BATCH; i++) {\n        h_keys[i] = next_key++;\n        h_scores[i] = h_keys[i];  // Score = key value (ascending)\n      }\n      CUDA_CHECK(cudaMemcpy(d_keys, h_keys.data(), BATCH * sizeof(K),\n                            cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_values, h_values.data(), BATCH * DIM * sizeof(V),\n                            cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores, h_scores.data(), BATCH * sizeof(S),\n  
                          cudaMemcpyHostToDevice));\n\n      table.insert_or_assign(BATCH, d_keys, d_values, d_scores, 0, true);\n      CUDA_CHECK(cudaDeviceSynchronize());\n      total_inserted += BATCH;\n\n      size_t table_size = table.size(0);\n      CUDA_CHECK(cudaDeviceSynchronize());\n\n      // If table_size < total_inserted, eviction occurred.\n      if (table_size < total_inserted && first_eviction_lf == 0.0f) {\n        first_eviction_lf =\n            static_cast<float>(table_size) / static_cast<float>(CAPACITY);\n        break;\n      }\n    }\n\n    std::cout << \"[MEMORY_MODE] First eviction LF: \" << first_eviction_lf\n              << \" (total_inserted=\" << total_inserted << \")\" << std::endl;\n\n    // Dual-bucket two-choice hashing should achieve very high LF before first\n    // eviction.  Empirically measured ~0.982 at 10M scale on A6000.\n    EXPECT_GT(first_eviction_lf, 0.980f)\n        << \"Dual-bucket should delay eviction beyond 98.0% LF\";\n\n    CUDA_CHECK(cudaFree(d_keys));\n    CUDA_CHECK(cudaFree(d_values));\n    CUDA_CHECK(cudaFree(d_scores));\n  }\n}\n\n// ===================================\n// TestGroup 3: API Guard Tests\n// ===================================\n\nTEST(DualBucketTest, EraseGuard) {\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n  Table table;\n  create_memory_mode_table(table, 128 * 128);\n\n  K* d_keys;\n  CUDA_CHECK(cudaMalloc(&d_keys, sizeof(K)));\n  K h_key = 1;\n  CUDA_CHECK(cudaMemcpy(d_keys, &h_key, sizeof(K), cudaMemcpyHostToDevice));\n\n  EXPECT_THROW(table.erase(1, d_keys, 0), std::runtime_error);\n\n  CUDA_CHECK(cudaFree(d_keys));\n}\n\nTEST(DualBucketTest, ContainsGuard) {\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n  Table table;\n  create_memory_mode_table(table, 128 * 128);\n\n  K* d_keys;\n  bool* d_founds;\n  CUDA_CHECK(cudaMalloc(&d_keys, sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_founds, sizeof(bool)));\n  K h_key = 1;\n  
CUDA_CHECK(cudaMemcpy(d_keys, &h_key, sizeof(K), cudaMemcpyHostToDevice));\n\n  EXPECT_THROW(table.contains(1, d_keys, d_founds, 0), std::runtime_error);\n\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_founds));\n}\n\nTEST(DualBucketTest, ReserveGuard) {\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n  Table table;\n  create_memory_mode_table(table, 128 * 128);\n\n  EXPECT_THROW(table.reserve(128 * 256, 0), std::runtime_error);\n}\n\n// ===================================\n// TestGroup 4: Boundary Conditions\n// ===================================\n\n// T4.1: Empty table find.\nTEST(DualBucketTest, EmptyTableFind) {\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  Table table;\n  create_memory_mode_table(table, 128 * 128);\n\n  constexpr size_t N = 64;\n  K* d_keys;\n  V* d_values;\n  bool* d_founds;\n  CUDA_CHECK(cudaMalloc(&d_keys, N * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_values, N * DIM * sizeof(V)));\n  CUDA_CHECK(cudaMalloc(&d_founds, N * sizeof(bool)));\n\n  std::vector<K> h_keys(N);\n  std::iota(h_keys.begin(), h_keys.end(), 1);\n  CUDA_CHECK(\n      cudaMemcpy(d_keys, h_keys.data(), N * sizeof(K), cudaMemcpyHostToDevice));\n\n  table.find(N, d_keys, d_values, d_founds, nullptr, 0);\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  bool* h_founds = new bool[N];\n  CUDA_CHECK(\n      cudaMemcpy(h_founds, d_founds, N * sizeof(bool), cudaMemcpyDeviceToHost));\n  for (size_t i = 0; i < N; i++) {\n    EXPECT_FALSE(h_founds[i])\n        << \"Empty table should not find key \" << h_keys[i];\n  }\n\n  delete[] h_founds;\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_values));\n  CUDA_CHECK(cudaFree(d_founds));\n}\n\n// T4.4: Different dim values.\nTEST(DualBucketTest, DimVariation) {\n  using Table1 = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  // Test dim=1 and dim=64 (exercises different VecV specializations).\n  // Note: dim > 224 exceeds the dual-bucket 
lookup kernel's fixed shared-memory\n  // buffer (896 bytes).  init() now rejects dim > 224 for kMemory mode.\n  for (size_t test_dim : {1, 64}) {\n    Table1 table;\n    constexpr size_t CAPACITY = 128 * 128;\n    constexpr size_t N = 256;\n\n    TableOptions options;\n    options.init_capacity = CAPACITY;\n    options.max_capacity = CAPACITY;\n    options.max_hbm_for_vectors = 0;\n    options.dim = test_dim;\n    options.max_bucket_size = 128;\n    options.table_mode = TableMode::kMemory;\n    table.init(options);\n\n    std::vector<K> h_keys(N);\n    std::vector<V> h_values(N * test_dim);\n    std::vector<S> h_scores(N);\n\n    std::iota(h_keys.begin(), h_keys.end(), 1);\n    for (size_t i = 0; i < N; i++) {\n      h_scores[i] = i + 1;\n      for (size_t j = 0; j < test_dim; j++) {\n        h_values[i * test_dim + j] = static_cast<V>(i);\n      }\n    }\n\n    K* d_keys;\n    V* d_values;\n    S* d_scores;\n    bool* d_founds;\n    V* d_found_values;\n    CUDA_CHECK(cudaMalloc(&d_keys, N * sizeof(K)));\n    CUDA_CHECK(cudaMalloc(&d_values, N * test_dim * sizeof(V)));\n    CUDA_CHECK(cudaMalloc(&d_scores, N * sizeof(S)));\n    CUDA_CHECK(cudaMalloc(&d_founds, N * sizeof(bool)));\n    CUDA_CHECK(cudaMalloc(&d_found_values, N * test_dim * sizeof(V)));\n\n    CUDA_CHECK(cudaMemcpy(d_keys, h_keys.data(), N * sizeof(K),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_values, h_values.data(), N * test_dim * sizeof(V),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_scores, h_scores.data(), N * sizeof(S),\n                          cudaMemcpyHostToDevice));\n\n    table.insert_or_assign(N, d_keys, d_values, d_scores, 0, true);\n    CUDA_CHECK(cudaDeviceSynchronize());\n\n    table.find(N, d_keys, d_found_values, d_founds, nullptr, 0);\n    CUDA_CHECK(cudaDeviceSynchronize());\n\n    bool* h_founds = new bool[N];\n    CUDA_CHECK(cudaMemcpy(h_founds, d_founds, N * sizeof(bool),\n                          
cudaMemcpyDeviceToHost));\n    std::vector<V> h_found_values(N * test_dim);\n    CUDA_CHECK(cudaMemcpy(h_found_values.data(), d_found_values,\n                          N * test_dim * sizeof(V), cudaMemcpyDeviceToHost));\n    for (size_t i = 0; i < N; i++) {\n      EXPECT_TRUE(h_founds[i])\n          << \"dim=\" << test_dim << \": Key \" << h_keys[i] << \" not found\";\n      if (h_founds[i]) {\n        for (size_t j = 0; j < test_dim; j++) {\n          EXPECT_FLOAT_EQ(h_found_values[i * test_dim + j], static_cast<V>(i))\n              << \"dim=\" << test_dim << \": Value mismatch key \" << h_keys[i]\n              << \" dim \" << j;\n        }\n      }\n    }\n\n    delete[] h_founds;\n    CUDA_CHECK(cudaFree(d_keys));\n    CUDA_CHECK(cudaFree(d_values));\n    CUDA_CHECK(cudaFree(d_scores));\n    CUDA_CHECK(cudaFree(d_founds));\n    CUDA_CHECK(cudaFree(d_found_values));\n  }\n}\n\n// ===================================\n// TestGroup 5: Init Validation\n// ===================================\n\nTEST(DualBucketTest, InitCapacityMismatchReject) {\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n  Table table;\n\n  TableOptions options;\n  options.init_capacity = 128 * 128;\n  options.max_capacity = 128 * 256;  // Different from init_capacity!\n  options.max_hbm_for_vectors = 0;\n  options.dim = DIM;\n  options.max_bucket_size = 128;\n  options.table_mode = TableMode::kMemory;\n\n  EXPECT_THROW(table.init(options), std::runtime_error);\n}\n\n// ===================================\n// TestGroup 2 additions\n// ===================================\n\n// T2.3: b1 == b2 degeneration.\n// When a key's two bucket indices collide, the kernel must degenerate to\n// single-bucket behaviour without data corruption or deadlock.\nTEST(DualBucketTest, B1EqualsB2Degeneration) {\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  // Use a small number of buckets so that collisions of lo%N == hi%N are\n  // reasonably 
frequent.  With 4 buckets the probability for each key is ~25%.\n  constexpr size_t NUM_BUCKETS = 4;\n  constexpr size_t CAPACITY = NUM_BUCKETS * 128;  // 512 slots\n  constexpr size_t N = 256;\n\n  Table table;\n  create_memory_mode_table(table, CAPACITY);\n\n  std::vector<K> h_keys(N);\n  std::vector<V> h_values(N * DIM);\n  std::vector<S> h_scores(N);\n\n  std::iota(h_keys.begin(), h_keys.end(), 1);\n  for (size_t i = 0; i < N; i++) {\n    h_scores[i] = i + 1;\n    for (size_t j = 0; j < DIM; j++)\n      h_values[i * DIM + j] = static_cast<V>(h_keys[i]);\n  }\n\n  K* d_keys;\n  V* d_values;\n  S* d_scores;\n  bool* d_founds;\n  V* d_found_values;\n  CUDA_CHECK(cudaMalloc(&d_keys, N * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_values, N * DIM * sizeof(V)));\n  CUDA_CHECK(cudaMalloc(&d_scores, N * sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_founds, N * sizeof(bool)));\n  CUDA_CHECK(cudaMalloc(&d_found_values, N * DIM * sizeof(V)));\n\n  CUDA_CHECK(\n      cudaMemcpy(d_keys, h_keys.data(), N * sizeof(K), cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_values, h_values.data(), N * DIM * sizeof(V),\n                        cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_scores, h_scores.data(), N * sizeof(S),\n                        cudaMemcpyHostToDevice));\n\n  table.insert_or_assign(N, d_keys, d_values, d_scores, 0, true);\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  // All N keys should be found, regardless of b1==b2 collisions.\n  table.find(N, d_keys, d_found_values, d_founds, nullptr, 0);\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  bool* h_founds = new bool[N];\n  CUDA_CHECK(\n      cudaMemcpy(h_founds, d_founds, N * sizeof(bool), cudaMemcpyDeviceToHost));\n  std::vector<V> h_found_values(N * DIM);\n  CUDA_CHECK(cudaMemcpy(h_found_values.data(), d_found_values,\n                        N * DIM * sizeof(V), cudaMemcpyDeviceToHost));\n\n  for (size_t i = 0; i < N; i++) {\n    EXPECT_TRUE(h_founds[i]) << \"Key \" << h_keys[i] << \" not found (b1==b2?)\";\n 
   if (h_founds[i]) {\n      EXPECT_FLOAT_EQ(h_found_values[i * DIM], static_cast<V>(h_keys[i]))\n          << \"Value mismatch for key \" << h_keys[i];\n    }\n  }\n\n  // Table size must equal N (no duplicates from b1==b2 path).\n  size_t table_size = table.size(0);\n  CUDA_CHECK(cudaDeviceSynchronize());\n  EXPECT_EQ(table_size, N);\n\n  delete[] h_founds;\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_values));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_founds));\n  CUDA_CHECK(cudaFree(d_found_values));\n}\n\n// T2.5: Digest effectiveness — verify that dual-bucket digest (bit[56:63])\n// is used consistently during init, insert, and find.  If the init kernel\n// wrote the wrong empty-digest value, empty-slot detection would fail and\n// no keys could be inserted.  This test therefore doubles as a regression\n// guard for the G1 digest-mismatch bug.\nTEST(DualBucketTest, DigestEffectiveness) {\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  constexpr size_t CAPACITY = 128 * 64;  // 8192 slots, 64 buckets\n  constexpr size_t N = 4096;             // 50% LF\n\n  Table table;\n  create_memory_mode_table(table, CAPACITY);\n\n  std::vector<K> h_keys(N);\n  std::vector<V> h_values(N * DIM, 1.0f);\n  std::vector<S> h_scores(N);\n\n  // Use random keys so that digests are well-distributed.\n  std::mt19937_64 rng(42);\n  for (size_t i = 0; i < N; i++) {\n    h_keys[i] = (rng() & 0x00FFFFFFFFFFFFFF) | 1;  // avoid reserved keys\n    h_scores[i] = i + 1;\n  }\n\n  K* d_keys;\n  V* d_values;\n  S* d_scores;\n  bool* d_founds;\n  V* d_found_values;\n  CUDA_CHECK(cudaMalloc(&d_keys, N * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_values, N * DIM * sizeof(V)));\n  CUDA_CHECK(cudaMalloc(&d_scores, N * sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_founds, N * sizeof(bool)));\n  CUDA_CHECK(cudaMalloc(&d_found_values, N * DIM * sizeof(V)));\n\n  CUDA_CHECK(\n      cudaMemcpy(d_keys, h_keys.data(), N * sizeof(K), 
cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_values, h_values.data(), N * DIM * sizeof(V),\n                        cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_scores, h_scores.data(), N * sizeof(S),\n                        cudaMemcpyHostToDevice));\n\n  table.insert_or_assign(N, d_keys, d_values, d_scores, 0, true);\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  // If empty-digest was wrong, insert would have gone through the D2 eviction\n  // path and all entries would be REFUSED.  Check that all N keys were stored.\n  size_t table_size = table.size(0);\n  CUDA_CHECK(cudaDeviceSynchronize());\n  EXPECT_EQ(table_size, N) << \"Digest mismatch: expected \" << N\n                           << \" entries but got \" << table_size\n                           << \" (empty-slot detection likely failed)\";\n\n  // Verify every key is findable.\n  table.find(N, d_keys, d_found_values, d_founds, nullptr, 0);\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  bool* h_founds = new bool[N];\n  CUDA_CHECK(\n      cudaMemcpy(h_founds, d_founds, N * sizeof(bool), cudaMemcpyDeviceToHost));\n  int found_count = 0;\n  for (size_t i = 0; i < N; i++) {\n    if (h_founds[i]) found_count++;\n  }\n  EXPECT_EQ(found_count, static_cast<int>(N))\n      << \"Digest mismatch on find: only \" << found_count << \"/\" << N\n      << \" keys found\";\n\n  delete[] h_founds;\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_values));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_founds));\n  CUDA_CHECK(cudaFree(d_found_values));\n}\n\n// ===================================\n// TestGroup 1 addition: Score ordering after eviction\n// ===================================\n\n// T1.3b: After eviction, surviving keys must have scores >= the scores of\n// evicted keys.  
We export the full table and verify score ordering.\nTEST(DualBucketTest, ScoreOrderingAfterEviction) {\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  constexpr size_t CAPACITY = 128 * 64;  // 8192 slots\n  constexpr size_t N_FILL = CAPACITY;\n  constexpr size_t N_NEW = 512;\n\n  Table table;\n  create_memory_mode_table(table, CAPACITY);\n\n  // Phase 1: Fill with scores [1..N_FILL].\n  std::vector<K> h_keys(N_FILL);\n  std::vector<V> h_values(N_FILL * DIM, 1.0f);\n  std::vector<S> h_scores(N_FILL);\n\n  std::iota(h_keys.begin(), h_keys.end(), 1);\n  for (size_t i = 0; i < N_FILL; i++) h_scores[i] = i + 1;\n\n  K* d_keys;\n  V* d_values;\n  S* d_scores;\n  CUDA_CHECK(cudaMalloc(&d_keys, N_FILL * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_values, N_FILL * DIM * sizeof(V)));\n  CUDA_CHECK(cudaMalloc(&d_scores, N_FILL * sizeof(S)));\n\n  CUDA_CHECK(cudaMemcpy(d_keys, h_keys.data(), N_FILL * sizeof(K),\n                        cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_values, h_values.data(), N_FILL * DIM * sizeof(V),\n                        cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_scores, h_scores.data(), N_FILL * sizeof(S),\n                        cudaMemcpyHostToDevice));\n\n  table.insert_or_assign(N_FILL, d_keys, d_values, d_scores, 0, true);\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  // Phase 2: Insert high-score keys that force eviction.\n  std::vector<K> h_keys_new(N_NEW);\n  std::vector<V> h_values_new(N_NEW * DIM, 2.0f);\n  std::vector<S> h_scores_new(N_NEW);\n  for (size_t i = 0; i < N_NEW; i++) {\n    h_keys_new[i] = N_FILL + 1 + i;\n    h_scores_new[i] = N_FILL * 10 + i;  // Much higher scores\n  }\n\n  CUDA_CHECK(cudaMemcpy(d_keys, h_keys_new.data(), N_NEW * sizeof(K),\n                        cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_values, h_values_new.data(), N_NEW * DIM * sizeof(V),\n                        cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_scores, 
h_scores_new.data(), N_NEW * sizeof(S),\n                        cudaMemcpyHostToDevice));\n\n  table.insert_or_assign(N_NEW, d_keys, d_values, d_scores, 0, true);\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  // Phase 3: Export all surviving entries and check scores.\n  size_t table_size = table.size(0);\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  K* d_dump_keys;\n  V* d_dump_values;\n  S* d_dump_scores;\n  size_t* d_dump_counter;\n  CUDA_CHECK(cudaMalloc(&d_dump_keys, table_size * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_dump_values, table_size * DIM * sizeof(V)));\n  CUDA_CHECK(cudaMalloc(&d_dump_scores, table_size * sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_dump_counter, sizeof(size_t)));\n  CUDA_CHECK(cudaMemset(d_dump_counter, 0, sizeof(size_t)));\n\n  table.export_batch(table_size, 0, d_dump_counter, d_dump_keys, d_dump_values,\n                     d_dump_scores, 0);\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  size_t dumped;\n  CUDA_CHECK(cudaMemcpy(&dumped, d_dump_counter, sizeof(size_t),\n                        cudaMemcpyDeviceToHost));\n\n  std::vector<S> h_dump_scores(dumped);\n  CUDA_CHECK(cudaMemcpy(h_dump_scores.data(), d_dump_scores, dumped * sizeof(S),\n                        cudaMemcpyDeviceToHost));\n\n  // Find the minimum score among all surviving entries.\n  S min_surviving =\n      *std::min_element(h_dump_scores.begin(), h_dump_scores.end());\n\n  // Check that all high-score keys that were inserted have scores above\n  // the surviving minimum.  
(Some high-score keys may have been REFUSED,\n  // but if they ARE in the table, their score must be consistent.)\n  std::vector<K> h_dump_keys(dumped);\n  CUDA_CHECK(cudaMemcpy(h_dump_keys.data(), d_dump_keys, dumped * sizeof(K),\n                        cudaMemcpyDeviceToHost));\n\n  int high_score_survivors = 0;\n  for (size_t i = 0; i < dumped; i++) {\n    if (h_dump_keys[i] > N_FILL) {\n      high_score_survivors++;\n      // Every high-score key should have score >= min_surviving.\n      EXPECT_GE(h_dump_scores[i], min_surviving);\n    }\n  }\n  // At least some high-score keys should have survived.\n  EXPECT_GT(high_score_survivors, 0) << \"No high-score keys survived eviction\";\n\n  std::cout << \"[ScoreOrdering] min_surviving_score=\" << min_surviving\n            << \" high_score_survivors=\" << high_score_survivors << \"/\" << N_NEW\n            << std::endl;\n\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_values));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_dump_keys));\n  CUDA_CHECK(cudaFree(d_dump_values));\n  CUDA_CHECK(cudaFree(d_dump_scores));\n  CUDA_CHECK(cudaFree(d_dump_counter));\n}\n\n// ===================================\n// TestGroup 3 additions: API Guard Tests (new)\n// ===================================\n\nTEST(DualBucketTest, FindOrInsertGuard) {\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n  Table table;\n  create_memory_mode_table(table, 128 * 128);\n\n  K* d_keys;\n  V* d_values;\n  S* d_scores;\n  CUDA_CHECK(cudaMalloc(&d_keys, sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_values, DIM * sizeof(V)));\n  CUDA_CHECK(cudaMalloc(&d_scores, sizeof(S)));\n\n  EXPECT_THROW(table.find_or_insert(1, d_keys, d_values, d_scores, 0, true),\n               std::runtime_error);\n\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_values));\n  CUDA_CHECK(cudaFree(d_scores));\n}\n\nTEST(DualBucketTest, InsertAndEvictGuard) {\n  using Table = nv::merlin::HashTable<K, V, S, 
EvictStrategy::kCustomized>;\n  Table table;\n  create_memory_mode_table(table, 128 * 128);\n\n  K* d_keys;\n  V* d_values;\n  S* d_scores;\n  K* d_evicted_keys;\n  V* d_evicted_values;\n  S* d_evicted_scores;\n  size_t* d_counter;\n  CUDA_CHECK(cudaMalloc(&d_keys, sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_values, DIM * sizeof(V)));\n  CUDA_CHECK(cudaMalloc(&d_scores, sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_evicted_keys, sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_evicted_values, DIM * sizeof(V)));\n  CUDA_CHECK(cudaMalloc(&d_evicted_scores, sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_counter, sizeof(size_t)));\n\n  EXPECT_THROW(\n      table.insert_and_evict(1, d_keys, d_values, d_scores, d_evicted_keys,\n                             d_evicted_values, d_evicted_scores, d_counter, 0),\n      std::runtime_error);\n\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_values));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_evicted_keys));\n  CUDA_CHECK(cudaFree(d_evicted_values));\n  CUDA_CHECK(cudaFree(d_evicted_scores));\n  CUDA_CHECK(cudaFree(d_counter));\n}\n\nTEST(DualBucketTest, AccumOrAssignGuard) {\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n  Table table;\n  create_memory_mode_table(table, 128 * 128);\n\n  K* d_keys;\n  V* d_values;\n  bool* d_accum;\n  CUDA_CHECK(cudaMalloc(&d_keys, sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_values, DIM * sizeof(V)));\n  CUDA_CHECK(cudaMalloc(&d_accum, sizeof(bool)));\n\n  EXPECT_THROW(table.accum_or_assign(1, d_keys, d_values, d_accum),\n               std::runtime_error);\n\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_values));\n  CUDA_CHECK(cudaFree(d_accum));\n}\n\nTEST(DualBucketTest, AssignScoresGuard) {\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n  Table table;\n  create_memory_mode_table(table, 128 * 128);\n\n  K* d_keys;\n  S* d_scores;\n  CUDA_CHECK(cudaMalloc(&d_keys, sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores, 
sizeof(S)));\n\n  EXPECT_THROW(table.assign_scores(1, d_keys, d_scores), std::runtime_error);\n\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_scores));\n}\n\nTEST(DualBucketTest, AssignValuesGuard) {\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n  Table table;\n  create_memory_mode_table(table, 128 * 128);\n\n  K* d_keys;\n  V* d_values;\n  CUDA_CHECK(cudaMalloc(&d_keys, sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_values, DIM * sizeof(V)));\n\n  EXPECT_THROW(table.assign_values(1, d_keys, d_values), std::runtime_error);\n\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_values));\n}\n\n// ===================================\n// TestGroup 5 addition: max_hbm_for_vectors rejection\n// ===================================\n\nTEST(DualBucketTest, InitHbmForVectorsReject) {\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n  Table table;\n\n  TableOptions options;\n  options.init_capacity = 128 * 128;\n  options.max_capacity = 128 * 128;\n  options.max_hbm_for_vectors = 1024;  // non-zero → should be rejected\n  options.dim = DIM;\n  options.max_bucket_size = 128;\n  options.table_mode = TableMode::kMemory;\n\n  EXPECT_THROW(table.init(options), std::runtime_error);\n}\n\n// T5.3: dim > 224 rejected in MEMORY_MODE (shared-memory buffer overflow).\nTEST(DualBucketTest, InitDimTooLargeReject) {\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n  Table table;\n\n  TableOptions options;\n  options.init_capacity = 128 * 128;\n  options.max_capacity = 128 * 128;\n  options.max_hbm_for_vectors = 0;\n  options.dim = 256;  // exceeds 224-float limit\n  options.max_bucket_size = 128;\n  options.table_mode = TableMode::kMemory;\n\n  EXPECT_THROW(table.init(options), std::runtime_error);\n}\n\n// T5.3b: dim=224 should be accepted (exact boundary).\nTEST(DualBucketTest, InitDimMaxAccepted) {\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n  Table 
table;\n\n  TableOptions options;\n  options.init_capacity = 128 * 128;\n  options.max_capacity = 128 * 128;\n  options.max_hbm_for_vectors = 0;\n  options.dim = 224;  // exactly at the limit\n  options.max_bucket_size = 128;\n  options.table_mode = TableMode::kMemory;\n\n  EXPECT_NO_THROW(table.init(options));\n}\n\n// ===================================\n// TestGroup 2 addition: Bucket distribution (T2.1)\n// ===================================\n\n// Verify that keys are distributed across multiple buckets (not all in b1).\n// We insert random keys and check that the resulting table size matches\n// expectations.  A more direct check would require bucket-level introspection\n// which the public API does not expose, but we can infer distribution by\n// checking that the first-eviction LF is significantly higher than single-\n// bucket mode (covered in FirstEvictionLoadFactor).  Here we do a simple\n// idempotency + size check with random keys to stress the hash distribution.\nTEST(DualBucketTest, RandomKeyDistribution) {\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  constexpr size_t CAPACITY = 128 * 128;  // 16384 slots\n  constexpr size_t N = 8192;              // 50% LF\n\n  Table table;\n  create_memory_mode_table(table, CAPACITY);\n\n  std::vector<K> h_keys(N);\n  std::vector<V> h_values(N * DIM, 1.0f);\n  std::vector<S> h_scores(N);\n\n  std::mt19937_64 rng(12345);\n  for (size_t i = 0; i < N; i++) {\n    h_keys[i] = (rng() & 0x00FFFFFFFFFFFFFF) | 1;\n    h_scores[i] = i + 1;\n  }\n\n  K* d_keys;\n  V* d_values;\n  S* d_scores;\n  bool* d_founds;\n  V* d_found_values;\n  CUDA_CHECK(cudaMalloc(&d_keys, N * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_values, N * DIM * sizeof(V)));\n  CUDA_CHECK(cudaMalloc(&d_scores, N * sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_founds, N * sizeof(bool)));\n  CUDA_CHECK(cudaMalloc(&d_found_values, N * DIM * sizeof(V)));\n\n  CUDA_CHECK(\n      cudaMemcpy(d_keys, h_keys.data(), N * sizeof(K), 
cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_values, h_values.data(), N * DIM * sizeof(V),\n                        cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_scores, h_scores.data(), N * sizeof(S),\n                        cudaMemcpyHostToDevice));\n\n  table.insert_or_assign(N, d_keys, d_values, d_scores, 0, true);\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  size_t table_size = table.size(0);\n  CUDA_CHECK(cudaDeviceSynchronize());\n  EXPECT_EQ(table_size, N)\n      << \"Random keys at 50% LF should all be inserted without eviction\";\n\n  // Re-insert the same keys (idempotent).\n  table.insert_or_assign(N, d_keys, d_values, d_scores, 0, true);\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  size_t table_size_after = table.size(0);\n  CUDA_CHECK(cudaDeviceSynchronize());\n  EXPECT_EQ(table_size_after, N) << \"Re-insert must not create duplicates\";\n\n  // Find all.\n  table.find(N, d_keys, d_found_values, d_founds, nullptr, 0);\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  bool* h_founds = new bool[N];\n  CUDA_CHECK(\n      cudaMemcpy(h_founds, d_founds, N * sizeof(bool), cudaMemcpyDeviceToHost));\n  for (size_t i = 0; i < N; i++) {\n    EXPECT_TRUE(h_founds[i]) << \"Random key \" << h_keys[i] << \" not found\";\n  }\n\n  delete[] h_founds;\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_values));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_founds));\n  CUDA_CHECK(cudaFree(d_found_values));\n}\n\n// ===================================\n// TestGroup 4 addition: Single bucket capacity (T4.2)\n// ===================================\n\n// T4.2: Single-bucket capacity must be rejected by MEMORY_MODE init guard.\n// Dual-bucket addressing requires at least 2 buckets (capacity >= 256).\nTEST(DualBucketTest, SingleBucketCapacityRejected) {\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n  Table table;\n\n  // 1 bucket = 128 slots → must be rejected.\n  EXPECT_THROW(create_memory_mode_table(table, 
128), std::runtime_error);\n}\n\n// T4.2b: Minimum valid capacity (2 buckets = 256 slots).\nTEST(DualBucketTest, MinimumTwoBucketCapacity) {\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  constexpr size_t CAPACITY = 256;  // 2 buckets\n  constexpr size_t N = 128;\n\n  Table table;\n  create_memory_mode_table(table, CAPACITY);\n\n  std::vector<K> h_keys(N);\n  std::vector<V> h_values(N * DIM, 1.0f);\n  std::vector<S> h_scores(N);\n\n  std::iota(h_keys.begin(), h_keys.end(), 1);\n  for (size_t i = 0; i < N; i++) h_scores[i] = i + 1;\n\n  K* d_keys;\n  V* d_values;\n  S* d_scores;\n  bool* d_founds;\n  V* d_found_values;\n  CUDA_CHECK(cudaMalloc(&d_keys, N * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_values, N * DIM * sizeof(V)));\n  CUDA_CHECK(cudaMalloc(&d_scores, N * sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_founds, N * sizeof(bool)));\n  CUDA_CHECK(cudaMalloc(&d_found_values, N * DIM * sizeof(V)));\n\n  CUDA_CHECK(\n      cudaMemcpy(d_keys, h_keys.data(), N * sizeof(K), cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_values, h_values.data(), N * DIM * sizeof(V),\n                        cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_scores, h_scores.data(), N * sizeof(S),\n                        cudaMemcpyHostToDevice));\n\n  table.insert_or_assign(N, d_keys, d_values, d_scores, 0, true);\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  size_t table_size = table.size(0);\n  CUDA_CHECK(cudaDeviceSynchronize());\n  EXPECT_EQ(table_size, N);\n\n  table.find(N, d_keys, d_found_values, d_founds, nullptr, 0);\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  bool* h_founds = new bool[N];\n  CUDA_CHECK(\n      cudaMemcpy(h_founds, d_founds, N * sizeof(bool), cudaMemcpyDeviceToHost));\n  for (size_t i = 0; i < N; i++) {\n    EXPECT_TRUE(h_founds[i])\n        << \"Two-bucket: Key \" << h_keys[i] << \" not found\";\n  }\n\n  delete[] h_founds;\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_values));\n  
CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_founds));\n  CUDA_CHECK(cudaFree(d_found_values));\n}\n\n// ===================================\n// DEBUG: 2-bucket eviction trace\n// ===================================\n\n// Small-scale eviction test with kernel printf enabled (buckets_num <= 4).\n// Fill 2 buckets (256 slots), then insert 4 high-score keys and trace D2.\nTEST(DualBucketTest, DebugEvictionTrace) {\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  constexpr size_t NUM_BUCKETS = 2;\n  constexpr size_t CAPACITY = NUM_BUCKETS * 128;  // 256 slots\n  constexpr size_t N_FILL = CAPACITY;             // Fill completely\n  constexpr size_t N_NEW = 4;  // Insert a few high-score keys\n\n  Table table;\n  create_memory_mode_table(table, CAPACITY);\n\n  // Phase 1: Fill with scores 1..256.\n  std::vector<K> h_keys_fill(N_FILL);\n  std::vector<V> h_values_fill(N_FILL * DIM, 1.0f);\n  std::vector<S> h_scores_fill(N_FILL);\n\n  std::iota(h_keys_fill.begin(), h_keys_fill.end(), 1);\n  for (size_t i = 0; i < N_FILL; i++) {\n    h_scores_fill[i] = i + 1;\n  }\n\n  K* d_keys;\n  V* d_values;\n  S* d_scores;\n  CUDA_CHECK(cudaMalloc(&d_keys, N_FILL * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_values, N_FILL * DIM * sizeof(V)));\n  CUDA_CHECK(cudaMalloc(&d_scores, N_FILL * sizeof(S)));\n\n  CUDA_CHECK(cudaMemcpy(d_keys, h_keys_fill.data(), N_FILL * sizeof(K),\n                        cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_values, h_values_fill.data(),\n                        N_FILL * DIM * sizeof(V), cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_scores, h_scores_fill.data(), N_FILL * sizeof(S),\n                        cudaMemcpyHostToDevice));\n\n  table.insert_or_assign(N_FILL, d_keys, d_values, d_scores, 0, true);\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  size_t table_size_after_fill = table.size(0);\n  CUDA_CHECK(cudaDeviceSynchronize());\n  std::cout << \"[DebugEviction] After fill: table_size=\"\n    
        << table_size_after_fill << \" capacity=\" << CAPACITY << std::endl;\n\n  // Verify fill: find all N_FILL keys to check b2 lookup correctness.\n  {\n    bool* d_fill_founds;\n    V* d_fill_found_vals;\n    CUDA_CHECK(cudaMalloc(&d_fill_founds, N_FILL * sizeof(bool)));\n    CUDA_CHECK(cudaMalloc(&d_fill_found_vals, N_FILL * DIM * sizeof(V)));\n    CUDA_CHECK(cudaMemcpy(d_keys, h_keys_fill.data(), N_FILL * sizeof(K),\n                          cudaMemcpyHostToDevice));\n    table.find(N_FILL, d_keys, d_fill_found_vals, d_fill_founds, nullptr, 0);\n    CUDA_CHECK(cudaDeviceSynchronize());\n    bool* h_fill_founds = new bool[N_FILL];\n    CUDA_CHECK(cudaMemcpy(h_fill_founds, d_fill_founds, N_FILL * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n    int fill_found = 0;\n    for (size_t i = 0; i < N_FILL; i++) {\n      if (h_fill_founds[i]) {\n        fill_found++;\n      } else {\n        std::cout << \"[DebugEviction] MISSING fill key=\" << h_keys_fill[i]\n                  << \" (index=\" << i << \")\" << std::endl;\n      }\n    }\n    std::cout << \"[DebugEviction] Fill verify: found \" << fill_found << \"/\"\n              << N_FILL << \" keys\" << std::endl;\n    delete[] h_fill_founds;\n    CUDA_CHECK(cudaFree(d_fill_founds));\n    CUDA_CHECK(cudaFree(d_fill_found_vals));\n  }\n\n  // Phase 2: Insert high-score keys.\n  std::vector<K> h_keys_new(N_NEW);\n  std::vector<V> h_values_new(N_NEW * DIM, 2.0f);\n  std::vector<S> h_scores_new(N_NEW);\n\n  for (size_t i = 0; i < N_NEW; i++) {\n    h_keys_new[i] = N_FILL + 100 + i;\n    h_scores_new[i] = 10000 + i;\n  }\n\n  std::cout << \"[DebugEviction] Inserting \" << N_NEW << \" high-score keys \"\n            << \"(scores \" << h_scores_new[0] << \"..\" << h_scores_new[N_NEW - 1]\n            << \")\" << std::endl;\n\n  CUDA_CHECK(cudaMemcpy(d_keys, h_keys_new.data(), N_NEW * sizeof(K),\n                        cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_values, 
h_values_new.data(), N_NEW * DIM * sizeof(V),\n                        cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_scores, h_scores_new.data(), N_NEW * sizeof(S),\n                        cudaMemcpyHostToDevice));\n\n  table.insert_or_assign(N_NEW, d_keys, d_values, d_scores, 0, true);\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  size_t table_size_after_evict = table.size(0);\n  CUDA_CHECK(cudaDeviceSynchronize());\n  std::cout << \"[DebugEviction] After evict-insert: table_size=\"\n            << table_size_after_evict << std::endl;\n\n  // Phase 3: Find the high-score keys.\n  bool* d_founds;\n  CUDA_CHECK(cudaMalloc(&d_founds, N_NEW * sizeof(bool)));\n  V* d_found_values;\n  CUDA_CHECK(cudaMalloc(&d_found_values, N_NEW * DIM * sizeof(V)));\n\n  CUDA_CHECK(cudaMemcpy(d_keys, h_keys_new.data(), N_NEW * sizeof(K),\n                        cudaMemcpyHostToDevice));\n  table.find(N_NEW, d_keys, d_found_values, d_founds, nullptr, 0);\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  bool* h_founds = new bool[N_NEW];\n  CUDA_CHECK(cudaMemcpy(h_founds, d_founds, N_NEW * sizeof(bool),\n                        cudaMemcpyDeviceToHost));\n  int found_count = 0;\n  for (size_t i = 0; i < N_NEW; i++) {\n    std::cout << \"[DebugEviction] key=\" << h_keys_new[i]\n              << \" score=\" << h_scores_new[i]\n              << \" found=\" << (h_founds[i] ? 
\"YES\" : \"NO\") << std::endl;\n    if (h_founds[i]) found_count++;\n  }\n  std::cout << \"[DebugEviction] Found \" << found_count << \"/\" << N_NEW\n            << std::endl;\n\n  EXPECT_EQ(found_count, static_cast<int>(N_NEW))\n      << \"All high-score keys should survive eviction in 2-bucket table\";\n\n  delete[] h_founds;\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_values));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_founds));\n  CUDA_CHECK(cudaFree(d_found_values));\n}\n\n// ===================================\n// TestGroup 2 addition: Eviction Quality (T2.6)\n// ===================================\n\n// T2.6: After inserting 5x capacity keys with random scores, the surviving keys\n// in the table should overlap with the theoretical top-capacity scores by at\n// least 99.5%.  This validates that dual-bucket score-based eviction correctly\n// retains high-score keys under sustained oversubscription pressure.\nTEST(DualBucketTest, EvictionQualityAtFullLoad) {\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  constexpr size_t CAPACITY = 128 * 1024;      // 128K slots\n  constexpr size_t TOTAL_KEYS = 5 * CAPACITY;  // 5x oversubscription\n  constexpr size_t BATCH = CAPACITY;           // One capacity per batch\n  constexpr double QUALITY_THRESHOLD = 0.995;\n\n  Table table;\n  create_memory_mode_table(table, CAPACITY);\n\n  // Generate all keys (1..TOTAL_KEYS) with random scores.\n  std::vector<K> all_keys(TOTAL_KEYS);\n  std::vector<S> all_scores(TOTAL_KEYS);\n  std::iota(all_keys.begin(), all_keys.end(), 1);\n\n  std::mt19937_64 rng(42);\n  for (size_t i = 0; i < TOTAL_KEYS; i++) {\n    all_scores[i] = (rng() >> 1) | 1;  // Positive, non-zero\n  }\n\n  // Allocate device memory for one batch.\n  K* d_keys;\n  V* d_values;\n  S* d_scores;\n  CUDA_CHECK(cudaMalloc(&d_keys, BATCH * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_values, BATCH * DIM * sizeof(V)));\n  CUDA_CHECK(cudaMalloc(&d_scores, BATCH * 
sizeof(S)));\n\n  std::vector<V> h_values(BATCH * DIM, 1.0f);\n\n  // Insert all keys in 5 batches.\n  for (size_t offset = 0; offset < TOTAL_KEYS; offset += BATCH) {\n    size_t n = std::min(BATCH, TOTAL_KEYS - offset);\n    CUDA_CHECK(cudaMemcpy(d_keys, all_keys.data() + offset, n * sizeof(K),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_values, h_values.data(), n * DIM * sizeof(V),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_scores, all_scores.data() + offset, n * sizeof(S),\n                          cudaMemcpyHostToDevice));\n    table.insert_or_assign(n, d_keys, d_values, d_scores, 0, true);\n    CUDA_CHECK(cudaDeviceSynchronize());\n  }\n\n  // Export surviving keys and scores.\n  size_t table_size = table.size(0);\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  K* d_dump_keys;\n  V* d_dump_values;\n  S* d_dump_scores;\n  size_t* d_dump_counter;\n  CUDA_CHECK(cudaMalloc(&d_dump_keys, table_size * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_dump_values, table_size * DIM * sizeof(V)));\n  CUDA_CHECK(cudaMalloc(&d_dump_scores, table_size * sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_dump_counter, sizeof(size_t)));\n  CUDA_CHECK(cudaMemset(d_dump_counter, 0, sizeof(size_t)));\n\n  table.export_batch(table_size, 0, d_dump_counter, d_dump_keys, d_dump_values,\n                     d_dump_scores, 0);\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  size_t dumped;\n  CUDA_CHECK(cudaMemcpy(&dumped, d_dump_counter, sizeof(size_t),\n                        cudaMemcpyDeviceToHost));\n\n  std::vector<K> h_dump_keys(dumped);\n  CUDA_CHECK(cudaMemcpy(h_dump_keys.data(), d_dump_keys, dumped * sizeof(K),\n                        cudaMemcpyDeviceToHost));\n\n  // Compute the ideal top-`dumped` set: keys with the highest scores out of\n  // all TOTAL_KEYS inserted during the entire test.\n  std::vector<std::pair<S, K>> score_key_pairs(TOTAL_KEYS);\n  for (size_t i = 0; i < TOTAL_KEYS; i++) {\n    score_key_pairs[i] 
= {all_scores[i], all_keys[i]};\n  }\n  std::sort(score_key_pairs.begin(), score_key_pairs.end(),\n            [](const auto& a, const auto& b) { return a.first > b.first; });\n\n  std::unordered_set<K> ideal_set;\n  for (size_t i = 0; i < dumped && i < TOTAL_KEYS; i++) {\n    ideal_set.insert(score_key_pairs[i].second);\n  }\n\n  // Count overlap between surviving keys and ideal set.\n  size_t overlap = 0;\n  for (size_t i = 0; i < dumped; i++) {\n    if (ideal_set.count(h_dump_keys[i])) overlap++;\n  }\n\n  double quality = static_cast<double>(overlap) / static_cast<double>(dumped);\n  std::cout << \"[EvictionQuality] Table size: \" << dumped << \"/\" << CAPACITY\n            << \" (LF=\" << (static_cast<double>(dumped) / CAPACITY) << \")\"\n            << std::endl;\n  std::cout << \"[EvictionQuality] Overlap with ideal top-\" << dumped << \": \"\n            << overlap << \"/\" << dumped << \" (quality=\" << (quality * 100.0)\n            << \"%)\" << std::endl;\n\n  EXPECT_GE(quality, QUALITY_THRESHOLD)\n      << \"Eviction quality \" << (quality * 100.0) << \"% is below \"\n      << (QUALITY_THRESHOLD * 100.0) << \"% threshold\";\n\n  // Cleanup.\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_values));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_dump_keys));\n  CUDA_CHECK(cudaFree(d_dump_values));\n  CUDA_CHECK(cudaFree(d_dump_scores));\n  CUDA_CHECK(cudaFree(d_dump_counter));\n}\n\n// ===================================\n// TestGroup 6: Concurrency Stress Tests\n// ===================================\n\n// T6.1: Multi-stream concurrent upsert stress test.\n// Multiple CUDA streams issue insert_or_assign concurrently to stress Phase 2\n// eviction's stale-score handling.  
Under high contention some inserts may be\n// REFUSED, but the table must remain consistent: no crashes, no duplicates,\n// and all surviving keys must be findable.\nTEST(DualBucketTest, MultiStreamConcurrentUpsert) {\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  constexpr size_t CAPACITY = 128 * 1024;    // 128K slots\n  constexpr int NUM_STREAMS = 4;\n  constexpr size_t KEYS_PER_STREAM = CAPACITY;  // Each stream fills capacity\n  constexpr size_t TOTAL_KEYS = NUM_STREAMS * KEYS_PER_STREAM;\n\n  Table table;\n  create_memory_mode_table(table, CAPACITY);\n\n  // Create CUDA streams.\n  cudaStream_t streams[NUM_STREAMS];\n  for (int s = 0; s < NUM_STREAMS; s++) {\n    CUDA_CHECK(cudaStreamCreate(&streams[s]));\n  }\n\n  // Prepare per-stream device memory and data.\n  K* d_keys[NUM_STREAMS];\n  V* d_values[NUM_STREAMS];\n  S* d_scores[NUM_STREAMS];\n\n  std::mt19937_64 rng(42);\n  for (int s = 0; s < NUM_STREAMS; s++) {\n    CUDA_CHECK(cudaMalloc(&d_keys[s], KEYS_PER_STREAM * sizeof(K)));\n    CUDA_CHECK(cudaMalloc(&d_values[s], KEYS_PER_STREAM * DIM * sizeof(V)));\n    CUDA_CHECK(cudaMalloc(&d_scores[s], KEYS_PER_STREAM * sizeof(S)));\n\n    std::vector<K> h_keys(KEYS_PER_STREAM);\n    std::vector<V> h_values(KEYS_PER_STREAM * DIM, static_cast<V>(s + 1));\n    std::vector<S> h_scores(KEYS_PER_STREAM);\n\n    for (size_t i = 0; i < KEYS_PER_STREAM; i++) {\n      // Use non-overlapping key ranges per stream.\n      h_keys[i] = s * KEYS_PER_STREAM + i + 1;\n      h_scores[i] = (rng() >> 1) | 1;  // Random positive score\n    }\n\n    CUDA_CHECK(cudaMemcpy(d_keys[s], h_keys.data(),\n                          KEYS_PER_STREAM * sizeof(K),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_values[s], h_values.data(),\n                          KEYS_PER_STREAM * DIM * sizeof(V),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_scores[s], h_scores.data(),\n                   
       KEYS_PER_STREAM * sizeof(S),\n                          cudaMemcpyHostToDevice));\n  }\n\n  // Launch concurrent inserts on all streams simultaneously.\n  for (int s = 0; s < NUM_STREAMS; s++) {\n    table.insert_or_assign(KEYS_PER_STREAM, d_keys[s], d_values[s],\n                           d_scores[s], streams[s], /*unique_key=*/true);\n  }\n\n  // Synchronize all streams.\n  for (int s = 0; s < NUM_STREAMS; s++) {\n    CUDA_CHECK(cudaStreamSynchronize(streams[s]));\n  }\n\n  // Verify table consistency.\n  size_t table_size = table.size(0);\n  CUDA_CHECK(cudaDeviceSynchronize());\n  std::cout << \"[MultiStream] Table size after concurrent inserts: \"\n            << table_size << \"/\" << CAPACITY << std::endl;\n\n  // Table size must not exceed capacity (no overflow).\n  EXPECT_LE(table_size, CAPACITY);\n  // Some keys should have been inserted (table should not be empty).\n  EXPECT_GT(table_size, static_cast<size_t>(0));\n\n  // Export all surviving keys and verify they are findable.\n  K* d_dump_keys;\n  V* d_dump_values;\n  S* d_dump_scores;\n  size_t* d_dump_counter;\n  CUDA_CHECK(cudaMalloc(&d_dump_keys, table_size * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_dump_values, table_size * DIM * sizeof(V)));\n  CUDA_CHECK(cudaMalloc(&d_dump_scores, table_size * sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_dump_counter, sizeof(size_t)));\n  CUDA_CHECK(cudaMemset(d_dump_counter, 0, sizeof(size_t)));\n\n  table.export_batch(table_size, 0, d_dump_counter, d_dump_keys, d_dump_values,\n                     d_dump_scores, 0);\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  size_t dumped;\n  CUDA_CHECK(cudaMemcpy(&dumped, d_dump_counter, sizeof(size_t),\n                        cudaMemcpyDeviceToHost));\n  EXPECT_EQ(dumped, table_size);\n\n  // Find all exported keys — every surviving key must be findable.\n  bool* d_founds;\n  V* d_found_values;\n  CUDA_CHECK(cudaMalloc(&d_founds, dumped * sizeof(bool)));\n  CUDA_CHECK(cudaMalloc(&d_found_values, dumped * DIM * 
sizeof(V)));\n\n  table.find(dumped, d_dump_keys, d_found_values, d_founds, nullptr, 0);\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  bool* h_founds = new bool[dumped];\n  CUDA_CHECK(cudaMemcpy(h_founds, d_founds, dumped * sizeof(bool),\n                        cudaMemcpyDeviceToHost));\n\n  int found_count = 0;\n  for (size_t i = 0; i < dumped; i++) {\n    if (h_founds[i]) found_count++;\n  }\n  std::cout << \"[MultiStream] Find consistency: \" << found_count << \"/\"\n            << dumped << std::endl;\n  EXPECT_EQ(found_count, static_cast<int>(dumped))\n      << \"All surviving keys must be findable after concurrent upserts\";\n\n  // Check no duplicates: the number of distinct exported keys must equal the\n  // exported count.\n  std::vector<K> h_dump_keys(dumped);\n  CUDA_CHECK(cudaMemcpy(h_dump_keys.data(), d_dump_keys, dumped * sizeof(K),\n                        cudaMemcpyDeviceToHost));\n  std::unordered_set<K> unique_keys(h_dump_keys.begin(), h_dump_keys.end());\n  EXPECT_EQ(unique_keys.size(), dumped) << \"Duplicate keys found in table\";\n\n  // Cleanup.\n  delete[] h_founds;\n  for (int s = 0; s < NUM_STREAMS; s++) {\n    CUDA_CHECK(cudaFree(d_keys[s]));\n    CUDA_CHECK(cudaFree(d_values[s]));\n    CUDA_CHECK(cudaFree(d_scores[s]));\n    CUDA_CHECK(cudaStreamDestroy(streams[s]));\n  }\n  CUDA_CHECK(cudaFree(d_dump_keys));\n  CUDA_CHECK(cudaFree(d_dump_values));\n  CUDA_CHECK(cudaFree(d_dump_scores));\n  CUDA_CHECK(cudaFree(d_dump_counter));\n  CUDA_CHECK(cudaFree(d_founds));\n  CUDA_CHECK(cudaFree(d_found_values));\n}\n\n// T6.2: Multi-stream concurrent upsert with overlapping keys.\n// Tests that concurrent streams inserting the same keys do not create\n// duplicates, and that the final values/scores are consistent.\nTEST(DualBucketTest, MultiStreamOverlappingKeys) {\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  constexpr size_t CAPACITY = 128 * 1024;\n  constexpr int NUM_STREAMS = 4;\n  constexpr size_t N = 32768;  // Shared key set\n\n  Table 
table;\n  create_memory_mode_table(table, CAPACITY);\n\n  cudaStream_t streams[NUM_STREAMS];\n  for (int s = 0; s < NUM_STREAMS; s++) {\n    CUDA_CHECK(cudaStreamCreate(&streams[s]));\n  }\n\n  // All streams insert the SAME keys with different scores.\n  std::vector<K> h_keys(N);\n  std::iota(h_keys.begin(), h_keys.end(), 1);\n\n  K* d_keys[NUM_STREAMS];\n  V* d_values[NUM_STREAMS];\n  S* d_scores[NUM_STREAMS];\n\n  for (int s = 0; s < NUM_STREAMS; s++) {\n    CUDA_CHECK(cudaMalloc(&d_keys[s], N * sizeof(K)));\n    CUDA_CHECK(cudaMalloc(&d_values[s], N * DIM * sizeof(V)));\n    CUDA_CHECK(cudaMalloc(&d_scores[s], N * sizeof(S)));\n\n    std::vector<V> h_values(N * DIM, static_cast<V>(s + 1));\n    std::vector<S> h_scores(N);\n    for (size_t i = 0; i < N; i++) {\n      h_scores[i] = (s + 1) * 1000 + i;  // Different scores per stream\n    }\n\n    CUDA_CHECK(cudaMemcpy(d_keys[s], h_keys.data(), N * sizeof(K),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_values[s], h_values.data(), N * DIM * sizeof(V),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_scores[s], h_scores.data(), N * sizeof(S),\n                          cudaMemcpyHostToDevice));\n  }\n\n  // Launch concurrent inserts.\n  for (int s = 0; s < NUM_STREAMS; s++) {\n    table.insert_or_assign(N, d_keys[s], d_values[s], d_scores[s], streams[s],\n                           true);\n  }\n\n  for (int s = 0; s < NUM_STREAMS; s++) {\n    CUDA_CHECK(cudaStreamSynchronize(streams[s]));\n  }\n\n  // Table size must equal N (no duplicates from concurrent inserts).\n  size_t table_size = table.size(0);\n  CUDA_CHECK(cudaDeviceSynchronize());\n  std::cout << \"[MultiStreamOverlap] Table size: \" << table_size\n            << \" (expected \" << N << \")\" << std::endl;\n  EXPECT_EQ(table_size, N) << \"Concurrent inserts of same keys created \"\n                           << (table_size - N) << \" duplicates\";\n\n  // All keys must be findable.\n  
bool* d_founds;\n  V* d_found_values;\n  CUDA_CHECK(cudaMalloc(&d_founds, N * sizeof(bool)));\n  CUDA_CHECK(cudaMalloc(&d_found_values, N * DIM * sizeof(V)));\n\n  table.find(N, d_keys[0], d_found_values, d_founds, nullptr, 0);\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  bool* h_founds = new bool[N];\n  CUDA_CHECK(\n      cudaMemcpy(h_founds, d_founds, N * sizeof(bool), cudaMemcpyDeviceToHost));\n\n  int found_count = 0;\n  for (size_t i = 0; i < N; i++) {\n    if (h_founds[i]) found_count++;\n  }\n  EXPECT_EQ(found_count, static_cast<int>(N))\n      << \"All keys must be findable after concurrent overlapping inserts\";\n\n  // Cleanup.\n  delete[] h_founds;\n  for (int s = 0; s < NUM_STREAMS; s++) {\n    CUDA_CHECK(cudaFree(d_keys[s]));\n    CUDA_CHECK(cudaFree(d_values[s]));\n    CUDA_CHECK(cudaFree(d_scores[s]));\n    CUDA_CHECK(cudaStreamDestroy(streams[s]));\n  }\n  CUDA_CHECK(cudaFree(d_founds));\n  CUDA_CHECK(cudaFree(d_found_values));\n}\n\n// ===================================\n// TestGroup 7: Additional Missing Tests\n// ===================================\n\n// T7.1: Find with scores=nullptr (CopyScoreEmpty path).\nTEST(DualBucketTest, FindWithNullScores) {\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  constexpr size_t CAPACITY = 128 * 128;\n  constexpr size_t N = 1024;\n\n  Table table;\n  create_memory_mode_table(table, CAPACITY);\n\n  std::vector<K> h_keys(N);\n  std::vector<V> h_values(N * DIM, 1.0f);\n  std::vector<S> h_scores(N);\n\n  std::iota(h_keys.begin(), h_keys.end(), 1);\n  for (size_t i = 0; i < N; i++) h_scores[i] = i + 1;\n\n  K* d_keys;\n  V* d_values;\n  S* d_scores;\n  bool* d_founds;\n  V* d_found_values;\n  S* d_found_scores;\n  CUDA_CHECK(cudaMalloc(&d_keys, N * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_values, N * DIM * sizeof(V)));\n  CUDA_CHECK(cudaMalloc(&d_scores, N * sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_founds, N * sizeof(bool)));\n  CUDA_CHECK(cudaMalloc(&d_found_values, N * DIM * 
sizeof(V)));\n  CUDA_CHECK(cudaMalloc(&d_found_scores, N * sizeof(S)));\n\n  CUDA_CHECK(\n      cudaMemcpy(d_keys, h_keys.data(), N * sizeof(K), cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_values, h_values.data(), N * DIM * sizeof(V),\n                        cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_scores, h_scores.data(), N * sizeof(S),\n                        cudaMemcpyHostToDevice));\n\n  table.insert_or_assign(N, d_keys, d_values, d_scores, 0, true);\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  // Find with scores=nullptr (CopyScoreEmpty branch).\n  table.find(N, d_keys, d_found_values, d_founds, /*scores=*/nullptr, 0);\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  bool* h_founds = new bool[N];\n  CUDA_CHECK(\n      cudaMemcpy(h_founds, d_founds, N * sizeof(bool), cudaMemcpyDeviceToHost));\n  for (size_t i = 0; i < N; i++) {\n    EXPECT_TRUE(h_founds[i]) << \"Key \" << h_keys[i] << \" not found (null scores)\";\n  }\n\n  // Find with scores!=nullptr (CopyScoreByPassCache branch).\n  table.find(N, d_keys, d_found_values, d_founds, d_found_scores, 0);\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CUDA_CHECK(\n      cudaMemcpy(h_founds, d_founds, N * sizeof(bool), cudaMemcpyDeviceToHost));\n  std::vector<S> h_found_scores(N);\n  CUDA_CHECK(cudaMemcpy(h_found_scores.data(), d_found_scores, N * sizeof(S),\n                        cudaMemcpyDeviceToHost));\n  for (size_t i = 0; i < N; i++) {\n    EXPECT_TRUE(h_founds[i])\n        << \"Key \" << h_keys[i] << \" not found (with scores)\";\n    if (h_founds[i]) {\n      EXPECT_GT(h_found_scores[i], static_cast<S>(0))\n          << \"Score should be non-zero for key \" << h_keys[i];\n    }\n  }\n\n  delete[] h_founds;\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_values));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_founds));\n  CUDA_CHECK(cudaFree(d_found_values));\n  CUDA_CHECK(cudaFree(d_found_scores));\n}\n\n// T7.2: Clear then re-insert (verifies 
dual_bucket_empty_digest reset).\nTEST(DualBucketTest, ClearAndReinsert) {\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  constexpr size_t CAPACITY = 128 * 128;\n  constexpr size_t N = 2048;\n\n  Table table;\n  create_memory_mode_table(table, CAPACITY);\n\n  std::vector<K> h_keys(N);\n  std::vector<V> h_values(N * DIM, 1.0f);\n  std::vector<S> h_scores(N);\n\n  std::iota(h_keys.begin(), h_keys.end(), 1);\n  for (size_t i = 0; i < N; i++) h_scores[i] = i + 1;\n\n  K* d_keys;\n  V* d_values;\n  S* d_scores;\n  bool* d_founds;\n  V* d_found_values;\n  CUDA_CHECK(cudaMalloc(&d_keys, N * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_values, N * DIM * sizeof(V)));\n  CUDA_CHECK(cudaMalloc(&d_scores, N * sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_founds, N * sizeof(bool)));\n  CUDA_CHECK(cudaMalloc(&d_found_values, N * DIM * sizeof(V)));\n\n  CUDA_CHECK(\n      cudaMemcpy(d_keys, h_keys.data(), N * sizeof(K), cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_values, h_values.data(), N * DIM * sizeof(V),\n                        cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_scores, h_scores.data(), N * sizeof(S),\n                        cudaMemcpyHostToDevice));\n\n  // Insert first batch.\n  table.insert_or_assign(N, d_keys, d_values, d_scores, 0, true);\n  CUDA_CHECK(cudaDeviceSynchronize());\n  EXPECT_EQ(table.size(0), N);\n\n  // Clear the table.\n  table.clear(0);\n  CUDA_CHECK(cudaDeviceSynchronize());\n  EXPECT_EQ(table.size(0), static_cast<size_t>(0));\n\n  // Re-insert different keys.\n  std::vector<K> h_keys2(N);\n  std::iota(h_keys2.begin(), h_keys2.end(), N + 1);  // Different keys\n  CUDA_CHECK(cudaMemcpy(d_keys, h_keys2.data(), N * sizeof(K),\n                        cudaMemcpyHostToDevice));\n\n  table.insert_or_assign(N, d_keys, d_values, d_scores, 0, true);\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  size_t table_size = table.size(0);\n  CUDA_CHECK(cudaDeviceSynchronize());\n  EXPECT_EQ(table_size, N)\n      << 
\"After clear + re-insert, table should have N entries\";\n\n  // Verify new keys are findable.\n  table.find(N, d_keys, d_found_values, d_founds, nullptr, 0);\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  bool* h_founds = new bool[N];\n  CUDA_CHECK(\n      cudaMemcpy(h_founds, d_founds, N * sizeof(bool), cudaMemcpyDeviceToHost));\n  for (size_t i = 0; i < N; i++) {\n    EXPECT_TRUE(h_founds[i])\n        << \"Key \" << h_keys2[i] << \" not found after clear + re-insert\";\n  }\n\n  // Verify old keys are NOT findable.\n  CUDA_CHECK(cudaMemcpy(d_keys, h_keys.data(), N * sizeof(K),\n                        cudaMemcpyHostToDevice));\n  table.find(N, d_keys, d_found_values, d_founds, nullptr, 0);\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CUDA_CHECK(\n      cudaMemcpy(h_founds, d_founds, N * sizeof(bool), cudaMemcpyDeviceToHost));\n  for (size_t i = 0; i < N; i++) {\n    EXPECT_FALSE(h_founds[i])\n        << \"Old key \" << h_keys[i] << \" still found after clear\";\n  }\n\n  delete[] h_founds;\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_values));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_founds));\n  CUDA_CHECK(cudaFree(d_found_values));\n}\n"
  },
  {
    "path": "tests/dynamic_max_capacity_test.cc.cu",
    "content": "/*\n * Copyright (c) 2023, NVIDIA CORPORATION.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n#include <gtest/gtest.h>\n#include <stdio.h>\n#include <array>\n#include <map>\n#include \"merlin/types.cuh\"\n#include \"merlin_hashtable.cuh\"\n#include \"merlin_localfile.hpp\"\n#include \"test_util.cuh\"\n\nconstexpr size_t dim = 64;\nusing i64 = int64_t;\nusing u64 = uint64_t;\nusing f32 = float;\nusing EvictStrategy = nv::merlin::EvictStrategy;\nusing TableOptions = nv::merlin::HashTableOptions;\n\n/*\n * test_dynamic_max_capcity_table creates a table in small\n * capacity and insert random kv pairs until its load_factor\n * became 1.0. Then expand the max_capacity. 
Keep inserting until\n * the load factor growth to 1.0 again.\n */\nvoid test_dynamic_max_capcity_table() {\n  size_t len = 10000llu;\n  size_t max_capacity = 1 << 14;\n  size_t init_capacity = 1 << 12;\n  size_t offset = 0;\n  size_t uplimit = 1 << 20;\n  float load_factor_threshold = 0.98f;\n\n  TableOptions opt;\n  opt.max_capacity = max_capacity;\n  opt.init_capacity = init_capacity;\n  opt.max_hbm_for_vectors = uplimit * dim * sizeof(f32);\n  using Table = nv::merlin::HashTable<i64, f32, u64, EvictStrategy::kLru>;\n  opt.dim = dim;\n\n  using Vec_t = test_util::ValueArray<f32, dim>;\n  std::map<i64, Vec_t> ref_map;\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  std::unique_ptr<Table> table = std::make_unique<Table>();\n  table->init(opt);\n\n  test_util::KVMSBuffer<i64, f32, u64> buffer;\n  buffer.Reserve(len, dim, stream);\n  test_util::KVMSBuffer<i64, f32, u64> evict_buffer;\n  evict_buffer.Reserve(len, dim, stream);\n\n  size_t total_len = 0;\n  while (true) {\n    buffer.ToRange(offset, /*skip=1*/ 1, stream);\n    size_t n_evicted = table->insert_and_evict(\n        len, buffer.keys_ptr(), buffer.values_ptr(), nullptr,\n        evict_buffer.keys_ptr(), evict_buffer.values_ptr(), nullptr, stream);\n    printf(\"Insert %zu keys and evict %zu\\n\", len, n_evicted);\n    offset += len;\n    total_len += len;\n    evict_buffer.SyncData(/*h2d=*/false, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    for (size_t i = 0; i < n_evicted; i++) {\n      Vec_t* vec =\n          reinterpret_cast<Vec_t*>(evict_buffer.values_ptr(false) + i * dim);\n      ref_map[evict_buffer.keys_ptr(false)[i]] = *vec;\n    }\n\n    if (table->load_factor(stream) >= load_factor_threshold) {\n      ASSERT_GE(table->size(stream),\n                static_cast<size_t>((static_cast<float>(max_capacity) *\n                                     load_factor_threshold)));\n      max_capacity *= 2;\n      if (max_capacity > uplimit) {\n        break;\n      }\n    
  // What we need.\n      printf(\"----> check change max_capacity from %zu to %zu\\n\",\n             table->capacity(), max_capacity);\n      table->set_max_capacity(max_capacity);\n      table->reserve(max_capacity, stream);\n      ASSERT_EQ(max_capacity, table->capacity());\n      ASSERT_LE(table->load_factor(stream), 0.5f);\n    }\n\n    if (total_len > uplimit * 2) {\n      throw std::runtime_error(\"Traverse too much keys but not finish test.\");\n    }\n  };\n\n  offset = 0;\n  for (; offset < table->capacity(); offset += len) {\n    size_t search_len = len;\n    if (offset + search_len > table->capacity()) {\n      search_len = table->capacity() - offset;\n    }\n    size_t n_exported =\n        table->export_batch(search_len, offset, buffer.keys_ptr(),\n                            buffer.values_ptr(), /*scores=*/nullptr, stream);\n    buffer.SyncData(/*h2d=*/false);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    for (size_t i = 0; i < n_exported; i++) {\n      Vec_t* vec = reinterpret_cast<Vec_t*>(buffer.values_ptr(false) + i * dim);\n      for (size_t j = 0; j < dim; j++) {\n        ASSERT_EQ(buffer.keys_ptr(false)[i], vec->operator[](j));\n      }\n      ref_map[buffer.keys_ptr(false)[i]] = *vec;\n    }\n  }\n\n  printf(\"---> uplimit: %zu\\n\", uplimit);\n  printf(\"---> table size: %zu\\n\", table->size(stream));\n  printf(\"---> table cap: %zu\\n\", table->capacity());\n  printf(\"---> cpu table size: %zu\\n\", ref_map.size());\n  for (auto& it : ref_map) {\n    for (size_t j = 0; j < dim; j++) {\n      ASSERT_EQ(static_cast<f32>(it.first), it.second.data[j]);\n    }\n  }\n  ASSERT_EQ(table->capacity() * 2, max_capacity);\n  ASSERT_GE(static_cast<float>(ref_map.size()),\n            static_cast<float>(table->capacity()) * load_factor_threshold);\n}\n\nTEST(MerlinHashTableTest, test_dynamic_max_capcity_table) {\n  test_dynamic_max_capcity_table();\n}\n"
  },
  {
    "path": "tests/export_batch_if_test.cc.cu",
    "content": "#include <cooperative_groups.h>\n#include <math.h>\n#include <stdio.h>\n#include <stdlib.h>\n#include <algorithm>\n#include <array>\n#include <iostream>\n#include <map>\n#include <thread>\n#include <unordered_map>\n#include <vector>\n#include \"merlin/types.cuh\"\n#include \"merlin_hashtable.cuh\"\n#include \"test_util.cuh\"\n\nnamespace cg = cooperative_groups;\n\nusing i64 = int64_t;\nusing u64 = uint64_t;\nusing f32 = float;\nusing EvictStrategy = nv::merlin::EvictStrategy;\nusing TableOptions = nv::merlin::HashTableOptions;\nusing Table = nv::merlin::HashTable<i64, f32, u64, EvictStrategy::kCustomized>;\n\ntemplate <class K, class S>\nstruct ExportIfPredFunctor {\n  __forceinline__ __device__ bool operator()(const K& key, S& score,\n                                             const K& pattern,\n                                             const S& threshold) {\n    return score < threshold;\n  }\n};\n\ntemplate <class K, class V, class S>\nstruct ExportIfPredFunctorV2 {\n  K pattern;\n  S threshold;\n  ExportIfPredFunctorV2(K pattern, S threshold)\n      : pattern(pattern), threshold(threshold) {}\n  template <int GroupSize>\n  __forceinline__ __device__ bool operator()(\n      const K& key, const V* value, const S& score,\n      cg::thread_block_tile<GroupSize>& g) {\n    /* evaluate key, score and value. */\n    return score < threshold;\n  }\n};\n\ntemplate <class K, class V, class S>\nstruct ExportIfPredFunctorV3 {\n  K pattern;\n  S threshold;\n  int dim;\n  ExportIfPredFunctorV3(K pattern, S threshold)\n      : pattern(pattern), threshold(threshold) {}\n  template <int GroupSize>\n  __forceinline__ __device__ bool operator()(\n      const K& key, const V* value, const S& score,\n      cg::thread_block_tile<GroupSize>& g) {\n    /* evaluate key, score and value. 
*/\n    bool pred = score < threshold;\n\n    for (int i = 0; i < g.size(); i++) {\n      auto cur_value = g.shfl(value, i);\n      auto cur_key = g.shfl(key, i);\n      bool cur_pred = g.shfl(pred, i);\n      if (cur_pred == false) continue;\n      unsigned int vote = 0;\n      /* evaluate one value cooperatively in one loop. */\n      for (int j = g.thread_rank(); j < dim; j += g.size()) {\n        if (cur_value[j] != cur_key) cur_pred = false;\n        vote = g.ballot(cur_pred == false);\n        if (vote != 0) break;\n      }\n      if (g.thread_rank() == i && vote != 0) pred = false;\n    }\n    return pred;\n  }\n};\n\n// Using for_each API to simulate export_batch_if_v2 API.\ntemplate <class K, class V, class S>\nstruct ForEachExecutionFuncV4 {\n  K pattern;\n  S threshold;\n  int dim;\n  uint64_t* d_counter;\n  K* out_keys;\n  V* out_vals;\n  S* out_scores;\n  ForEachExecutionFuncV4(K pattern, S threshold)\n      : pattern(pattern), threshold(threshold) {}\n  template <int GroupSize>\n  __forceinline__ __device__ void operator()(\n      const K& key, V* value, S* score, cg::thread_block_tile<GroupSize>& g) {\n    S score_val = *score;\n    bool match = score_val < threshold;\n    uint32_t vote = g.ballot(match);\n    int group_cnt = __popc(vote);\n    uint64_t group_offset = 0;\n    if (g.thread_rank() == 0) {\n      group_offset = atomicAdd(d_counter, static_cast<uint64_t>(group_cnt));\n    }\n    group_offset = g.shfl(group_offset, 0);\n    int previous_cnt = group_cnt - __popc(vote >> g.thread_rank());\n    if (match) {\n      out_keys[group_offset + previous_cnt] = key;\n      if (out_scores) {\n        out_scores[group_offset + previous_cnt] = score_val;\n      }\n    }\n    for (int r = 0; r < GroupSize; r++) {\n      uint32_t biased_vote = vote >> r;\n      bool cur_match = biased_vote & 1;\n      if (cur_match) {\n        int bias = group_cnt - __popc(biased_vote);\n        V* cur_vals = g.shfl(value, r);\n        for (int j = g.thread_rank(); j < 
dim; j += GroupSize) {\n          out_vals[(group_offset + bias) * dim + j] = cur_vals[j];\n        }\n      }\n    }\n  }\n};\n\nenum class ExportIfVersion { V1, V2, V3, V4 };\n\ntemplate <ExportIfVersion EV>\nvoid test_export_batch_if_with_limited_size() {\n  constexpr uint64_t CAP = 1llu << 24;\n  size_t n0 = (1llu << 23) - 163;\n  size_t n1 = (1llu << 23) + 221;\n  size_t n2 = (1llu << 23) - 17;\n  size_t dim = 64;\n  size_t table_size = 0;\n  i64 pattern = 0;\n  u64 threshold = 40;\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  TableOptions options;\n  options.init_capacity = CAP;\n  options.max_capacity = CAP;\n  options.dim = dim;\n  options.max_hbm_for_vectors = nv::merlin::GB(100);\n\n  std::unique_ptr<Table> table = std::make_unique<Table>();\n  table->init(options);\n\n  size_t* d_cnt = nullptr;\n  CUDA_CHECK(cudaMallocAsync(&d_cnt, sizeof(size_t), stream));\n  CUDA_CHECK(cudaMemsetAsync(d_cnt, 0, sizeof(size_t), stream));\n\n  test_util::KVMSBuffer<i64, f32, u64> buffer0;\n  buffer0.Reserve(n0, dim, stream);\n  buffer0.ToRange(0, 1, stream);\n  buffer0.Setscore((u64)15, stream);\n  {\n    test_util::KVMSBuffer<i64, f32, u64> buffer0_ev;\n    buffer0_ev.Reserve(n0, dim, stream);\n    buffer0_ev.ToZeros(stream);\n    // table->insert_or_assign(n0, buffer0.keys_ptr(), buffer0.values_ptr(),\n    //                         buffer0.scores_ptr(), stream, true, false);\n    table->insert_and_evict(n0, buffer0.keys_ptr(), buffer0.values_ptr(),\n                            buffer0.scores_ptr(), buffer0_ev.keys_ptr(),\n                            buffer0_ev.values_ptr(), buffer0_ev.scores_ptr(),\n                            d_cnt, stream, true, false);\n    table_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    MERLIN_EXPECT_TRUE(table_size == n0, \"Invalid table size.\");\n  }\n\n  test_util::KVMSBuffer<i64, f32, u64> buffer1;\n  buffer1.Reserve(n1, dim, stream);\n  buffer1.ToRange(n0, 1, stream);\n  
buffer1.Setscore((u64)30, stream);\n  {\n    test_util::KVMSBuffer<i64, f32, u64> buffer1_ev;\n    buffer1_ev.Reserve(n0, dim, stream);\n    buffer1_ev.ToZeros(stream);\n    // table->insert_or_assign(n1, buffer1.keys_ptr(), buffer1.values_ptr(),\n    //                         buffer1.scores_ptr(), stream, true, false);\n    table->insert_and_evict(n0, buffer1.keys_ptr(), buffer1.values_ptr(),\n                            buffer1.scores_ptr(), buffer1_ev.keys_ptr(),\n                            buffer1_ev.values_ptr(), buffer1_ev.scores_ptr(),\n                            d_cnt, stream, true, false);\n    table_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n  }\n\n  test_util::KVMSBuffer<i64, f32, u64> buffer2;\n  buffer2.Reserve(n2, dim, stream);\n  buffer2.ToRange(n0 + n1, 1, stream);\n  buffer2.Setscore((u64)45, stream);\n  {\n    test_util::KVMSBuffer<i64, f32, u64> buffer2_ev;\n    buffer2_ev.Reserve(n0, dim, stream);\n    buffer2_ev.ToZeros(stream);\n    // table->insert_or_assign(n2, buffer2.keys_ptr(), buffer2.values_ptr(),\n    //                         buffer2.scores_ptr(), stream, true, false);\n    table->insert_and_evict(n0, buffer2.keys_ptr(), buffer2.values_ptr(),\n                            buffer2.scores_ptr(), buffer2_ev.keys_ptr(),\n                            buffer2_ev.values_ptr(), buffer2_ev.scores_ptr(),\n                            d_cnt, stream, true, false);\n    table_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    printf(\"final size: %zu, capacity: %zu\\n\", table_size, table->capacity());\n  }\n\n  size_t h_cnt = 0;\n  size_t h_cnt2 = 0;\n\n  table->size_if<ExportIfPredFunctor>(pattern, threshold, d_cnt, stream);\n  CUDA_CHECK(cudaMemcpyAsync(&h_cnt, d_cnt, sizeof(size_t),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  printf(\"---> check h_cnt from size_if kernel: %zu\\n\", h_cnt);\n\n  
test_util::KVMSBuffer<i64, f32, u64> buffer_out;\n  buffer_out.Reserve(h_cnt, dim, stream);\n  buffer_out.ToZeros(stream);\n\n  CUDA_CHECK(cudaMemsetAsync(d_cnt, 0, sizeof(size_t), stream));\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n\n  bool use_pin = false;\n\n  uint64_t t0 = test_util::getTimestamp();\n  cudaEvent_t start, stop;\n  cudaEventCreate(&start);\n  cudaEventCreate(&stop);\n  cudaEventRecord(start);\n  if (EV == ExportIfVersion::V1) {\n    table->export_batch_if<ExportIfPredFunctor>(\n        pattern, threshold, static_cast<size_t>(CAP), 0, d_cnt,\n        buffer_out.keys_ptr(!use_pin), buffer_out.values_ptr(!use_pin),\n        buffer_out.scores_ptr(!use_pin), stream);\n  } else if (EV == ExportIfVersion::V2) {\n    ExportIfPredFunctorV2<i64, f32, u64> pred(pattern, threshold);\n    table->export_batch_if_v2<ExportIfPredFunctorV2<i64, f32, u64>>(\n        pred, static_cast<size_t>(CAP), 0, d_cnt, buffer_out.keys_ptr(!use_pin),\n        buffer_out.values_ptr(!use_pin), buffer_out.scores_ptr(!use_pin),\n        stream);\n  } else if (EV == ExportIfVersion::V3) {\n    ExportIfPredFunctorV3<i64, f32, u64> pred(pattern, threshold);\n    pred.dim = dim;\n    table->export_batch_if_v2<ExportIfPredFunctorV3<i64, f32, u64>>(\n        pred, static_cast<size_t>(CAP), 0, d_cnt, buffer_out.keys_ptr(!use_pin),\n        buffer_out.values_ptr(!use_pin), buffer_out.scores_ptr(!use_pin),\n        stream);\n  } else if (EV == ExportIfVersion::V4) {\n    ForEachExecutionFuncV4<i64, f32, u64> f(pattern, threshold);\n    f.dim = dim;\n    f.d_counter = d_cnt;\n    f.out_keys = buffer_out.keys_ptr(!use_pin);\n    f.out_vals = buffer_out.values_ptr(!use_pin);\n    f.out_scores = buffer_out.scores_ptr(!use_pin);\n    table->for_each<ForEachExecutionFuncV4<i64, f32, u64>>(\n        0, static_cast<size_t>(CAP), f, stream);\n  }\n  cudaEventRecord(stop);\n  CUDA_CHECK(cudaMemcpyAsync(&h_cnt2, d_cnt, sizeof(size_t),\n                             cudaMemcpyDeviceToHost, 
stream));\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  printf(\"final h_cnt2: %zu\\n\", h_cnt2);\n\n  MERLIN_EXPECT_TRUE(\n      h_cnt == h_cnt2,\n      \"size_if and export_batch_if get different matching count.\");\n  float cu_cost = 0;\n  cudaEventElapsedTime(&cu_cost, start, stop);\n  uint64_t t1 = test_util::getTimestamp();\n  printf(\"final h_cnt2: %zu, cost: %zu, cu_cost: %f\\n\", h_cnt2, t1 - t0,\n         cu_cost);\n\n  if (!use_pin) {\n    buffer_out.SyncData(false, stream);\n  }\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  uint64_t t2 = test_util::getTimestamp();\n  printf(\"use_pin: %d. After sycn data of len: %zu, total cost: %zu\\n\", use_pin,\n         h_cnt2, t2 - t0);\n\n  std::unordered_map<i64, u64> record;\n  for (size_t i = 0; i < h_cnt; i++) {\n    i64 key = buffer_out.keys_ptr(false)[i];\n    u64 score = buffer_out.scores_ptr(false)[i];\n    MERLIN_EXPECT_TRUE(score < threshold, \"\");\n    record[key] = score;\n    for (int j = 0; j < dim; j++) {\n      f32 value = buffer_out.values_ptr(false)[i * dim + j];\n      MERLIN_EXPECT_TRUE(key == static_cast<i64>(value), \"\");\n    }\n  }\n  MERLIN_EXPECT_TRUE(record.size() == h_cnt2, \"\");\n  printf(\"record: %zu\\n\", record.size());\n  printf(\"n0+n1: %zu\\n\", n0 + n1);\n  printf(\"n0+n1+n2: %zu\\n\", n0 + n1 + n2);\n  printf(\"done\\n\");\n}\n\nint main() {\n  test_export_batch_if_with_limited_size<ExportIfVersion::V1>();\n  test_export_batch_if_with_limited_size<ExportIfVersion::V2>();\n  test_export_batch_if_with_limited_size<ExportIfVersion::V3>();\n  test_export_batch_if_with_limited_size<ExportIfVersion::V4>();\n  return 0;\n}\n"
  },
  {
    "path": "tests/find_or_insert_ptr_lock_test.cc.cu",
    "content": "/*\n * Copyright (c) 2025, NVIDIA CORPORATION.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n/*\n * test APIs: find_or_insert_ptr, unlock_keys\n */\n\n#include <gtest/gtest.h>\n#include <math.h>\n#include <stdio.h>\n#include <stdlib.h>\n#include <algorithm>\n#include <array>\n#include <iostream>\n#include <thread>\n#include <vector>\n#include \"merlin_hashtable.cuh\"\n#include \"test_util.cuh\"\n\nconstexpr size_t DIM = 16;\nusing K = uint64_t;\nusing V = float;\nusing S = uint64_t;\nusing EvictStrategy = nv::merlin::EvictStrategy;\nusing TableOptions = nv::merlin::HashTableOptions;\n\ntemplate <class K, class S>\nstruct EraseIfPredFunctor {\n  __forceinline__ __device__ bool operator()(const K& key, S& score,\n                                             const K& pattern,\n                                             const S& threshold) {\n    return ((key & 0x7f > pattern) && (score > threshold));\n  }\n};\n\ntemplate <class K, class S>\nstruct ExportIfPredFunctor {\n  __forceinline__ __device__ bool operator()(const K& key, S& score,\n                                             const K& pattern,\n                                             const S& threshold) {\n    return score > threshold;\n  }\n};\n\ntemplate <typename T>\nvoid find_or_insert_safe_ptr(T* table, uint64_t KEY_NUM, K* d_keys, S* d_scores,\n                             V* d_vectors, uint64_t dim, cudaStream_t& stream) {\n  V** d_vectors_ptr = nullptr;\n 
 bool* d_found;\n  K** d_key_ptrs = nullptr;\n  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n  CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n  CUDA_CHECK(cudaMalloc(&d_key_ptrs, KEY_NUM * sizeof(K*)));\n\n  table->find_or_insert(KEY_NUM, d_keys, d_vectors_ptr, d_found, d_scores,\n                        stream, true, false, d_key_ptrs);\n  test_util::read_or_write_ptr(d_vectors_ptr, d_vectors, d_found, dim, KEY_NUM,\n                               stream);\n  /// TODO:check the d_found\n  table->unlock_keys(KEY_NUM, d_key_ptrs, d_keys, d_found, stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  CUDA_CHECK(cudaFree(d_key_ptrs));\n  CUDA_CHECK(cudaFree(d_vectors_ptr));\n  CUDA_CHECK(cudaFree(d_found));\n}\n\nvoid test_basic(size_t max_hbm_for_vectors, int key_start = 0) {\n  constexpr uint64_t INIT_CAPACITY = 64 * 1024 * 1024UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t KEY_NUM = 1 * 1024 * 1024UL;\n  constexpr uint64_t TEST_TIMES = 1;\n\n  K* h_keys;\n  S* h_scores;\n  V* h_vectors;\n  bool* h_found;\n\n  TableOptions options;\n\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));\n\n  CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n\n  test_util::create_random_keys<K, S, V, DIM>(h_keys, h_scores, h_vectors,\n                                              KEY_NUM);\n\n  K* d_keys;\n  S* d_scores = nullptr;\n  V* d_vectors;\n  V* d_new_vectors;\n  bool* d_found;\n  
size_t dump_counter = 0;\n\n  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_new_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n\n  CUDA_CHECK(\n      cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),\n                        cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors, KEY_NUM * sizeof(V) * options.dim,\n                        cudaMemcpyHostToDevice));\n\n  CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  uint64_t total_size = 0;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    find_or_insert_safe_ptr(table.get(), KEY_NUM, d_keys, d_scores, d_vectors,\n                            options.dim, stream);\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, KEY_NUM);\n\n    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n    CUDA_CHECK(cudaMemset(d_scores, 0, KEY_NUM * sizeof(S)));\n    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n\n    {\n      V** d_vectors_ptr = nullptr;\n      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n      table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, d_scores, stream);\n      test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,\n                               stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n   
   CUDA_CHECK(cudaFree(d_vectors_ptr));\n    }\n\n    int found_num = 0;\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n\n    for (int i = 0; i < KEY_NUM; i++) {\n      if (h_found[i]) found_num++;\n      ASSERT_EQ(h_scores[i], h_keys[i]);\n      for (int j = 0; j < options.dim; j++) {\n        ASSERT_EQ(h_vectors[i * options.dim + j],\n                  static_cast<float>(h_keys[i] * 0.00001));\n      }\n    }\n    ASSERT_EQ(found_num, KEY_NUM);\n\n    CUDA_CHECK(cudaMemset(d_new_vectors, 2, KEY_NUM * sizeof(V) * options.dim));\n    table->assign(KEY_NUM, reinterpret_cast<const K*>(d_keys),\n                  reinterpret_cast<const float*>(d_new_vectors),\n                  reinterpret_cast<const S*>(d_scores), stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, KEY_NUM);\n\n    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n    CUDA_CHECK(cudaMemset(d_new_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    {\n      V** d_vectors_ptr = nullptr;\n      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n      test_util::array2ptr(d_vectors_ptr, d_new_vectors, options.dim, KEY_NUM,\n                           stream);\n      table->find(KEY_NUM, d_keys, d_new_vectors, d_found, d_scores, stream);\n      test_util::read_from_ptr(d_vectors_ptr, d_new_vectors, options.dim,\n                               KEY_NUM, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      CUDA_CHECK(cudaFree(d_vectors_ptr));\n    }\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n  
  CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_new_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n    found_num = 0;\n    uint32_t i_value = 0x2020202;\n    for (int i = 0; i < KEY_NUM; i++) {\n      if (h_found[i]) found_num++;\n      for (int j = 0; j < options.dim; j++) {\n        ASSERT_EQ(h_vectors[i * options.dim + j],\n                  *(reinterpret_cast<float*>(&i_value)));\n      }\n    }\n    ASSERT_EQ(found_num, KEY_NUM);\n\n    table->accum_or_assign(KEY_NUM, d_keys, d_vectors, d_found, d_scores,\n                           stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, KEY_NUM);\n\n    table->erase(KEY_NUM >> 1, d_keys, stream);\n    size_t total_size_after_erase = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size_after_erase, total_size >> 1);\n\n    table->clear(stream);\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    find_or_insert_safe_ptr(table.get(), KEY_NUM, d_keys, d_scores, d_vectors,\n                            options.dim, stream);\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, KEY_NUM);\n\n    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n    CUDA_CHECK(cudaMemset(d_scores, 0, KEY_NUM * sizeof(S)));\n    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n\n    {\n      V** d_vectors_ptr = nullptr;\n      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n      table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, d_scores, stream);\n      
test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,\n                               stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      CUDA_CHECK(cudaFree(d_vectors_ptr));\n    }\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    found_num = 0;\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n\n    for (int i = 0; i < KEY_NUM; i++) {\n      if (h_found[i]) found_num++;\n      ASSERT_EQ(h_scores[i], h_keys[i]);\n      for (int j = 0; j < options.dim; j++) {\n        ASSERT_EQ(h_vectors[i * options.dim + j],\n                  static_cast<float>(h_keys[i] * 0.00001));\n      }\n    }\n    ASSERT_EQ(found_num, KEY_NUM);\n\n    CUDA_CHECK(cudaMemset(d_keys, 0, KEY_NUM * sizeof(K)));\n    CUDA_CHECK(cudaMemset(d_scores, 0, KEY_NUM * sizeof(S)));\n    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    dump_counter = table->export_batch(table->capacity(), 0, d_keys, d_vectors,\n                                       d_scores, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    ASSERT_EQ(dump_counter, KEY_NUM);\n    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyDeviceToHost));\n    for (int i = 0; i < KEY_NUM; i++) {\n      ASSERT_EQ(h_scores[i], h_keys[i]);\n      for (int j = 0; j < options.dim; j++) {\n        
ASSERT_EQ(h_vectors[i * options.dim + j],\n                  static_cast<float>(h_keys[i] * 0.00001));\n      }\n    }\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFreeHost(h_keys));\n  CUDA_CHECK(cudaFreeHost(h_scores));\n  CUDA_CHECK(cudaFreeHost(h_vectors));\n  CUDA_CHECK(cudaFreeHost(h_found));\n\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_vectors));\n  CUDA_CHECK(cudaFree(d_new_vectors));\n  CUDA_CHECK(cudaFree(d_found));\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_basic_when_full(size_t max_hbm_for_vectors, int key_start = 0) {\n  constexpr uint64_t INIT_CAPACITY = 1 * 1024 * 1024UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t KEY_NUM = 1 * 1024 * 1024UL;\n  constexpr uint64_t TEST_TIMES = 1;\n\n  K* h_keys;\n  S* h_scores;\n  V* h_vectors;\n  bool* h_found;\n\n  TableOptions options;\n\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));\n\n  CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n\n  test_util::create_random_keys<K, S, V, DIM>(h_keys, h_scores, nullptr,\n                                              KEY_NUM);\n\n  K* d_keys;\n  S* d_scores = nullptr;\n  V* d_vectors;\n  V* d_def_val;\n  V** d_vectors_ptr;\n  bool* d_found;\n\n  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * 
sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_def_val, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n\n  CUDA_CHECK(\n      cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),\n                        cudaMemcpyHostToDevice));\n\n  CUDA_CHECK(cudaMemset(d_vectors, 1, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMemset(d_def_val, 2, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMemset(d_vectors_ptr, 0, KEY_NUM * sizeof(V*)));\n  CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  uint64_t total_size = 0;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    find_or_insert_safe_ptr(table.get(), KEY_NUM, d_keys, d_scores, d_vectors,\n                            options.dim, stream);\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    uint64_t total_size_after_insert = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    table->erase(KEY_NUM, d_keys, stream);\n    size_t total_size_after_erase = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size_after_erase, 0);\n\n    find_or_insert_safe_ptr(table.get(), KEY_NUM, d_keys, d_scores, d_vectors,\n                            options.dim, stream);\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    uint64_t total_size_after_reinsert = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size_after_insert, total_size_after_reinsert);\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  
CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,\n                        cudaMemcpyDeviceToHost));\n\n  CUDA_CHECK(cudaFreeHost(h_keys));\n  CUDA_CHECK(cudaFreeHost(h_scores));\n  // h_vectors is pinned host memory (cudaMallocHost); release it as well.\n  CUDA_CHECK(cudaFreeHost(h_vectors));\n  CUDA_CHECK(cudaFreeHost(h_found));\n\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_vectors));\n  CUDA_CHECK(cudaFree(d_def_val));\n  CUDA_CHECK(cudaFree(d_vectors_ptr));\n  CUDA_CHECK(cudaFree(d_found));\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_erase_if_pred(size_t max_hbm_for_vectors, int key_start = 0) {\n  constexpr uint64_t INIT_CAPACITY = 256UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t KEY_NUM = 128UL;\n  constexpr uint64_t TEST_TIMES = 1;\n  constexpr uint64_t BUCKET_MAX_SIZE = 128;\n\n  K* h_keys;\n  S* h_scores;\n  V* h_vectors;\n  bool* h_found;\n\n  TableOptions options;\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  std::unique_ptr<Table> table = std::make_unique<Table>();\n  table->init(options);\n\n  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));\n\n  K* d_keys;\n  S* d_scores = nullptr;\n  V* d_vectors;\n  bool* d_found;\n\n  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  uint64_t total_size = 
0;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n        h_keys, h_scores, h_vectors, KEY_NUM, INIT_CAPACITY);\n    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    find_or_insert_safe_ptr(table.get(), KEY_NUM, d_keys, d_scores, d_vectors,\n                            options.dim, stream);\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n    K pattern = 100;\n    S threshold = 0;\n    size_t erase_num = table->template erase_if<EraseIfPredFunctor>(\n        pattern, threshold, stream);\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ((erase_num + total_size), BUCKET_MAX_SIZE);\n\n    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n\n    {\n      V** d_vectors_ptr = nullptr;\n      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n      table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, d_scores, stream);\n      test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,\n                               stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      CUDA_CHECK(cudaFree(d_vectors_ptr));\n    }\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    int found_num = 0;\n\n    CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * 
sizeof(bool)));\n    CUDA_CHECK(cudaMemset(h_scores, 0, KEY_NUM * sizeof(S)));\n    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n    for (int i = 0; i < KEY_NUM; i++) {\n      if (h_found[i]) {\n        found_num++;\n        ASSERT_EQ(h_scores[i], h_keys[i]);\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors[i * options.dim + j],\n                    static_cast<float>(h_keys[i] * 0.00001));\n        }\n      }\n    }\n    ASSERT_EQ(found_num, (BUCKET_MAX_SIZE - erase_num));\n\n    table->clear(stream);\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,\n                        cudaMemcpyDeviceToHost));\n\n  CUDA_CHECK(cudaFreeHost(h_keys));\n  CUDA_CHECK(cudaFreeHost(h_scores));\n  CUDA_CHECK(cudaFreeHost(h_vectors));\n  CUDA_CHECK(cudaFreeHost(h_found));\n\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_vectors));\n  CUDA_CHECK(cudaFree(d_found));\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_rehash(size_t max_hbm_for_vectors, int key_start = 0) {\n  constexpr uint64_t BUCKET_MAX_SIZE = 128ul;\n  constexpr uint64_t INIT_CAPACITY = BUCKET_MAX_SIZE;\n  constexpr uint64_t MAX_CAPACITY = 4 * INIT_CAPACITY;\n  constexpr uint64_t KEY_NUM = BUCKET_MAX_SIZE * 2;\n  constexpr uint64_t TEST_TIMES = 100;\n  K* h_keys;\n  S* h_scores;\n  V* 
h_vectors;\n  bool* h_found;\n\n  TableOptions options;\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_bucket_size = BUCKET_MAX_SIZE;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));\n\n  K* d_keys;\n  S* d_scores = nullptr;\n  V* d_vectors;\n  bool* d_found;\n  size_t dump_counter = 0;\n\n  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  uint64_t total_size = 0;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n    test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n        h_keys, h_scores, h_vectors, KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE);\n    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    
find_or_insert_safe_ptr(table.get(), KEY_NUM, d_keys, d_scores, d_vectors,\n                            options.dim, stream);\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaDeviceSynchronize());\n    ASSERT_EQ(total_size, KEY_NUM);\n\n    dump_counter = table->export_batch(table->capacity(), 0, d_keys, d_vectors,\n                                       d_scores, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(dump_counter, KEY_NUM);\n\n    table->reserve(MAX_CAPACITY, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(table->capacity(), MAX_CAPACITY);\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, KEY_NUM);\n\n    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    {\n      V** d_vectors_ptr = nullptr;\n      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, BUCKET_MAX_SIZE * sizeof(V*)));\n      table->find(BUCKET_MAX_SIZE, d_keys, d_vectors_ptr, d_found, d_scores,\n                  stream);\n      test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim,\n                               BUCKET_MAX_SIZE, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      CUDA_CHECK(cudaFree(d_vectors_ptr));\n    }\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    int found_num = 0;\n\n    CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));\n    CUDA_CHECK(cudaMemset(h_scores, 0, KEY_NUM * sizeof(S)));\n    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyDeviceToHost));\n    
CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n    for (int i = 0; i < BUCKET_MAX_SIZE; i++) {\n      if (h_found[i]) {\n        found_num++;\n        ASSERT_EQ(h_scores[i], h_keys[i]);\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors[i * options.dim + j],\n                    static_cast<float>(h_keys[i] * 0.00001));\n        }\n      }\n    }\n    ASSERT_EQ(found_num, BUCKET_MAX_SIZE);\n\n    table->clear(stream);\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,\n                        cudaMemcpyDeviceToHost));\n\n  CUDA_CHECK(cudaFreeHost(h_keys));\n  CUDA_CHECK(cudaFreeHost(h_scores));\n  CUDA_CHECK(cudaFreeHost(h_vectors));\n  CUDA_CHECK(cudaFreeHost(h_found));\n\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_vectors));\n  CUDA_CHECK(cudaFree(d_found));\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_rehash_on_big_batch(size_t max_hbm_for_vectors, int key_start = 0) {\n  constexpr uint64_t INIT_CAPACITY = 1024;\n  constexpr uint64_t MAX_CAPACITY = 16 * 1024;\n  constexpr uint64_t INIT_KEY_NUM = 1024;\n  constexpr uint64_t KEY_NUM = 2048;\n  K* h_keys;\n  S* h_scores;\n  V* h_vectors;\n  bool* h_found;\n\n  TableOptions options;\n  // Honor the caller-supplied reserved-key start bit, as the sibling tests do.\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_bucket_size = 128;\n  options.max_load_factor = 0.6;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMallocHost(&h_scores, 
KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));\n\n  K* d_keys;\n  S* d_scores = nullptr;\n  V* d_vectors;\n  bool* d_found;\n  size_t dump_counter = 0;\n\n  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  uint64_t total_size = 0;\n  uint64_t expected_size = 0;\n  std::unique_ptr<Table> table = std::make_unique<Table>();\n  table->init(options);\n\n  test_util::create_random_keys<K, S, V, DIM>(h_keys, h_scores, h_vectors,\n                                              KEY_NUM);\n\n  CUDA_CHECK(\n      cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),\n                        cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors, KEY_NUM * sizeof(V) * options.dim,\n                        cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n\n  total_size = table->size(stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  ASSERT_EQ(total_size, 0);\n\n  find_or_insert_safe_ptr(table.get(), INIT_KEY_NUM, d_keys, d_scores,\n                          d_vectors, options.dim, stream);\n\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  expected_size = INIT_KEY_NUM;\n\n  total_size = table->size(stream);\n  CUDA_CHECK(cudaDeviceSynchronize());\n  ASSERT_EQ(total_size, expected_size);\n  ASSERT_EQ(table->capacity(), (INIT_CAPACITY * 2));\n\n  find_or_insert_safe_ptr(table.get(), KEY_NUM, d_keys, d_scores, d_vectors,\n                          options.dim, stream);\n\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  expected_size = KEY_NUM;\n\n  
total_size = table->size(stream);\n  CUDA_CHECK(cudaDeviceSynchronize());\n  ASSERT_EQ(total_size, expected_size);\n  ASSERT_EQ(table->capacity(), KEY_NUM * 4);\n\n  dump_counter = table->export_batch(table->capacity(), 0, d_keys, d_vectors,\n                                     d_scores, stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  ASSERT_EQ(dump_counter, expected_size);\n\n  CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMemset(d_scores, 0, KEY_NUM * sizeof(S)));\n  {\n    V** d_vectors_ptr = nullptr;\n    CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n    table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, d_scores, stream);\n    test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,\n                             stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    CUDA_CHECK(cudaFree(d_vectors_ptr));\n  }\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  int found_num = 0;\n\n  CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));\n  CUDA_CHECK(cudaMemset(h_scores, 0, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(\n      cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K), cudaMemcpyDeviceToHost));\n  CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                        cudaMemcpyDeviceToHost));\n  CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),\n                        cudaMemcpyDeviceToHost));\n  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,\n                        cudaMemcpyDeviceToHost));\n  for (int i = 0; i < KEY_NUM; i++) {\n    if (h_found[i]) {\n      found_num++;\n      ASSERT_EQ(h_scores[i], h_keys[i]);\n      for (int j = 0; j < options.dim; j++) {\n        ASSERT_EQ(h_vectors[i * options.dim + j],\n                  static_cast<float>(h_keys[i] * 0.00001));\n      }\n    }\n  }\n  ASSERT_EQ(found_num, KEY_NUM);\n\n  
table->clear(stream);\n  total_size = table->size(stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  ASSERT_EQ(total_size, 0);\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,\n                        cudaMemcpyDeviceToHost));\n\n  CUDA_CHECK(cudaFreeHost(h_keys));\n  CUDA_CHECK(cudaFreeHost(h_scores));\n  CUDA_CHECK(cudaFreeHost(h_vectors));\n  CUDA_CHECK(cudaFreeHost(h_found));\n\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_vectors));\n  CUDA_CHECK(cudaFree(d_found));\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_dynamic_rehash_on_multi_threads(size_t max_hbm_for_vectors,\n                                          int key_start = 0) {\n  constexpr uint64_t BUCKET_MAX_SIZE = 128ul;\n  constexpr uint64_t INIT_CAPACITY = 4 * 1024;\n  constexpr uint64_t MAX_CAPACITY = 16 * 1024 * INIT_CAPACITY;\n  constexpr uint64_t KEY_NUM = 256;\n  constexpr uint64_t THREAD_N = 8;\n\n  std::vector<std::thread> threads;\n\n  TableOptions options;\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_load_factor = 0.50f;\n  options.max_bucket_size = BUCKET_MAX_SIZE;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;\n\n  std::shared_ptr<Table> table = std::make_shared<Table>();\n  table->init(options);\n\n  auto worker_function = [&table, KEY_NUM, options](int task_n) {\n    K* h_keys;\n    V* h_vectors;\n    bool* h_found;\n\n    size_t current_capacity = table->capacity();\n\n    CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));\n    CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));\n\n    K* d_keys;\n    V* d_vectors;\n    
bool* d_found;\n\n    CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));\n    CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n\n    cudaStream_t stream;\n    CUDA_CHECK(cudaStreamCreate(&stream));\n\n    while (table->capacity() < MAX_CAPACITY) {\n      test_util::create_random_keys<K, S, V, DIM>(h_keys, nullptr, h_vectors,\n                                                  KEY_NUM);\n      CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),\n                            cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,\n                            KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n\n      find_or_insert_safe_ptr(table.get(), KEY_NUM, d_keys, nullptr, d_vectors,\n                              options.dim, stream);\n\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n      {\n        V** d_vectors_ptr = nullptr;\n        CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n        table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, nullptr, stream);\n        test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,\n                                 stream);\n        CUDA_CHECK(cudaStreamSynchronize(stream));\n        CUDA_CHECK(cudaFree(d_vectors_ptr));\n      }\n\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      int found_num = 0;\n\n      CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));\n      CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n      CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),\n                            cudaMemcpyDeviceToHost));\n      CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                            
cudaMemcpyDeviceToHost));\n\n      CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                            KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDeviceToHost));\n      for (int i = 0; i < KEY_NUM; i++) {\n        if (h_found[i]) {\n          found_num++;\n          for (int j = 0; j < options.dim; j++) {\n            ASSERT_EQ(h_vectors[i * options.dim + j],\n                      static_cast<float>(h_keys[i] * 0.00001));\n          }\n        }\n      }\n      ASSERT_EQ(found_num, KEY_NUM);\n      if (task_n == 0 && current_capacity != table->capacity()) {\n        std::cout << \"[test_dynamic_rehash_on_multi_threads] The capacity \"\n                     \"changed from \"\n                  << current_capacity << \" to \" << table->capacity()\n                  << std::endl;\n        current_capacity = table->capacity();\n      }\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n    }\n    CUDA_CHECK(cudaStreamDestroy(stream));\n\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n\n    CUDA_CHECK(cudaFreeHost(h_keys));\n    CUDA_CHECK(cudaFreeHost(h_found));\n    CUDA_CHECK(cudaFreeHost(h_vectors));\n\n    CUDA_CHECK(cudaFree(d_keys));\n    CUDA_CHECK(cudaFree(d_vectors));\n    CUDA_CHECK(cudaFree(d_found));\n    CUDA_CHECK(cudaDeviceSynchronize());\n\n    CudaCheckError();\n  };\n\n  for (int i = 0; i < THREAD_N; ++i)\n    threads.emplace_back(std::thread(worker_function, i));\n\n  for (auto& th : threads) {\n    th.join();\n  }\n  ASSERT_EQ(table->capacity(), MAX_CAPACITY);\n}\n\nvoid test_export_batch_if(size_t max_hbm_for_vectors, int key_start = 0) {\n  constexpr uint64_t INIT_CAPACITY = 256UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t KEY_NUM = 128UL;\n  constexpr uint64_t TEST_TIMES = 1;\n  constexpr uint64_t BUCKET_MAX_SIZE = 128ul;\n\n  K* h_keys;\n  S* h_scores;\n  V* 
h_vectors;\n  size_t h_dump_counter = 0;\n\n  TableOptions options;\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;\n\n  std::unique_ptr<Table> table = std::make_unique<Table>();\n  table->init(options);\n\n  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));\n\n  K* d_keys;\n  S* d_scores = nullptr;\n  V* d_vectors;\n  bool* d_found;\n  size_t* d_dump_counter;\n  int found_num = 0;\n  bool* h_found;\n\n  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n  CUDA_CHECK(cudaMalloc(&d_dump_counter, sizeof(size_t)));\n\n  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  S threshold = test_util::host_nano<S>();\n  uint64_t total_size = 0;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    test_util::create_random_keys<K, S, V, DIM>(h_keys, h_scores, h_vectors,\n                                                KEY_NUM);\n\n    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyHostToDevice));\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    
ASSERT_EQ(total_size, 0);\n\n    find_or_insert_safe_ptr(table.get(), KEY_NUM, d_keys, nullptr, d_vectors,\n                            options.dim, stream);\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, KEY_NUM);\n\n    CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));\n    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),\n                          cudaMemcpyDeviceToHost));\n\n    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    {\n      V** d_vectors_ptr = nullptr;\n      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n      table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, nullptr, stream);\n      test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,\n                               stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      CUDA_CHECK(cudaFree(d_vectors_ptr));\n    }\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n    found_num = 0;\n    for (int i = 0; i < BUCKET_MAX_SIZE; i++) {\n      if (h_found[i]) {\n        found_num++;\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors[i * options.dim + j],\n                    static_cast<float>(h_keys[i] * 0.00001));\n        }\n      }\n    }\n    ASSERT_EQ(found_num, KEY_NUM);\n\n    K pattern = 100;\n\n    table->template export_batch_if<ExportIfPredFunctor>(\n        pattern, threshold, table->capacity(), 0, d_dump_counter, d_keys,\n        d_vectors, d_scores, stream);\n\n    
CUDA_CHECK(cudaStreamSynchronize(stream));\n    CUDA_CHECK(cudaMemcpy(&h_dump_counter, d_dump_counter, sizeof(size_t),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyDeviceToHost));\n\n    size_t expected_export_count = 0;\n    for (int i = 0; i < KEY_NUM; i++) {\n      if (h_scores[i] > threshold) expected_export_count++;\n    }\n    ASSERT_EQ(expected_export_count, h_dump_counter);\n\n    table->template export_batch_if<ExportIfPredFunctor>(\n        pattern, test_util::host_nano<S>(), table->capacity(), 0,\n        d_dump_counter, d_keys, d_vectors, d_scores, stream);\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    CUDA_CHECK(cudaMemcpy(&h_dump_counter, d_dump_counter, sizeof(size_t),\n                          cudaMemcpyDeviceToHost));\n\n    ASSERT_EQ(0, h_dump_counter);\n\n    CUDA_CHECK(cudaMemset(h_keys, 0, KEY_NUM * sizeof(K)));\n    CUDA_CHECK(cudaMemset(h_scores, 0, KEY_NUM * sizeof(S)));\n    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n\n    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n\n    for (int i = 0; i < h_dump_counter; i++) {\n      ASSERT_GT(h_scores[i], threshold);\n      for (int j = 0; j < options.dim; j++) {\n        ASSERT_EQ(h_vectors[i * options.dim + j],\n                  static_cast<float>(h_keys[i] * 0.00001));\n      }\n    }\n\n    table->clear(stream);\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n  }\n  CUDA_CHECK(cudaDeviceSynchronize());\n  
CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,\n                        cudaMemcpyDeviceToHost));\n\n  CUDA_CHECK(cudaFreeHost(h_keys));\n  CUDA_CHECK(cudaFreeHost(h_scores));\n  CUDA_CHECK(cudaFreeHost(h_vectors));\n  CUDA_CHECK(cudaFreeHost(h_found));\n\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_vectors));\n  CUDA_CHECK(cudaFree(d_found));\n  CUDA_CHECK(cudaFree(d_dump_counter));\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_evict_strategy_lru_basic(size_t max_hbm_for_vectors,\n                                   int key_start = 0) {\n  constexpr uint64_t BUCKET_NUM = 8UL;\n  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;\n  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;\n  constexpr uint64_t TEST_KEY_NUM = 4;\n  constexpr uint64_t TEMP_KEY_NUM =\n      (BASE_KEY_NUM > TEST_KEY_NUM) ? 
BASE_KEY_NUM : TEST_KEY_NUM;\n  constexpr uint64_t TEST_TIMES = 128;\n\n  TableOptions options;\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;\n\n  std::vector<K> h_keys_base(BASE_KEY_NUM);\n  std::vector<S> h_scores_base(BASE_KEY_NUM);\n  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_test(TEST_KEY_NUM);\n  std::vector<S> h_scores_test(TEST_KEY_NUM);\n  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_temp(TEMP_KEY_NUM);\n  std::vector<S> h_scores_temp(TEMP_KEY_NUM);\n  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);\n\n  K* d_keys_temp;\n  S* d_scores_temp = nullptr;\n  V* d_vectors_temp;\n\n  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));\n  CUDA_CHECK(\n      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));\n\n  test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),\n      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF);\n\n  test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),\n      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,\n      0xFFFFFFFFFFFFFFFD);\n\n  h_keys_test[2] = h_keys_base[72];\n  h_keys_test[3] = h_keys_base[73];\n\n  for (int i = 0; i < options.dim; i++) {\n    h_vectors_test[2 * options.dim + i] = h_vectors_base[72 * options.dim + i];\n    h_vectors_test[3 * options.dim + i] = h_vectors_base[73 * options.dim + i];\n  }\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  size_t total_size = 0;\n  size_t dump_counter = 0;\n  S 
start_ts;\n  S end_ts;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      {\n        V** d_vectors_ptr = nullptr;\n        bool* d_found;\n        K** d_key_ptrs = nullptr;\n        CUDA_CHECK(cudaMalloc(&d_found, BASE_KEY_NUM * sizeof(bool)));\n        CUDA_CHECK(cudaMalloc(&d_vectors_ptr, BASE_KEY_NUM * sizeof(V*)));\n        CUDA_CHECK(cudaMalloc(&d_key_ptrs, BASE_KEY_NUM * sizeof(K*)));\n        start_ts = test_util::host_nano<S>(stream);\n        table->find_or_insert(BASE_KEY_NUM, d_keys_temp, d_vectors_ptr, d_found,\n                              nullptr, stream, true, false, d_key_ptrs);\n        test_util::read_or_write_ptr(d_vectors_ptr, d_vectors_temp, d_found,\n                                     options.dim, BASE_KEY_NUM, stream);\n        table->unlock_keys(BASE_KEY_NUM, d_key_ptrs, d_keys_temp, d_found,\n                           stream);\n        CUDA_CHECK(cudaStreamSynchronize(stream));\n        end_ts = test_util::host_nano<S>(stream);\n        CUDA_CHECK(cudaFree(d_key_ptrs));\n        CUDA_CHECK(cudaFree(d_vectors_ptr));\n        CUDA_CHECK(cudaFree(d_found));\n      }\n\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, 
BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      std::vector<S> h_scores_temp_sorted(h_scores_temp);\n      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());\n\n      ASSERT_GE(h_scores_temp_sorted[0], start_ts);\n      ASSERT_LE(h_scores_temp_sorted[BASE_KEY_NUM - 1], end_ts);\n      for (int i = 0; i < dump_counter; i++) {\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),\n                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),\n                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),\n                            TEST_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      start_ts = test_util::host_nano<S>(stream);\n      table->assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp, nullptr, stream);\n\n      {\n        V** d_vectors_ptr = nullptr;\n        bool* d_found;\n        K** d_key_ptrs = nullptr;\n        CUDA_CHECK(cudaMalloc(&d_found, TEST_KEY_NUM * 
sizeof(bool)));\n        CUDA_CHECK(cudaMalloc(&d_vectors_ptr, TEST_KEY_NUM * sizeof(V*)));\n        CUDA_CHECK(cudaMalloc(&d_key_ptrs, TEST_KEY_NUM * sizeof(K*)));\n        table->find_or_insert(TEST_KEY_NUM, d_keys_temp, d_vectors_ptr, d_found,\n                              nullptr, stream, true, false, d_key_ptrs);\n        test_util::read_or_write_ptr(d_vectors_ptr, d_vectors_temp, d_found,\n                                     options.dim, TEST_KEY_NUM, stream);\n        table->unlock_keys(TEST_KEY_NUM, d_key_ptrs, d_keys_temp, d_found,\n                           stream);\n        CUDA_CHECK(cudaStreamSynchronize(stream));\n        end_ts = test_util::host_nano<S>(stream);\n        CUDA_CHECK(cudaFree(d_key_ptrs));\n        CUDA_CHECK(cudaFree(d_vectors_ptr));\n        CUDA_CHECK(cudaFree(d_found));\n      }\n\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            TEMP_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      std::vector<S> h_scores_temp_sorted(TEST_KEY_NUM);\n      int ctr = 0;\n      for (int i = 0; i < TEMP_KEY_NUM; i++) {\n        if (h_keys_test.end() !=\n            std::find(h_keys_test.begin(), h_keys_test.end(), h_keys_temp[i])) {\n          ASSERT_GT(h_scores_temp[i], 
BUCKET_MAX_SIZE);\n          h_scores_temp_sorted[ctr++] = h_scores_temp[i];\n        } else {\n          ASSERT_LE(h_scores_temp[i], start_ts);\n        }\n      }\n      std::sort(h_scores_temp_sorted.begin(),\n                h_scores_temp_sorted.begin() + ctr);\n\n      ASSERT_GE(h_scores_temp_sorted[0], start_ts);\n      ASSERT_LE(h_scores_temp_sorted[ctr - 1], end_ts);\n      for (int i = 0; i < dump_counter; i++) {\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFree(d_keys_temp));\n  CUDA_CHECK(cudaFree(d_scores_temp));\n  CUDA_CHECK(cudaFree(d_vectors_temp));\n\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_evict_strategy_lfu_basic(size_t max_hbm_for_vectors,\n                                   int key_start = 0) {\n  constexpr uint64_t BUCKET_NUM = 8UL;\n  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;\n  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;\n  constexpr uint64_t TEST_KEY_NUM = 4;\n  constexpr uint64_t TEMP_KEY_NUM =\n      (BASE_KEY_NUM > TEST_KEY_NUM) ? 
BASE_KEY_NUM : TEST_KEY_NUM;\n  constexpr uint64_t TEST_TIMES = 128;\n\n  TableOptions options;\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLfu>;\n\n  std::vector<K> h_keys_base(BASE_KEY_NUM);\n  std::vector<S> h_scores_base(BASE_KEY_NUM);\n  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_test(TEST_KEY_NUM);\n  std::vector<S> h_scores_test(TEST_KEY_NUM);\n  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_temp(TEMP_KEY_NUM);\n  std::vector<S> h_scores_temp(TEMP_KEY_NUM);\n  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);\n\n  K* d_keys_temp;\n  S* d_scores_temp = nullptr;\n  V* d_vectors_temp;\n\n  int freq_range = 1000;\n\n  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));\n  CUDA_CHECK(\n      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));\n\n  test_util::create_keys_in_one_buckets_lfu<K, S, V, DIM>(\n      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),\n      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF,\n      freq_range);\n\n  test_util::create_keys_in_one_buckets_lfu<K, S, V, DIM>(\n      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),\n      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,\n      0xFFFFFFFFFFFFFFFD, freq_range);\n\n  h_keys_test[2] = h_keys_base[72];\n  h_keys_test[3] = h_keys_base[73];\n\n  h_scores_test[2] = h_keys_base[72] % freq_range;\n  h_scores_test[3] = h_keys_base[73] % freq_range;\n\n  for (int i = 0; i < options.dim; i++) {\n    h_vectors_test[2 * options.dim + i] = h_vectors_base[72 * options.dim + i];\n    h_vectors_test[3 * options.dim + i] = 
h_vectors_base[73 * options.dim + i];\n  }\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  size_t total_size = 0;\n  size_t dump_counter = 0;\n  S global_epoch = 1;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n\n      find_or_insert_safe_ptr(table.get(), BASE_KEY_NUM, d_keys_temp,\n                              d_scores_temp, d_vectors_temp, options.dim,\n                              stream);\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      for (int i = 0; i < dump_counter; i++) {\n    
    ASSERT_EQ(h_scores_temp[i], h_keys_temp[i] % freq_range);\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n\n    {\n      global_epoch++;\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),\n                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),\n                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),\n                            TEST_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      table->assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp, d_scores_temp,\n                    stream);\n      find_or_insert_safe_ptr(table.get(), TEST_KEY_NUM, d_keys_temp,\n                              d_scores_temp, d_vectors_temp, options.dim,\n                              stream);\n\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            TEMP_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      for (int i = 0; i < dump_counter; i++) 
{\n        bool in_base =\n            h_keys_base.end() !=\n            std::find(h_keys_base.begin(), h_keys_base.end(), h_keys_temp[i]);\n        bool in_test =\n            h_keys_test.end() !=\n            std::find(h_keys_test.begin(), h_keys_test.end(), h_keys_temp[i]);\n        if (in_base && in_test) {\n          ASSERT_EQ(h_scores_temp[i], (h_keys_temp[i] % freq_range) *\n                                          3);  // update score when found.\n        } else {\n          ASSERT_EQ(h_scores_temp[i], (h_keys_temp[i] % freq_range));\n        }\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFree(d_keys_temp));\n  CUDA_CHECK(cudaFree(d_scores_temp));\n  CUDA_CHECK(cudaFree(d_vectors_temp));\n\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_evict_strategy_epochlru_basic(size_t max_hbm_for_vectors,\n                                        int key_start = 0) {\n  constexpr int RSHIFT_ON_NANO = 20;\n\n  constexpr uint64_t BUCKET_NUM = 8UL;\n  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;\n  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;\n  constexpr uint64_t TEST_KEY_NUM = 4;\n  constexpr uint64_t TEMP_KEY_NUM =\n      (BASE_KEY_NUM > TEST_KEY_NUM) ? 
BASE_KEY_NUM : TEST_KEY_NUM;\n  constexpr uint64_t TEST_TIMES = 128;\n\n  TableOptions options;\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kEpochLru>;\n\n  std::vector<K> h_keys_base(BASE_KEY_NUM);\n  std::vector<S> h_scores_base(BASE_KEY_NUM);\n  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_test(TEST_KEY_NUM);\n  std::vector<S> h_scores_test(TEST_KEY_NUM);\n  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_temp(TEMP_KEY_NUM);\n  std::vector<S> h_scores_temp(TEMP_KEY_NUM);\n  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);\n\n  K* d_keys_temp;\n  S* d_scores_temp = nullptr;\n  V* d_vectors_temp;\n\n  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));\n  CUDA_CHECK(\n      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));\n\n  test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),\n      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF);\n\n  test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),\n      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,\n      0xFFFFFFFFFFFFFFFD);\n\n  h_keys_test[2] = h_keys_base[72];\n  h_keys_test[3] = h_keys_base[73];\n\n  for (int i = 0; i < options.dim; i++) {\n    h_vectors_test[2 * options.dim + i] = h_vectors_base[72 * options.dim + i];\n    h_vectors_test[3 * options.dim + i] = h_vectors_base[73 * options.dim + i];\n  }\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  size_t total_size = 0;\n  size_t dump_counter = 0;\n  S 
start_ts;\n  S end_ts;\n\n  S global_epoch = 1;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      {\n        V** d_vectors_ptr = nullptr;\n        bool* d_found;\n        K** d_key_ptrs = nullptr;\n        CUDA_CHECK(cudaMalloc(&d_found, BASE_KEY_NUM * sizeof(bool)));\n        CUDA_CHECK(cudaMalloc(&d_vectors_ptr, BASE_KEY_NUM * sizeof(V*)));\n        CUDA_CHECK(cudaMalloc(&d_key_ptrs, BASE_KEY_NUM * sizeof(K*)));\n\n        start_ts =\n            (test_util::host_nano<S>(stream) >> RSHIFT_ON_NANO) & 0xFFFFFFFF;\n        table->set_global_epoch(global_epoch);\n        table->find_or_insert(BASE_KEY_NUM, d_keys_temp, d_vectors_ptr, d_found,\n                              nullptr, stream, true, false, d_key_ptrs);\n        test_util::read_or_write_ptr(d_vectors_ptr, d_vectors_temp, d_found,\n                                     options.dim, BASE_KEY_NUM, stream);\n        table->unlock_keys(BASE_KEY_NUM, d_key_ptrs, d_keys_temp, d_found,\n                           stream);\n        CUDA_CHECK(cudaStreamSynchronize(stream));\n        end_ts =\n            (test_util::host_nano<S>(stream) >> RSHIFT_ON_NANO) & 0xFFFFFFFF;\n        CUDA_CHECK(cudaFree(d_key_ptrs));\n        CUDA_CHECK(cudaFree(d_vectors_ptr));\n        CUDA_CHECK(cudaFree(d_found));\n      }\n\n      
CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      std::vector<S> h_scores_temp_sorted(h_scores_temp);\n      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());\n\n      ASSERT_GE(h_scores_temp_sorted[0], (global_epoch << 32 | start_ts));\n      ASSERT_LE(h_scores_temp_sorted[BASE_KEY_NUM - 1],\n                (global_epoch << 32 | end_ts));\n      for (int i = 0; i < dump_counter; i++) {\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n\n    {\n      global_epoch++;\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),\n                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),\n                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),\n                            TEST_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      start_ts =\n          
(test_util::host_nano<S>(stream) >> RSHIFT_ON_NANO) & 0xFFFFFFFF;\n      table->set_global_epoch(global_epoch);\n      table->assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp, nullptr, stream);\n      {\n        V** d_vectors_ptr = nullptr;\n        bool* d_found;\n        K** d_key_ptrs = nullptr;\n        CUDA_CHECK(cudaMalloc(&d_found, TEST_KEY_NUM * sizeof(bool)));\n        CUDA_CHECK(cudaMalloc(&d_vectors_ptr, TEST_KEY_NUM * sizeof(V*)));\n        CUDA_CHECK(cudaMalloc(&d_key_ptrs, TEST_KEY_NUM * sizeof(K*)));\n\n        table->find_or_insert(TEST_KEY_NUM, d_keys_temp, d_vectors_ptr, d_found,\n                              nullptr, stream, true, false, d_key_ptrs);\n        test_util::read_or_write_ptr(d_vectors_ptr, d_vectors_temp, d_found,\n                                     options.dim, TEST_KEY_NUM, stream);\n        table->unlock_keys(TEST_KEY_NUM, d_key_ptrs, d_keys_temp, d_found,\n                           stream);\n        CUDA_CHECK(cudaStreamSynchronize(stream));\n\n        end_ts =\n            (test_util::host_nano<S>(stream) >> RSHIFT_ON_NANO) & 0xFFFFFFFF;\n        CUDA_CHECK(cudaFree(d_key_ptrs));\n        CUDA_CHECK(cudaFree(d_vectors_ptr));\n        CUDA_CHECK(cudaFree(d_found));\n      }\n\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n       
                     TEMP_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      std::vector<S> h_scores_temp_sorted(TEST_KEY_NUM);\n      int ctr = 0;\n      for (int i = 0; i < TEMP_KEY_NUM; i++) {\n        if (h_keys_test.end() !=\n            std::find(h_keys_test.begin(), h_keys_test.end(), h_keys_temp[i])) {\n          ASSERT_GE(h_scores_temp[i], (global_epoch << 32 | start_ts));\n          h_scores_temp_sorted[ctr++] = h_scores_temp[i];\n        } else {\n          ASSERT_LE(h_scores_temp[i], (global_epoch << 32 | start_ts));\n        }\n      }\n      std::sort(h_scores_temp_sorted.begin(),\n                h_scores_temp_sorted.begin() + ctr);\n\n      ASSERT_GE(h_scores_temp_sorted[0], (global_epoch << 32 | start_ts));\n      ASSERT_LE(h_scores_temp_sorted[ctr - 1], (global_epoch << 32 | end_ts));\n      for (int i = 0; i < dump_counter; i++) {\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFree(d_keys_temp));\n  CUDA_CHECK(cudaFree(d_scores_temp));\n  CUDA_CHECK(cudaFree(d_vectors_temp));\n\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors,\n                                        int key_start = 0) {\n  constexpr uint64_t BUCKET_NUM = 8UL;\n  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;\n  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;\n  constexpr uint64_t TEST_KEY_NUM = 4;\n  constexpr uint64_t TEMP_KEY_NUM =\n      (BASE_KEY_NUM > TEST_KEY_NUM) ? 
BASE_KEY_NUM : TEST_KEY_NUM;\n  constexpr uint64_t TEST_TIMES = 128;\n\n  TableOptions options;\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kEpochLfu>;\n\n  std::vector<K> h_keys_base(BASE_KEY_NUM);\n  std::vector<S> h_scores_base(BASE_KEY_NUM);\n  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_test(TEST_KEY_NUM);\n  std::vector<S> h_scores_test(TEST_KEY_NUM);\n  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_temp(TEMP_KEY_NUM);\n  std::vector<S> h_scores_temp(TEMP_KEY_NUM);\n  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);\n\n  K* d_keys_temp;\n  S* d_scores_temp = nullptr;\n  V* d_vectors_temp;\n\n  int freq_range = 1000;\n\n  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));\n  CUDA_CHECK(\n      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));\n\n  test_util::create_keys_in_one_buckets_lfu<K, S, V, DIM>(\n      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),\n      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF,\n      freq_range);\n\n  test_util::create_keys_in_one_buckets_lfu<K, S, V, DIM>(\n      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),\n      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,\n      0xFFFFFFFFFFFFFFFD, freq_range);\n\n  // Simulate overflow of low 32bits.\n  h_scores_base[71] = static_cast<S>(std::numeric_limits<uint32_t>::max() -\n                                     static_cast<uint32_t>(1));\n\n  h_keys_test[1] = h_keys_base[71];\n  h_keys_test[2] = h_keys_base[72];\n  h_keys_test[3] = h_keys_base[73];\n\n  h_scores_test[1] = h_scores_base[71];\n  
h_scores_test[2] = h_keys_base[72] % freq_range;\n  h_scores_test[3] = h_keys_base[73] % freq_range;\n\n  for (int i = 0; i < options.dim; i++) {\n    h_vectors_test[1 * options.dim + i] = h_vectors_base[71 * options.dim + i];\n    h_vectors_test[2 * options.dim + i] = h_vectors_base[72 * options.dim + i];\n    h_vectors_test[3 * options.dim + i] = h_vectors_base[73 * options.dim + i];\n  }\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  size_t total_size = 0;\n  size_t dump_counter = 0;\n  S global_epoch = 1;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n\n      {\n        V** d_vectors_ptr = nullptr;\n        bool* d_found;\n        K** d_key_ptrs = nullptr;\n        CUDA_CHECK(cudaMalloc(&d_found, BASE_KEY_NUM * sizeof(bool)));\n        CUDA_CHECK(cudaMalloc(&d_vectors_ptr, BASE_KEY_NUM * sizeof(V*)));\n        CUDA_CHECK(cudaMalloc(&d_key_ptrs, BASE_KEY_NUM * sizeof(K*)));\n\n        table->set_global_epoch(global_epoch);\n        table->find_or_insert(BASE_KEY_NUM, d_keys_temp, d_vectors_ptr, d_found,\n                              d_scores_temp, stream, true, false, d_key_ptrs);\n        test_util::read_or_write_ptr(d_vectors_ptr, d_vectors_temp, d_found,\n                                     options.dim, BASE_KEY_NUM, stream);\n        
table->unlock_keys(BASE_KEY_NUM, d_key_ptrs, d_keys_temp, d_found,\n                           stream);\n        CUDA_CHECK(cudaStreamSynchronize(stream));\n        CUDA_CHECK(cudaFree(d_key_ptrs));\n        CUDA_CHECK(cudaFree(d_vectors_ptr));\n        CUDA_CHECK(cudaFree(d_found));\n      }\n\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      for (int i = 0; i < dump_counter; i++) {\n        if (h_keys_temp[i] == h_keys_base[71]) {\n          S expected_score = test_util::make_expected_score_for_epochlfu<S>(\n              global_epoch, h_scores_base[71]);\n          ASSERT_EQ(h_scores_temp[i], expected_score);\n        } else {\n          S expected_score = test_util::make_expected_score_for_epochlfu<S>(\n              global_epoch, (h_keys_temp[i] % freq_range));\n          ASSERT_EQ(h_scores_temp[i], expected_score);\n        }\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n\n    {\n      global_epoch++;\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),\n                        
    TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),\n                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),\n                            TEST_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      table->set_global_epoch(global_epoch);\n      table->assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp, d_scores_temp,\n                    stream);\n      {\n        V** d_vectors_ptr = nullptr;\n        bool* d_found;\n        K** d_key_ptrs = nullptr;\n        CUDA_CHECK(cudaMalloc(&d_found, TEST_KEY_NUM * sizeof(bool)));\n        CUDA_CHECK(cudaMalloc(&d_vectors_ptr, TEST_KEY_NUM * sizeof(V*)));\n        CUDA_CHECK(cudaMalloc(&d_key_ptrs, TEST_KEY_NUM * sizeof(K*)));\n        table->find_or_insert(TEST_KEY_NUM, d_keys_temp, d_vectors_ptr, d_found,\n                              d_scores_temp, stream, true, false, d_key_ptrs);\n        test_util::read_or_write_ptr(d_vectors_ptr, d_vectors_temp, d_found,\n                                     options.dim, TEST_KEY_NUM, stream);\n        table->unlock_keys(TEST_KEY_NUM, d_key_ptrs, d_keys_temp, d_found,\n                           stream);\n        CUDA_CHECK(cudaStreamSynchronize(stream));\n        CUDA_CHECK(cudaFree(d_key_ptrs));\n        CUDA_CHECK(cudaFree(d_vectors_ptr));\n        CUDA_CHECK(cudaFree(d_found));\n      }\n\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            
TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            TEMP_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      ASSERT_TRUE(h_keys_temp.end() != std::find(h_keys_temp.begin(),\n                                                 h_keys_temp.end(),\n                                                 h_keys_base[71]));\n\n      for (int i = 0; i < dump_counter; i++) {\n        bool in_base =\n            h_keys_base.end() !=\n            std::find(h_keys_base.begin(), h_keys_base.end(), h_keys_temp[i]);\n        bool in_test =\n            h_keys_test.end() !=\n            std::find(h_keys_test.begin(), h_keys_test.end(), h_keys_temp[i]);\n\n        if (in_base && in_test) {\n          if (h_keys_temp[i] == h_keys_base[71]) {\n            S expected_score = test_util::make_expected_score_for_epochlfu<S>(\n                global_epoch, h_scores_base[71] * 2);\n            ASSERT_EQ(h_scores_temp[i], expected_score);\n          } else {\n            S expected_score = test_util::make_expected_score_for_epochlfu<S>(\n                global_epoch,\n                (h_keys_temp[i] % freq_range) * 3);  // update score when found.\n            ASSERT_EQ(h_scores_temp[i], expected_score);\n          }\n        } else {\n          if (h_keys_temp[i] == h_keys_base[71]) {\n            S expected_score = test_util::make_expected_score_for_epochlfu<S>(\n                global_epoch - static_cast<S>(in_base), h_scores_base[71]);\n            ASSERT_EQ(h_scores_temp[i], expected_score);\n          } else {\n            S expected_score = test_util::make_expected_score_for_epochlfu<S>(\n                global_epoch - static_cast<S>(in_base),\n                (h_keys_temp[i] % freq_range));\n\n            
ASSERT_EQ(h_scores_temp[i], expected_score);\n          }\n        }\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFree(d_keys_temp));\n  CUDA_CHECK(cudaFree(d_scores_temp));\n  CUDA_CHECK(cudaFree(d_vectors_temp));\n\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_evict_strategy_customized_basic(size_t max_hbm_for_vectors,\n                                          int key_start = 0) {\n  constexpr uint64_t BUCKET_NUM = 8UL;\n  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;\n  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;\n  constexpr uint64_t TEST_KEY_NUM = 128;\n  constexpr uint64_t TEMP_KEY_NUM =\n      (BASE_KEY_NUM > TEST_KEY_NUM) ? 
BASE_KEY_NUM : TEST_KEY_NUM;\n  constexpr uint64_t TEST_TIMES = 128;\n\n  TableOptions options;\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  std::vector<K> h_keys_base(BASE_KEY_NUM);\n  std::vector<S> h_scores_base(BASE_KEY_NUM);\n  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_test(TEST_KEY_NUM);\n  std::vector<S> h_scores_test(TEST_KEY_NUM);\n  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_temp(TEMP_KEY_NUM);\n  std::vector<S> h_scores_temp(TEMP_KEY_NUM);\n  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);\n\n  K* d_keys_temp;\n  S* d_scores_temp = nullptr;\n  V* d_vectors_temp;\n\n  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));\n  CUDA_CHECK(\n      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));\n\n  test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),\n      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF);\n\n  const S base_score_start = 1000;\n  for (int i = 0; i < BASE_KEY_NUM; i++) {\n    h_scores_base[i] = base_score_start + i;\n  }\n\n  test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),\n      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,\n      0xFFFFFFFFFFFFFFFD);\n  const S test_score_start = base_score_start + BASE_KEY_NUM;\n  for (int i = 0; i < TEST_KEY_NUM; i++) {\n    h_scores_test[i] = test_score_start + i;\n  }\n  for (int i = 64; i < TEST_KEY_NUM; i++) {\n    h_keys_test[i] = h_keys_base[i];\n    for (int j = 0; j < options.dim; 
j++) {\n      h_vectors_test[i * options.dim + j] = h_vectors_base[i * options.dim + j];\n    }\n  }\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  size_t total_size = 0;\n  size_t dump_counter = 0;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      find_or_insert_safe_ptr(table.get(), BASE_KEY_NUM, d_keys_temp,\n                              d_scores_temp, d_vectors_temp, options.dim,\n                              stream);\n\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                 
           cudaMemcpyDefault));\n\n      std::vector<S> h_scores_temp_sorted(h_scores_temp);\n      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());\n\n      auto expected_range = test_util::range<S, TEMP_KEY_NUM>(base_score_start);\n      ASSERT_TRUE(std::equal(h_scores_temp_sorted.begin(),\n                             h_scores_temp_sorted.end(),\n                             expected_range.begin()));\n      for (int i = 0; i < dump_counter; i++) {\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),\n                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),\n                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),\n                            TEST_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      table->assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp, d_scores_temp,\n                    stream);\n      find_or_insert_safe_ptr(table.get(), TEST_KEY_NUM, d_keys_temp,\n                              d_scores_temp, d_vectors_temp, options.dim,\n                              stream);\n\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            TEMP_KEY_NUM * sizeof(K), 
cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            TEMP_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      std::vector<S> h_scores_temp_sorted(h_scores_temp);\n      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());\n\n      auto expected_range_test =\n          test_util::range<S, TEST_KEY_NUM>(test_score_start);\n      ASSERT_TRUE(std::equal(h_scores_temp_sorted.begin(),\n                             h_scores_temp_sorted.end(),\n                             expected_range_test.begin()));\n      for (int i = 0; i < dump_counter; i++) {\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFree(d_keys_temp));\n  CUDA_CHECK(cudaFree(d_scores_temp));\n  CUDA_CHECK(cudaFree(d_vectors_temp));\n\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_evict_strategy_customized_advanced(size_t max_hbm_for_vectors,\n                                             int key_start = 0) {\n  constexpr uint64_t BUCKET_NUM = 8UL;\n  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;\n  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;\n  constexpr uint64_t TEST_KEY_NUM = 8;\n  constexpr uint64_t TEMP_KEY_NUM =\n      (BASE_KEY_NUM > TEST_KEY_NUM) ? 
BASE_KEY_NUM : TEST_KEY_NUM;\n  constexpr uint64_t TEST_TIMES = 256;\n\n  TableOptions options;\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  std::vector<K> h_keys_base(BASE_KEY_NUM);\n  std::vector<S> h_scores_base(BASE_KEY_NUM);\n  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_test(TEST_KEY_NUM);\n  std::vector<S> h_scores_test(TEST_KEY_NUM);\n  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_temp(TEMP_KEY_NUM);\n  std::vector<S> h_scores_temp(TEMP_KEY_NUM);\n  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);\n\n  K* d_keys_temp;\n  S* d_scores_temp = nullptr;\n  V* d_vectors_temp;\n\n  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));\n  CUDA_CHECK(\n      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));\n\n  test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),\n      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF);\n\n  const S base_score_start = 1000;\n  for (int i = 0; i < BASE_KEY_NUM; i++) {\n    h_scores_base[i] = base_score_start + i;\n  }\n\n  test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),\n      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,\n      0xFFFFFFFFFFFFFFFD);\n\n  h_keys_test[4] = h_keys_base[72];\n  h_keys_test[5] = h_keys_base[73];\n  h_keys_test[6] = h_keys_base[74];\n  h_keys_test[7] = h_keys_base[75];\n\n  // replace four new keys to lower scores, would not be inserted.\n  h_scores_test[0] = 20;\n  h_scores_test[1] = 78;\n  
h_scores_test[2] = 97;\n  h_scores_test[3] = 98;\n\n  // give four existing keys new scores; this only refreshes their scores.\n  h_scores_test[4] = 99;\n  h_scores_test[5] = 1010;\n  h_scores_test[6] = 1020;\n  h_scores_test[7] = 1035;\n\n  for (int i = 4; i < TEST_KEY_NUM; i++) {\n    for (int j = 0; j < options.dim; j++) {\n      h_vectors_test[i * options.dim + j] =\n          static_cast<V>(h_keys_test[i] * 0.00001);\n    }\n  }\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  size_t total_size = 0;\n  size_t dump_counter = 0;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      find_or_insert_safe_ptr(table.get(), BASE_KEY_NUM, d_keys_temp,\n                              d_scores_temp, d_vectors_temp, options.dim,\n                              stream);\n\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            
BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      std::vector<S> h_scores_temp_sorted(h_scores_temp);\n      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());\n\n      auto expected_range = test_util::range<S, TEMP_KEY_NUM>(base_score_start);\n      ASSERT_TRUE(std::equal(h_scores_temp_sorted.begin(),\n                             h_scores_temp_sorted.end(),\n                             expected_range.begin()));\n      for (int i = 0; i < dump_counter; i++) {\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),\n                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),\n                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),\n                            TEST_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      table->assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp, d_scores_temp,\n                    stream);\n      find_or_insert_safe_ptr(table.get(), TEST_KEY_NUM, d_keys_temp,\n                              d_scores_temp, d_vectors_temp, options.dim,\n                              stream);\n\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, 
BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            TEMP_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      for (int i = 0; i < TEST_KEY_NUM; i++) {\n        if (i < 4) {\n          ASSERT_EQ(h_keys_temp.end(),\n                    std::find(h_keys_temp.begin(), h_keys_temp.end(),\n                              h_keys_test[i]));\n        } else {\n          ASSERT_NE(h_keys_temp.end(),\n                    std::find(h_keys_temp.begin(), h_keys_temp.end(),\n                              h_keys_test[i]));\n        }\n      }\n      for (int i = 0; i < TEMP_KEY_NUM; i++) {\n        if (h_keys_temp[i] == h_keys_test[4])\n          ASSERT_EQ(h_scores_temp[i], h_scores_test[4]);\n        if (h_keys_temp[i] == h_keys_test[5])\n          ASSERT_EQ(h_scores_temp[i], h_scores_test[5]);\n        if (h_keys_temp[i] == h_keys_test[6])\n          ASSERT_EQ(h_scores_temp[i], h_scores_test[6]);\n        if (h_keys_temp[i] == h_keys_test[7])\n          ASSERT_EQ(h_scores_temp[i], h_scores_test[7]);\n\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFree(d_keys_temp));\n  CUDA_CHECK(cudaFree(d_scores_temp));\n  CUDA_CHECK(cudaFree(d_vectors_temp));\n\n  
CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\n// Stress test for EvictStrategy::kCustomized: for each of `rounds` rounds it\n// feeds STEPS batches of BATCH_SIZE keys produced by create_continuous_keys\n// through assign() and find_or_insert_safe_ptr() into a table whose capacity\n// is BATCH_SIZE * STEPS, exports the table, and asserts that\n//   * every exported score equals its key and every vector matches its key,\n//   * the largest exported key reaches the last key of the round, and\n//   * at least `expected_correct_rate` of the capacity holds scores from the\n//     current round (score >= first key of the round).\n// max_hbm_for_vectors is passed through nv::merlin::GB(); key_start sets\n// options.reserved_key_start_bit.\nvoid test_evict_strategy_customized_correct_rate(size_t max_hbm_for_vectors,\n                                                 int key_start = 0) {\n  constexpr uint64_t BATCH_SIZE = 1024 * 1024ul;\n  constexpr uint64_t STEPS = 128;\n  constexpr uint64_t MAX_BUCKET_SIZE = 128;\n  constexpr uint64_t INIT_CAPACITY = BATCH_SIZE * STEPS;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t TEST_TIMES = 1;\n  float expected_correct_rate = 0.964;\n  const int rounds = 12;\n\n  TableOptions options;\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_bucket_size = MAX_BUCKET_SIZE;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  // Keep the HostBuffer objects alive for the whole test. Calling .ptr() on a\n  // temporary leaves the raw pointer dangling as soon as the statement ends if\n  // HostBuffer releases its storage in its destructor.\n  // NOTE(review): confirm against test_util::HostBuffer.\n  test_util::HostBuffer<K> h_keys_base_buf(BATCH_SIZE);\n  test_util::HostBuffer<S> h_scores_base_buf(BATCH_SIZE);\n  test_util::HostBuffer<V> h_vectors_base_buf(BATCH_SIZE * options.dim);\n  K* h_keys_base = h_keys_base_buf.ptr();\n  S* h_scores_base = h_scores_base_buf.ptr();\n  V* h_vectors_base = h_vectors_base_buf.ptr();\n\n  test_util::HostBuffer<K> h_keys_temp_buf(MAX_CAPACITY);\n  test_util::HostBuffer<S> h_scores_temp_buf(MAX_CAPACITY);\n  test_util::HostBuffer<V> h_vectors_temp_buf(MAX_CAPACITY * options.dim);\n  K* h_keys_temp = h_keys_temp_buf.ptr();\n  S* h_scores_temp = h_scores_temp_buf.ptr();\n  V* h_vectors_temp = h_vectors_temp_buf.ptr();\n\n  K* d_keys_temp;\n  S* d_scores_temp = nullptr;\n  V* d_vectors_temp;\n\n  CUDA_CHECK(cudaMalloc(&d_keys_temp, MAX_CAPACITY * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores_temp, MAX_CAPACITY * sizeof(S)));\n  CUDA_CHECK(\n      cudaMalloc(&d_vectors_temp, MAX_CAPACITY * sizeof(V) * options.dim));\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  size_t total_size = 0;\n  size_t global_start_key = 100000;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n    size_t start_key = global_start_key;\n\n    total_size = 
table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    for (int r = 0; r < rounds; r++) {\n      size_t expected_min_key = global_start_key + INIT_CAPACITY * r;\n      size_t expected_max_key = global_start_key + INIT_CAPACITY * (r + 1) - 1;\n      size_t expected_table_size =\n          (r == 0) ? size_t(expected_correct_rate * INIT_CAPACITY)\n                   : INIT_CAPACITY;\n\n      for (int s = 0; s < STEPS; s++) {\n        test_util::create_continuous_keys<K, S, V, DIM>(\n            h_keys_base, h_scores_base, h_vectors_base, BATCH_SIZE, start_key);\n        start_key += BATCH_SIZE;\n\n        CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base, BATCH_SIZE * sizeof(K),\n                              cudaMemcpyHostToDevice));\n        CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base,\n                              BATCH_SIZE * sizeof(S), cudaMemcpyHostToDevice));\n        CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base,\n                              BATCH_SIZE * sizeof(V) * options.dim,\n                              cudaMemcpyHostToDevice));\n        table->assign(BATCH_SIZE, d_keys_temp, d_vectors_temp, d_scores_temp,\n                      stream);\n        find_or_insert_safe_ptr(table.get(), BATCH_SIZE, d_keys_temp,\n                                d_scores_temp, d_vectors_temp, options.dim,\n                                stream);\n        CUDA_CHECK(cudaStreamSynchronize(stream));\n      }\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_GE(total_size, expected_table_size);\n      ASSERT_EQ(MAX_CAPACITY, table->capacity());\n\n      size_t dump_counter = table->export_batch(\n          MAX_CAPACITY, 0, d_keys_temp, d_vectors_temp, d_scores_temp, stream);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp, d_keys_temp, MAX_CAPACITY * sizeof(K),\n                            cudaMemcpyDeviceToHost));\n      
CUDA_CHECK(cudaMemcpy(h_scores_temp, d_scores_temp,\n                            MAX_CAPACITY * sizeof(S), cudaMemcpyDeviceToHost));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp, d_vectors_temp,\n                            MAX_CAPACITY * sizeof(V) * options.dim,\n                            cudaMemcpyDeviceToHost));\n\n      size_t bigger_score_counter = 0;\n      K max_key = 0;\n      size_t values_error_counter = 0;\n      for (int i = 0; i < dump_counter; i++) {\n        ASSERT_EQ(h_keys_temp[i], h_scores_temp[i]);\n        max_key = std::max(max_key, h_keys_temp[i]);\n        if (h_scores_temp[i] >= expected_min_key) bigger_score_counter++;\n        for (int j = 0; j < options.dim; j++) {\n          if (h_vectors_temp[i * options.dim + j] !=\n              static_cast<float>(h_keys_temp[i] * 0.00001)) {\n            values_error_counter++;\n          }\n        }\n      }\n\n      ASSERT_EQ(values_error_counter, 0);\n      float correct_rate = (bigger_score_counter * 1.0) / MAX_CAPACITY;\n      std::cout << std::setprecision(3) << \"[Round \" << r << \"]\"\n                << \"correct_rate=\" << correct_rate << std::endl;\n      ASSERT_GE(max_key, expected_max_key);\n      ASSERT_GE(correct_rate, expected_correct_rate);\n    }\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFree(d_keys_temp));\n  CUDA_CHECK(cudaFree(d_scores_temp));\n  CUDA_CHECK(cudaFree(d_vectors_temp));\n\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_find_or_insert_multi_threads(size_t max_hbm_for_vectors,\n                                       const float BATCH_0_RATIO,\n                                       const float BATCH_1_RATIO,\n                                       bool capacity_silent = true) {\n  constexpr uint64_t THREAD_N = 64UL;\n  const uint64_t BATCH_0_SIZE = static_cast<uint64_t>(THREAD_N * BATCH_0_RATIO);\n  const uint64_t BATCH_1_SIZE = static_cast<uint64_t>(THREAD_N * BATCH_1_RATIO);\n  const uint64_t BATCH_2_SIZE = 
THREAD_N - BATCH_0_SIZE - BATCH_1_SIZE;\n\n  constexpr uint64_t INIT_CAPACITY = 32 * 1024 * 1024UL;\n  constexpr uint64_t MAX_CAPACITY = 128 * 1024 * 1024UL;\n  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;\n  constexpr uint64_t KEY_NUM = 1 * 1024 * 1024UL;\n\n  std::vector<std::thread> threads;\n\n  TableOptions options;\n\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_load_factor = 0.50f;\n  options.max_bucket_size = BUCKET_MAX_SIZE;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;\n\n  std::shared_ptr<Table> table = std::make_shared<Table>();\n  table->init(options);\n  // assert every key is different\n  auto worker1 = [&table, KEY_NUM, options, capacity_silent](int batch,\n                                                             int task_n) {\n    K* h_keys;\n    V* h_vectors;\n    bool* h_found;\n\n    size_t current_capacity = table->capacity();\n\n    CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));\n    CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));\n\n    K* d_keys;\n    V* d_vectors;\n    bool* d_found;\n\n    CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));\n    CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n\n    cudaStream_t stream;\n    CUDA_CHECK(cudaStreamCreate(&stream));\n\n    test_util::create_random_keys<K, S, V, DIM>(h_keys, nullptr, h_vectors,\n                                                KEY_NUM);\n    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyHostToDevice));\n    
CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n\n    table->assign(KEY_NUM, d_keys, d_vectors, nullptr, stream);\n    {\n      V** d_vectors_ptr = nullptr;\n      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n      table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, nullptr, stream);\n      test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,\n                               stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      CUDA_CHECK(cudaFree(d_vectors_ptr));\n    }\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    {\n      int found_num = 0;\n      CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));\n      CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                            cudaMemcpyDeviceToHost));\n      for (int i = 0; i < KEY_NUM; i++) {\n        if (h_found[i]) {\n          found_num++;\n        }\n      }\n      ASSERT_EQ(found_num, 0);\n    }\n    find_or_insert_safe_ptr(table.get(), KEY_NUM, d_keys, nullptr, d_vectors,\n                            options.dim, stream);\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    {\n      V** d_vectors_ptr = nullptr;\n      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n      table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, nullptr, stream);\n      test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,\n                               stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      CUDA_CHECK(cudaFree(d_vectors_ptr));\n    }\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    int found_num = 0;\n\n    CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));\n    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),\n                          cudaMemcpyDeviceToHost));\n    
CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n    thread_local bool print_unequal{false};\n    thread_local uint64_t err_times{0};\n    for (int i = 0; i < KEY_NUM; i++) {\n      if (h_found[i]) {\n        found_num++;\n        for (int j = 0; j < options.dim; j++) {\n          if (batch == 2) {\n            if (h_vectors[i * options.dim + j] !=\n                static_cast<float>(h_keys[i] * 0.00001)) {\n              if (!print_unequal) {\n                std::cout << \" [Thread \" << task_n << \"]\\t\";\n                UNEQUAL_EXPR(h_vectors[i * options.dim + j],\n                             static_cast<float>(h_keys[i] * 0.00001));\n                print_unequal = true;\n              }\n              err_times += 1;\n            }\n          } else {\n            ASSERT_EQ(h_vectors[i * options.dim + j],\n                      static_cast<float>(h_keys[i] * 0.00001));\n          }\n        }\n      }\n    }\n\n    bool print_thread_id{false};\n    if (batch == 0 || batch == 1) {\n      ASSERT_EQ(found_num, KEY_NUM);\n      ASSERT_EQ(err_times, 0);\n    } else {\n      if (found_num != KEY_NUM or err_times != 0) {\n        std::cout << \" [Thread \" << task_n << \"]\\t\"\n                  << \"Number of keys(insert/found/error) : \" << \"(\" << KEY_NUM\n                  << \"/\" << found_num << \"/\" << err_times << \") \\t\";\n        print_thread_id = true;\n      }\n    }\n    if (current_capacity != table->capacity() && !capacity_silent) {\n      if (!print_thread_id) std::cout << \" [Thread \" << task_n << \"]\\t\";\n\n      std::cout << \"The capacity changed from \" << current_capacity << \" to \"\n                << table->capacity() << std::endl;\n    } else if (print_thread_id) {\n      std::cout << std::endl;\n   
 }\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    CUDA_CHECK(cudaStreamDestroy(stream));\n\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n\n    CUDA_CHECK(cudaFreeHost(h_keys));\n    CUDA_CHECK(cudaFreeHost(h_found));\n    CUDA_CHECK(cudaFreeHost(h_vectors));\n\n    CUDA_CHECK(cudaFree(d_keys));\n    CUDA_CHECK(cudaFree(d_vectors));\n    CUDA_CHECK(cudaFree(d_found));\n    CUDA_CHECK(cudaDeviceSynchronize());\n\n    CudaCheckError();\n  };\n  auto worker2 = [&table, KEY_NUM, options, capacity_silent](int batch,\n                                                             int task_n) {\n    K* h_keys;\n    V* h_vectors;\n    bool* h_found;\n\n    size_t current_capacity = table->capacity();\n\n    CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));\n    CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));\n\n    K* d_keys;\n    V* d_vectors;\n    V* d_new_vectors;\n    bool* d_found;\n\n    CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));\n    CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMalloc(&d_new_vectors, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n\n    cudaStream_t stream;\n    CUDA_CHECK(cudaStreamCreate(&stream));\n\n    test_util::create_random_keys<K, S, V, DIM>(h_keys, nullptr, h_vectors,\n                                                KEY_NUM);\n    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n    CUDA_CHECK(cudaMemset(d_new_vectors, 
2, KEY_NUM * sizeof(V) * options.dim));\n\n    find_or_insert_safe_ptr(table.get(), KEY_NUM, d_keys, nullptr, d_vectors,\n                            options.dim, stream);\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    table->assign(KEY_NUM, d_keys, d_new_vectors, nullptr, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    {\n      V** d_vectors_ptr = nullptr;\n      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n      table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, nullptr, stream);\n      test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,\n                               stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      CUDA_CHECK(cudaFree(d_vectors_ptr));\n    }\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    int found_num = 0;\n\n    CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));\n    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n    thread_local bool print_unequal{false};\n    thread_local uint64_t err_times{0};\n    uint32_t i_value = 0x2020202;\n    for (int i = 0; i < KEY_NUM; i++) {\n      if (h_found[i]) {\n        found_num++;\n        for (int j = 0; j < options.dim; j++) {\n          if (batch == 2) {\n            if (h_vectors[i * options.dim + j] !=\n                *(reinterpret_cast<float*>(&i_value))) {\n              if (!print_unequal) {\n                std::cout << \" [Thread \" << task_n << \"]\\t\";\n                UNEQUAL_EXPR(h_vectors[i * 
options.dim + j],\n                             *(reinterpret_cast<float*>(&i_value)));\n                print_unequal = true;\n              }\n              err_times += 1;\n            }\n          } else {\n            ASSERT_EQ(h_vectors[i * options.dim + j],\n                      *(reinterpret_cast<float*>(&i_value)));\n          }\n        }\n      }\n    }\n\n    bool print_thread_id{false};\n    if (batch == 0 || batch == 1) {\n      ASSERT_EQ(found_num, KEY_NUM);\n      ASSERT_EQ(err_times, 0);\n    } else {\n      if (found_num != KEY_NUM or err_times != 0) {\n        std::cout << \" [Thread \" << task_n << \"]\\t\"\n                  << \"Number of keys(insert/found/error) : \" << \"(\" << KEY_NUM\n                  << \"/\" << found_num << \"/\" << err_times << \") \\t\";\n        print_thread_id = true;\n      }\n    }\n    if (current_capacity != table->capacity() && !capacity_silent) {\n      if (!print_thread_id) std::cout << \" [Thread \" << task_n << \"]\\t\";\n\n      std::cout << \"The capacity changed from \" << current_capacity << \" to \"\n                << table->capacity() << std::endl;\n    } else if (print_thread_id) {\n      std::cout << std::endl;\n    }\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    CUDA_CHECK(cudaStreamDestroy(stream));\n\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n\n    CUDA_CHECK(cudaFreeHost(h_keys));\n    CUDA_CHECK(cudaFreeHost(h_found));\n    CUDA_CHECK(cudaFreeHost(h_vectors));\n\n    CUDA_CHECK(cudaFree(d_keys));\n    CUDA_CHECK(cudaFree(d_vectors));\n    CUDA_CHECK(cudaFree(d_new_vectors));\n    CUDA_CHECK(cudaFree(d_found));\n    CUDA_CHECK(cudaDeviceSynchronize());\n\n    CudaCheckError();\n  };\n\n  /* the table is relatively idle; assume no eviction occurs */\n  int batch = 0;\n  std::cout << \"[Batch 0] \" << BATCH_0_SIZE << \" threads\\n\";\n  for (int i = 0; i < 
BATCH_0_SIZE; i += 2) {\n    threads.emplace_back(std::thread(worker1, batch, i));\n    threads.emplace_back(std::thread(worker2, batch, i + 1));\n  }\n  for (auto& th : threads) {\n    th.join();\n  }\n  threads.clear();\n\n  /* test the correctness of the APIs serially */\n  batch = 1;\n  std::cout << \"[Batch 1] \" << BATCH_1_SIZE << \" threads\\n\";\n  for (int i = BATCH_0_SIZE; i < BATCH_0_SIZE + BATCH_1_SIZE; i += 2) {\n    auto th = std::thread(worker1, batch, i);\n    th.join();\n    th = std::thread(worker2, batch, i + 1);\n    th.join();\n  }\n\n  /* eviction may occur */\n  batch = 2;\n  std::cout << \"[Batch 2] \" << BATCH_2_SIZE << \" threads\\n\";\n  for (int i = BATCH_0_SIZE + BATCH_1_SIZE; i < THREAD_N; i += 2) {\n    threads.emplace_back(std::thread(worker1, batch, i));\n    threads.emplace_back(std::thread(worker2, batch, i + 1));\n  }\n  for (auto& th : threads) {\n    th.join();\n  }\n  ASSERT_EQ(table->capacity(), MAX_CAPACITY);\n}\n\ntemplate <typename K, typename V, typename S, typename Table, size_t dim = 64>\nvoid CheckFindOrInsertValues(Table* table, K* keys, V* values, S* scores,\n                             size_t len, cudaStream_t stream) {\n  std::map<K, test_util::ValueArray<V, dim>> map_before_insert;\n  std::map<K, test_util::ValueArray<V, dim>> map_after_insert;\n  K* h_tmp_keys = nullptr;\n  V* h_tmp_values = nullptr;\n  S* h_tmp_scores = nullptr;\n\n  K* d_tmp_keys = nullptr;\n  V* d_tmp_values = nullptr;\n  S* d_tmp_scores = nullptr;\n\n  size_t table_size_before = table->size(stream);\n  size_t cap = table_size_before + len;\n\n  CUDA_CHECK(cudaMallocAsync(&d_tmp_keys, cap * sizeof(K), stream));\n  CUDA_CHECK(cudaMemsetAsync(d_tmp_keys, 0, cap * sizeof(K), stream));\n  CUDA_CHECK(cudaMallocAsync(&d_tmp_values, cap * dim * sizeof(V), stream));\n  CUDA_CHECK(cudaMemsetAsync(d_tmp_values, 0, cap * dim * sizeof(V), stream));\n  CUDA_CHECK(cudaMallocAsync(&d_tmp_scores, cap * sizeof(S), stream));\n  CUDA_CHECK(cudaMemsetAsync(d_tmp_scores, 
0, cap * sizeof(S), stream));\n  h_tmp_keys = (K*)malloc(cap * sizeof(K));\n  h_tmp_values = (V*)malloc(cap * dim * sizeof(V));\n  h_tmp_scores = (S*)malloc(cap * sizeof(S));\n\n  size_t table_size_verify0 = table->export_batch(\n      table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);\n  ASSERT_EQ(table_size_before, table_size_verify0);\n\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,\n                             table_size_before * sizeof(K),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,\n                             table_size_before * dim * sizeof(V),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,\n                             table_size_before * sizeof(S),\n                             cudaMemcpyDeviceToHost, stream));\n\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n\n  for (size_t i = 0; i < table_size_verify0; i++) {\n    test_util::ValueArray<V, dim>* vec =\n        reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +\n                                                         i * dim);\n    map_before_insert[h_tmp_keys[i]] = *vec;\n  }\n\n  auto start = std::chrono::steady_clock::now();\n\n  find_or_insert_safe_ptr(table, len, d_tmp_keys, nullptr, values, dim, stream);\n\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  auto end = std::chrono::steady_clock::now();\n  auto diff = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);\n\n  float dur = diff.count();\n\n  size_t table_size_after = table->size(stream);\n  size_t table_size_verify1 = table->export_batch(\n      table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);\n\n  ASSERT_EQ(table_size_verify1, table_size_after);\n\n  size_t new_cap = table_size_after;\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,\n                             table_size_after * sizeof(K),\n              
               cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,\n                             table_size_after * dim * sizeof(V),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,\n                             table_size_after * sizeof(S),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  int64_t new_cap_K = (int64_t)new_cap;\n  for (int64_t i = new_cap_K - 1; i >= 0; i--) {\n    test_util::ValueArray<V, dim>* vec =\n        reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +\n                                                         i * dim);\n    map_after_insert[h_tmp_keys[i]] = *vec;\n  }\n\n  size_t value_diff_cnt = 0;\n  for (auto& it : map_after_insert) {\n    test_util::ValueArray<V, dim>& vec = map_after_insert.at(it.first);\n    for (size_t j = 0; j < dim; j++) {\n      if (vec[j] != static_cast<float>(it.first * 0.00001)) {\n        ++value_diff_cnt;\n        break;\n      }\n    }\n  }\n  ASSERT_EQ(value_diff_cnt, 0);\n  std::cout << \"Check find_or_insert behavior got \"\n            << \"value_diff_cnt: \" << value_diff_cnt\n            << \", while table_size_before: \" << table_size_before\n            << \", while table_size_after: \" << table_size_after\n            << \", while len: \" << len << std::endl;\n\n  CUDA_CHECK(cudaFreeAsync(d_tmp_keys, stream));\n  CUDA_CHECK(cudaFreeAsync(d_tmp_values, stream));\n  CUDA_CHECK(cudaFreeAsync(d_tmp_scores, stream));\n  free(h_tmp_keys);\n  free(h_tmp_values);\n  free(h_tmp_scores);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n}\n\nvoid test_find_or_insert_values_check(size_t max_hbm_for_vectors) {\n  const size_t U = 524288;\n  const size_t init_capacity = 1024;\n  const size_t B = 524288 + 13;\n  constexpr size_t dim = 64;\n\n  TableOptions opt;\n\n  opt.max_capacity = U;\n  opt.init_capacity = init_capacity;\n  
opt.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;\n  opt.dim = 64;\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  std::unique_ptr<Table> table = std::make_unique<Table>();\n  table->init(opt);\n\n  test_util::KVMSBuffer<K, V, S> data_buffer;\n  data_buffer.Reserve(B, dim, stream);\n\n  size_t offset = 0;\n  S score = 0;\n  for (int i = 0; i < 20; i++) {\n    test_util::create_random_keys<K, S, V, dim>(\n        data_buffer.keys_ptr(false), data_buffer.scores_ptr(false),\n        data_buffer.values_ptr(false), (int)B, B * 16);\n    data_buffer.SyncData(true, stream);\n\n    CheckFindOrInsertValues<K, V, S, Table, dim>(\n        table.get(), data_buffer.keys_ptr(), data_buffer.values_ptr(),\n        data_buffer.scores_ptr(), B, stream);\n\n    offset += B;\n    score += 1;\n  }\n}\n\nTEST(FindOrInsertPtrTest, test_export_batch_if) {\n  test_export_batch_if(16);\n  test_export_batch_if(0, 33);\n}\nTEST(FindOrInsertPtrTest, test_basic) {\n  test_basic(16, 3);\n  test_basic(0);\n}\nTEST(FindOrInsertPtrTest, test_basic_when_full) {\n  test_basic_when_full(16, 4);\n  test_basic_when_full(0);\n}\nTEST(FindOrInsertPtrTest, test_erase_if_pred) {\n  test_erase_if_pred(16);\n  test_erase_if_pred(0, 18);\n}\nTEST(FindOrInsertPtrTest, test_rehash) {\n  test_rehash(16);\n  test_rehash(0, 44);\n}\nTEST(FindOrInsertPtrTest, test_rehash_on_big_batch) {\n  test_rehash_on_big_batch(16, 23);\n  test_rehash_on_big_batch(0);\n}\nTEST(FindOrInsertPtrTest, test_evict_strategy_lru_basic) {\n  test_evict_strategy_lru_basic(16);\n  test_evict_strategy_lru_basic(0, 18);\n}\nTEST(FindOrInsertPtrTest, test_evict_strategy_lfu_basic) {\n  test_evict_strategy_lfu_basic(16, 29);\n  test_evict_strategy_lfu_basic(0);\n}\nTEST(FindOrInsertPtrTest, test_evict_strategy_epochlru_basic) {\n  test_evict_strategy_epochlru_basic(16, 45);\n  
test_evict_strategy_epochlru_basic(0);\n}\nTEST(FindOrInsertPtrTest, test_evict_strategy_epochlfu_basic) {\n  test_evict_strategy_epochlfu_basic(16);\n  test_evict_strategy_epochlfu_basic(0, 59);\n}\nTEST(FindOrInsertPtrTest, test_evict_strategy_customized_basic) {\n  test_evict_strategy_customized_basic(16, 38);\n  test_evict_strategy_customized_basic(0);\n}\nTEST(FindOrInsertPtrTest, test_evict_strategy_customized_advanced) {\n  test_evict_strategy_customized_advanced(16);\n  test_evict_strategy_customized_advanced(0, 25);\n}\nTEST(FindOrInsertPtrTest, test_evict_strategy_customized_correct_rate) {\n  // TODO(rhdong): remove the skip logic once the blossom CI issue is resolved.\n  const bool skip_hmem_check = (nullptr != std::getenv(\"IS_BLOSSOM_CI\"));\n  test_evict_strategy_customized_correct_rate(16, 16);\n  if (!skip_hmem_check) {\n    test_evict_strategy_customized_correct_rate(0);\n  } else {\n    std::cout << \"The HMEM check is skipped in blossom CI!\" << std::endl;\n  }\n}\n\n// Turn on to verify that it can't deal with multi-threaded cases\n// TEST(FindOrInsertPtrTest, test_find_or_insert_multi_threads) {\n//   test_find_or_insert_multi_threads(16, 0.25f, 0.125f);\n//   test_find_or_insert_multi_threads(16, 0.375f, 0.125f);\n//   test_find_or_insert_multi_threads(0, 0.25f, 0.125f);\n//   test_find_or_insert_multi_threads(0, 0.375f, 0.125f);\n// }\n// TEST(FindOrInsertPtrTest, test_dynamic_rehash_on_multi_threads) {\n//   test_dynamic_rehash_on_multi_threads(16);\n//   test_dynamic_rehash_on_multi_threads(0, 19);\n// }\n\n// Turn on to verify that it can't deal with small capacity case\n// TEST(FindOrInsertPtrTest, test_find_or_insert_values_check) {\n//   test_find_or_insert_values_check(16);\n//   // TODO(rhdong): Add back when diff error issue fixed in hybrid mode.\n//   test_find_or_insert_values_check(0);\n// }"
  },
  {
    "path": "tests/find_or_insert_ptr_test.cc.cu",
    "content": "/*\n * Copyright (c) 2023, NVIDIA CORPORATION.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n/*\n * test APIs: find_or_insert and assign,\n * move insert operation from `insert_or_assign` to `find`.\n */\n\n#include <gtest/gtest.h>\n#include <math.h>\n#include <stdio.h>\n#include <stdlib.h>\n#include <algorithm>\n#include <array>\n#include <iostream>\n#include <thread>\n#include <vector>\n#include \"merlin_hashtable.cuh\"\n#include \"test_util.cuh\"\n\nconstexpr size_t DIM = 16;\nusing K = uint64_t;\nusing V = float;\nusing S = uint64_t;\nusing EvictStrategy = nv::merlin::EvictStrategy;\nusing TableOptions = nv::merlin::HashTableOptions;\n\ntemplate <class K, class S>\nstruct EraseIfPredFunctor {\n  __forceinline__ __device__ bool operator()(const K& key, S& score,\n                                             const K& pattern,\n                                             const S& threshold) {\n    return ((key & 0x7f > pattern) && (score > threshold));\n  }\n};\n\ntemplate <class K, class S>\nstruct ExportIfPredFunctor {\n  __forceinline__ __device__ bool operator()(const K& key, S& score,\n                                             const K& pattern,\n                                             const S& threshold) {\n    return score > threshold;\n  }\n};\n\nvoid test_basic(size_t max_hbm_for_vectors, int key_start = 0) {\n  constexpr uint64_t INIT_CAPACITY = 64 * 1024 * 1024UL;\n  constexpr uint64_t MAX_CAPACITY = 
INIT_CAPACITY;\n  constexpr uint64_t KEY_NUM = 1 * 1024 * 1024UL;\n  constexpr uint64_t TEST_TIMES = 1;\n\n  K* h_keys;\n  S* h_scores;\n  V* h_vectors;\n  bool* h_found;\n\n  TableOptions options;\n\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));\n\n  CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n\n  test_util::create_random_keys<K, S, V, DIM>(h_keys, h_scores, h_vectors,\n                                              KEY_NUM);\n\n  K* d_keys;\n  S* d_scores = nullptr;\n  V* d_vectors;\n  V* d_new_vectors;\n  bool* d_found;\n  size_t dump_counter = 0;\n\n  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_new_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n\n  CUDA_CHECK(\n      cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),\n                        cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors, KEY_NUM * sizeof(V) * options.dim,\n                        cudaMemcpyHostToDevice));\n\n  CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  uint64_t total_size = 0;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    
std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    {\n      V** d_vectors_ptr = nullptr;\n      bool* d_found;\n      CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n\n      table->find_or_insert(KEY_NUM, d_keys, d_vectors_ptr, d_found, d_scores,\n                            stream);\n      test_util::read_or_write_ptr(d_vectors_ptr, d_vectors, d_found,\n                                   options.dim, KEY_NUM, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      CUDA_CHECK(cudaFree(d_vectors_ptr));\n      CUDA_CHECK(cudaFree(d_found));\n    }\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, KEY_NUM);\n\n    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n    CUDA_CHECK(cudaMemset(d_scores, 0, KEY_NUM * sizeof(S)));\n    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n\n    {\n      V** d_vectors_ptr = nullptr;\n      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n      table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, d_scores, stream);\n      test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,\n                               stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      CUDA_CHECK(cudaFree(d_vectors_ptr));\n    }\n\n    int found_num = 0;\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                     
     cudaMemcpyDeviceToHost));\n\n    for (int i = 0; i < KEY_NUM; i++) {\n      if (h_found[i]) found_num++;\n      ASSERT_EQ(h_scores[i], h_keys[i]);\n      for (int j = 0; j < options.dim; j++) {\n        ASSERT_EQ(h_vectors[i * options.dim + j],\n                  static_cast<float>(h_keys[i] * 0.00001));\n      }\n    }\n    ASSERT_EQ(found_num, KEY_NUM);\n\n    CUDA_CHECK(cudaMemset(d_new_vectors, 2, KEY_NUM * sizeof(V) * options.dim));\n    table->assign(KEY_NUM, reinterpret_cast<const K*>(d_keys),\n                  reinterpret_cast<const float*>(d_new_vectors),\n                  reinterpret_cast<const S*>(d_scores), stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, KEY_NUM);\n\n    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n    CUDA_CHECK(cudaMemset(d_new_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    {\n      V** d_vectors_ptr = nullptr;\n      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n      test_util::array2ptr(d_vectors_ptr, d_new_vectors, options.dim, KEY_NUM,\n                           stream);\n      table->find(KEY_NUM, d_keys, d_new_vectors, d_found, d_scores, stream);\n      test_util::read_from_ptr(d_vectors_ptr, d_new_vectors, options.dim,\n                               KEY_NUM, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      CUDA_CHECK(cudaFree(d_vectors_ptr));\n    }\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_new_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n    found_num = 0;\n    uint32_t i_value = 0x2020202;\n    for (int i = 0; i < KEY_NUM; i++) {\n      if (h_found[i]) found_num++;\n      for (int j = 0; j < 
options.dim; j++) {\n        ASSERT_EQ(h_vectors[i * options.dim + j],\n                  *(reinterpret_cast<float*>(&i_value)));\n      }\n    }\n    ASSERT_EQ(found_num, KEY_NUM);\n\n    table->accum_or_assign(KEY_NUM, d_keys, d_vectors, d_found, d_scores,\n                           stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, KEY_NUM);\n\n    table->erase(KEY_NUM >> 1, d_keys, stream);\n    size_t total_size_after_erase = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size_after_erase, total_size >> 1);\n\n    table->clear(stream);\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    {\n      V** d_vectors_ptr = nullptr;\n      bool* d_found;\n      CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n      table->find_or_insert(KEY_NUM, d_keys, d_vectors_ptr, d_found, d_scores,\n                            stream);\n      test_util::read_or_write_ptr(d_vectors_ptr, d_vectors, d_found,\n                                   options.dim, KEY_NUM, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      CUDA_CHECK(cudaFree(d_vectors_ptr));\n      CUDA_CHECK(cudaFree(d_found));\n    }\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, KEY_NUM);\n\n    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n    CUDA_CHECK(cudaMemset(d_scores, 0, KEY_NUM * sizeof(S)));\n    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n\n    {\n      V** d_vectors_ptr = nullptr;\n      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n      table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, d_scores, 
stream);\n      test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,\n                               stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      CUDA_CHECK(cudaFree(d_vectors_ptr));\n    }\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    found_num = 0;\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n\n    for (int i = 0; i < KEY_NUM; i++) {\n      if (h_found[i]) found_num++;\n      ASSERT_EQ(h_scores[i], h_keys[i]);\n      for (int j = 0; j < options.dim; j++) {\n        ASSERT_EQ(h_vectors[i * options.dim + j],\n                  static_cast<float>(h_keys[i] * 0.00001));\n      }\n    }\n    ASSERT_EQ(found_num, KEY_NUM);\n\n    CUDA_CHECK(cudaMemset(d_keys, 0, KEY_NUM * sizeof(K)));\n    CUDA_CHECK(cudaMemset(d_scores, 0, KEY_NUM * sizeof(S)));\n    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    dump_counter = table->export_batch(table->capacity(), 0, d_keys, d_vectors,\n                                       d_scores, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    ASSERT_EQ(dump_counter, KEY_NUM);\n    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyDeviceToHost));\n    for (int i = 0; i < KEY_NUM; i++) {\n      ASSERT_EQ(h_scores[i], h_keys[i]);\n      for (int j = 0; j < options.dim; 
j++) {\n        ASSERT_EQ(h_vectors[i * options.dim + j],\n                  static_cast<float>(h_keys[i] * 0.00001));\n      }\n    }\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFreeHost(h_keys));\n  CUDA_CHECK(cudaFreeHost(h_scores));\n  CUDA_CHECK(cudaFreeHost(h_vectors));\n  CUDA_CHECK(cudaFreeHost(h_found));\n\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_vectors));\n  CUDA_CHECK(cudaFree(d_new_vectors));\n  CUDA_CHECK(cudaFree(d_found));\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_basic_when_full(size_t max_hbm_for_vectors, int key_start = 0) {\n  constexpr uint64_t INIT_CAPACITY = 1 * 1024 * 1024UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t KEY_NUM = 1 * 1024 * 1024UL;\n  constexpr uint64_t TEST_TIMES = 1;\n\n  K* h_keys;\n  S* h_scores;\n  V* h_vectors;\n  bool* h_found;\n\n  TableOptions options;\n\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));\n\n  CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n\n  test_util::create_random_keys<K, S, V, DIM>(h_keys, h_scores, nullptr,\n                                              KEY_NUM);\n\n  K* d_keys;\n  S* d_scores = nullptr;\n  V* d_vectors;\n  V* d_def_val;\n  V** d_vectors_ptr;\n  bool* d_found;\n\n  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));\n  
CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_def_val, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n\n  CUDA_CHECK(\n      cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),\n                        cudaMemcpyHostToDevice));\n\n  CUDA_CHECK(cudaMemset(d_vectors, 1, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMemset(d_def_val, 2, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMemset(d_vectors_ptr, 0, KEY_NUM * sizeof(V*)));\n  CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  uint64_t total_size = 0;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    {\n      V** d_vectors_ptr = nullptr;\n      bool* d_found;\n      CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n      table->find_or_insert(KEY_NUM, d_keys, d_vectors_ptr, d_found, d_scores,\n                            stream);\n      test_util::read_or_write_ptr(d_vectors_ptr, d_vectors, d_found,\n                                   options.dim, KEY_NUM, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      CUDA_CHECK(cudaFree(d_vectors_ptr));\n      CUDA_CHECK(cudaFree(d_found));\n    }\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    uint64_t total_size_after_insert = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    table->erase(KEY_NUM, d_keys, stream);\n    size_t total_size_after_erase = table->size(stream);\n    
CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size_after_erase, 0);\n\n    {\n      V** d_vectors_ptr = nullptr;\n      bool* d_found;\n      CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n      table->find_or_insert(KEY_NUM, d_keys, d_vectors_ptr, d_found, d_scores,\n                            stream);\n      test_util::read_or_write_ptr(d_vectors_ptr, d_vectors, d_found,\n                                   options.dim, KEY_NUM, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      CUDA_CHECK(cudaFree(d_vectors_ptr));\n      CUDA_CHECK(cudaFree(d_found));\n    }\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    uint64_t total_size_after_reinsert = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size_after_insert, total_size_after_reinsert);\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,\n                        cudaMemcpyDeviceToHost));\n\n  CUDA_CHECK(cudaFreeHost(h_keys));\n  CUDA_CHECK(cudaFreeHost(h_scores));\n  CUDA_CHECK(cudaFreeHost(h_found));\n\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_vectors));\n  CUDA_CHECK(cudaFree(d_def_val));\n  CUDA_CHECK(cudaFree(d_vectors_ptr));\n  CUDA_CHECK(cudaFree(d_found));\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_erase_if_pred(size_t max_hbm_for_vectors, int key_start = 0) {\n  constexpr uint64_t INIT_CAPACITY = 256UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t KEY_NUM = 128UL;\n  constexpr uint64_t TEST_TIMES = 1;\n  constexpr uint64_t BUCKET_MAX_SIZE = 128;\n\n  K* h_keys;\n  S* h_scores;\n  V* h_vectors;\n  bool* h_found;\n\n  TableOptions options;\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = 
MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  std::unique_ptr<Table> table = std::make_unique<Table>();\n  table->init(options);\n\n  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));\n\n  K* d_keys;\n  S* d_scores = nullptr;\n  V* d_vectors;\n  bool* d_found;\n\n  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  uint64_t total_size = 0;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n        h_keys, h_scores, h_vectors, KEY_NUM, INIT_CAPACITY);\n    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    {\n      V** d_vectors_ptr = nullptr;\n      bool* d_found;\n      CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n      table->find_or_insert(KEY_NUM, d_keys, d_vectors_ptr, d_found, 
d_scores,\n                            stream);\n      test_util::read_or_write_ptr(d_vectors_ptr, d_vectors, d_found,\n                                   options.dim, KEY_NUM, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      CUDA_CHECK(cudaFree(d_vectors_ptr));\n      CUDA_CHECK(cudaFree(d_found));\n    }\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n    K pattern = 100;\n    S threshold = 0;\n    size_t erase_num = table->template erase_if<EraseIfPredFunctor>(\n        pattern, threshold, stream);\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ((erase_num + total_size), BUCKET_MAX_SIZE);\n\n    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n\n    {\n      V** d_vectors_ptr = nullptr;\n      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n      table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, d_scores, stream);\n      test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,\n                               stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      CUDA_CHECK(cudaFree(d_vectors_ptr));\n    }\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    int found_num = 0;\n\n    CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));\n    CUDA_CHECK(cudaMemset(h_scores, 0, KEY_NUM * sizeof(S)));\n    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n    
for (int i = 0; i < KEY_NUM; i++) {\n      if (h_found[i]) {\n        found_num++;\n        ASSERT_EQ(h_scores[i], h_keys[i]);\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors[i * options.dim + j],\n                    static_cast<float>(h_keys[i] * 0.00001));\n        }\n      }\n    }\n    ASSERT_EQ(found_num, (BUCKET_MAX_SIZE - erase_num));\n\n    table->clear(stream);\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,\n                        cudaMemcpyDeviceToHost));\n\n  CUDA_CHECK(cudaFreeHost(h_keys));\n  CUDA_CHECK(cudaFreeHost(h_scores));\n  CUDA_CHECK(cudaFreeHost(h_vectors));\n  CUDA_CHECK(cudaFreeHost(h_found));\n\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_vectors));\n  CUDA_CHECK(cudaFree(d_found));\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_rehash(size_t max_hbm_for_vectors, int key_start = 0) {\n  constexpr uint64_t BUCKET_MAX_SIZE = 128ul;\n  constexpr uint64_t INIT_CAPACITY = BUCKET_MAX_SIZE;\n  constexpr uint64_t MAX_CAPACITY = 4 * INIT_CAPACITY;\n  constexpr uint64_t KEY_NUM = BUCKET_MAX_SIZE * 2;\n  constexpr uint64_t TEST_TIMES = 100;\n  K* h_keys;\n  S* h_scores;\n  V* h_vectors;\n  bool* h_found;\n\n  TableOptions options;\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_bucket_size = BUCKET_MAX_SIZE;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM 
* sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));\n\n  K* d_keys;\n  S* d_scores = nullptr;\n  V* d_vectors;\n  bool* d_found;\n  size_t dump_counter = 0;\n\n  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  uint64_t total_size = 0;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n    test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n        h_keys, h_scores, h_vectors, KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE);\n    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    {\n      V** d_vectors_ptr = nullptr;\n      bool* d_found;\n      CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n      table->find_or_insert(KEY_NUM, d_keys, d_vectors_ptr, d_found, d_scores,\n                            stream);\n      test_util::read_or_write_ptr(d_vectors_ptr, d_vectors, d_found,\n                                   options.dim, KEY_NUM, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      CUDA_CHECK(cudaFree(d_vectors_ptr));\n      CUDA_CHECK(cudaFree(d_found));\n   
 }\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaDeviceSynchronize());\n    ASSERT_EQ(total_size, KEY_NUM);\n\n    dump_counter = table->export_batch(table->capacity(), 0, d_keys, d_vectors,\n                                       d_scores, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(dump_counter, KEY_NUM);\n\n    table->reserve(MAX_CAPACITY, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(table->capacity(), MAX_CAPACITY);\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, KEY_NUM);\n\n    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    {\n      V** d_vectors_ptr = nullptr;\n      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, BUCKET_MAX_SIZE * sizeof(V*)));\n      table->find(BUCKET_MAX_SIZE, d_keys, d_vectors_ptr, d_found, d_scores,\n                  stream);\n      test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim,\n                               BUCKET_MAX_SIZE, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      CUDA_CHECK(cudaFree(d_vectors_ptr));\n    }\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    int found_num = 0;\n\n    CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));\n    CUDA_CHECK(cudaMemset(h_scores, 0, KEY_NUM * sizeof(S)));\n    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          
cudaMemcpyDeviceToHost));\n    for (int i = 0; i < BUCKET_MAX_SIZE; i++) {\n      if (h_found[i]) {\n        found_num++;\n        ASSERT_EQ(h_scores[i], h_keys[i]);\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors[i * options.dim + j],\n                    static_cast<float>(h_keys[i] * 0.00001));\n        }\n      }\n    }\n    ASSERT_EQ(found_num, BUCKET_MAX_SIZE);\n\n    table->clear(stream);\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,\n                        cudaMemcpyDeviceToHost));\n\n  CUDA_CHECK(cudaFreeHost(h_keys));\n  CUDA_CHECK(cudaFreeHost(h_scores));\n  CUDA_CHECK(cudaFreeHost(h_vectors));\n  CUDA_CHECK(cudaFreeHost(h_found));\n\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_vectors));\n  CUDA_CHECK(cudaFree(d_found));\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_rehash_on_big_batch(size_t max_hbm_for_vectors, int key_start = 0) {\n  constexpr uint64_t INIT_CAPACITY = 1024;\n  constexpr uint64_t MAX_CAPACITY = 16 * 1024;\n  constexpr uint64_t INIT_KEY_NUM = 1024;\n  constexpr uint64_t KEY_NUM = 2048;\n  K* h_keys;\n  S* h_scores;\n  V* h_vectors;\n  bool* h_found;\n\n  TableOptions options;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_bucket_size = 128;\n  options.max_load_factor = 0.6;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));\n  
CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));\n\n  K* d_keys;\n  S* d_scores = nullptr;\n  V* d_vectors;\n  bool* d_found;\n  size_t dump_counter = 0;\n\n  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  uint64_t total_size = 0;\n  uint64_t expected_size = 0;\n  std::unique_ptr<Table> table = std::make_unique<Table>();\n  table->init(options);\n\n  test_util::create_random_keys<K, S, V, DIM>(h_keys, h_scores, h_vectors,\n                                              KEY_NUM);\n\n  CUDA_CHECK(\n      cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),\n                        cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors, KEY_NUM * sizeof(V) * options.dim,\n                        cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n\n  total_size = table->size(stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  ASSERT_EQ(total_size, 0);\n\n  {\n    V** d_vectors_ptr = nullptr;\n    bool* d_found;\n    CUDA_CHECK(cudaMalloc(&d_found, INIT_KEY_NUM * sizeof(bool)));\n    CUDA_CHECK(cudaMalloc(&d_vectors_ptr, INIT_KEY_NUM * sizeof(V*)));\n    table->find_or_insert(INIT_KEY_NUM, d_keys, d_vectors_ptr, d_found,\n                          d_scores, stream);\n    test_util::read_or_write_ptr(d_vectors_ptr, d_vectors, d_found, options.dim,\n                                 INIT_KEY_NUM, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    CUDA_CHECK(cudaFree(d_vectors_ptr));\n    CUDA_CHECK(cudaFree(d_found));\n  }\n\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  expected_size = INIT_KEY_NUM;\n\n  total_size = 
table->size(stream);\n  CUDA_CHECK(cudaDeviceSynchronize());\n  ASSERT_EQ(total_size, expected_size);\n  ASSERT_EQ(table->capacity(), (INIT_CAPACITY * 2));\n\n  {\n    V** d_vectors_ptr = nullptr;\n    bool* d_found;\n    CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n    CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n    table->find_or_insert(KEY_NUM, d_keys, d_vectors_ptr, d_found, d_scores,\n                          stream);\n    test_util::read_or_write_ptr(d_vectors_ptr, d_vectors, d_found, options.dim,\n                                 KEY_NUM, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    CUDA_CHECK(cudaFree(d_vectors_ptr));\n    CUDA_CHECK(cudaFree(d_found));\n  }\n\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  expected_size = KEY_NUM;\n\n  total_size = table->size(stream);\n  CUDA_CHECK(cudaDeviceSynchronize());\n  ASSERT_EQ(total_size, expected_size);\n  ASSERT_EQ(table->capacity(), KEY_NUM * 4);\n\n  dump_counter = table->export_batch(table->capacity(), 0, d_keys, d_vectors,\n                                     d_scores, stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  ASSERT_EQ(dump_counter, expected_size);\n\n  CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMemset(d_scores, 0, KEY_NUM * sizeof(S)));\n  {\n    V** d_vectors_ptr = nullptr;\n    CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n    table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, d_scores, stream);\n    test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,\n                             stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    CUDA_CHECK(cudaFree(d_vectors_ptr));\n  }\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  int found_num = 0;\n\n  CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));\n  CUDA_CHECK(cudaMemset(h_scores, 0, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * 
options.dim));\n  CUDA_CHECK(\n      cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K), cudaMemcpyDeviceToHost));\n  CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                        cudaMemcpyDeviceToHost));\n  CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),\n                        cudaMemcpyDeviceToHost));\n  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,\n                        cudaMemcpyDeviceToHost));\n  for (int i = 0; i < KEY_NUM; i++) {\n    if (h_found[i]) {\n      found_num++;\n      ASSERT_EQ(h_scores[i], h_keys[i]);\n      for (int j = 0; j < options.dim; j++) {\n        ASSERT_EQ(h_vectors[i * options.dim + j],\n                  static_cast<float>(h_keys[i] * 0.00001));\n      }\n    }\n  }\n  ASSERT_EQ(found_num, KEY_NUM);\n\n  table->clear(stream);\n  total_size = table->size(stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  ASSERT_EQ(total_size, 0);\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,\n                        cudaMemcpyDeviceToHost));\n\n  CUDA_CHECK(cudaFreeHost(h_keys));\n  CUDA_CHECK(cudaFreeHost(h_scores));\n  CUDA_CHECK(cudaFreeHost(h_vectors));\n  CUDA_CHECK(cudaFreeHost(h_found));\n\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_vectors));\n  CUDA_CHECK(cudaFree(d_found));\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_dynamic_rehash_on_multi_threads(size_t max_hbm_for_vectors,\n                                          int key_start = 0) {\n  constexpr uint64_t BUCKET_MAX_SIZE = 128ul;\n  constexpr uint64_t INIT_CAPACITY = 4 * 1024;\n  constexpr uint64_t MAX_CAPACITY = 16 * 1024 * INIT_CAPACITY;\n  constexpr uint64_t KEY_NUM = 256;\n  constexpr uint64_t THREAD_N = 8;\n\n  std::vector<std::thread> threads;\n\n  TableOptions options;\n  options.reserved_key_start_bit = key_start;\n  
options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_load_factor = 0.50f;\n  options.max_bucket_size = BUCKET_MAX_SIZE;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;\n\n  std::shared_ptr<Table> table = std::make_shared<Table>();\n  table->init(options);\n\n  auto worker_function = [&table, KEY_NUM, options](int task_n) {\n    K* h_keys;\n    V* h_vectors;\n    bool* h_found;\n\n    size_t current_capacity = table->capacity();\n\n    CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));\n    CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));\n\n    K* d_keys;\n    V* d_vectors;\n    bool* d_found;\n\n    CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));\n    CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n\n    cudaStream_t stream;\n    CUDA_CHECK(cudaStreamCreate(&stream));\n\n    while (table->capacity() < MAX_CAPACITY) {\n      test_util::create_random_keys<K, S, V, DIM>(h_keys, nullptr, h_vectors,\n                                                  KEY_NUM);\n      CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),\n                            cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,\n                            KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n\n      {\n        V** d_vectors_ptr = nullptr;\n        bool* d_found;\n        CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n        CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n        table->find_or_insert(KEY_NUM, d_keys, d_vectors_ptr, d_found, nullptr,\n                              
stream);\n        test_util::read_or_write_ptr(d_vectors_ptr, d_vectors, d_found,\n                                     options.dim, KEY_NUM, stream);\n        CUDA_CHECK(cudaStreamSynchronize(stream));\n        CUDA_CHECK(cudaFree(d_vectors_ptr));\n        CUDA_CHECK(cudaFree(d_found));\n      }\n\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n      {\n        V** d_vectors_ptr = nullptr;\n        CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n        table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, nullptr, stream);\n        test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,\n                                 stream);\n        CUDA_CHECK(cudaStreamSynchronize(stream));\n        CUDA_CHECK(cudaFree(d_vectors_ptr));\n      }\n\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      int found_num = 0;\n\n      CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));\n      CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n      CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),\n                            cudaMemcpyDeviceToHost));\n      CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                            cudaMemcpyDeviceToHost));\n\n      CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                            KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDeviceToHost));\n      for (int i = 0; i < KEY_NUM; i++) {\n        if (h_found[i]) {\n          found_num++;\n          for (int j = 0; j < options.dim; j++) {\n            ASSERT_EQ(h_vectors[i * options.dim + j],\n                      static_cast<float>(h_keys[i] * 0.00001));\n          }\n        }\n      }\n      ASSERT_EQ(found_num, KEY_NUM);\n      if (task_n == 0 && current_capacity != table->capacity()) {\n        std::cout << \"[test_dynamic_rehash_on_multi_threads] The capacity \"\n         
            \"changed from \"\n                  << current_capacity << \" to \" << table->capacity()\n                  << std::endl;\n        current_capacity = table->capacity();\n      }\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n    }\n    CUDA_CHECK(cudaStreamDestroy(stream));\n\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n\n    CUDA_CHECK(cudaFreeHost(h_keys));\n    CUDA_CHECK(cudaFreeHost(h_found));\n    CUDA_CHECK(cudaFreeHost(h_vectors));\n\n    CUDA_CHECK(cudaFree(d_keys));\n    CUDA_CHECK(cudaFree(d_vectors));\n    CUDA_CHECK(cudaFree(d_found));\n    CUDA_CHECK(cudaDeviceSynchronize());\n\n    CudaCheckError();\n  };\n\n  for (int i = 0; i < THREAD_N; ++i)\n    threads.emplace_back(std::thread(worker_function, i));\n\n  for (auto& th : threads) {\n    th.join();\n  }\n  ASSERT_EQ(table->capacity(), MAX_CAPACITY);\n}\n\nvoid test_export_batch_if(size_t max_hbm_for_vectors, int key_start = 0) {\n  constexpr uint64_t INIT_CAPACITY = 256UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t KEY_NUM = 128UL;\n  constexpr uint64_t TEST_TIMES = 1;\n  constexpr uint64_t BUCKET_MAX_SIZE = 128ul;\n\n  K* h_keys;\n  S* h_scores;\n  V* h_vectors;\n  size_t h_dump_counter = 0;\n\n  TableOptions options;\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;\n\n  std::unique_ptr<Table> table = std::make_unique<Table>();\n  table->init(options);\n\n  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));\n\n  K* d_keys;\n  S* d_scores = nullptr;\n 
 V* d_vectors;\n  bool* d_found;\n  size_t* d_dump_counter;\n  int found_num = 0;\n  bool* h_found;\n\n  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n  CUDA_CHECK(cudaMalloc(&d_dump_counter, sizeof(size_t)));\n\n  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  S threshold = test_util::host_nano<S>();\n  uint64_t total_size = 0;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    test_util::create_random_keys<K, S, V, DIM>(h_keys, h_scores, h_vectors,\n                                                KEY_NUM);\n\n    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyHostToDevice));\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    {\n      V** d_vectors_ptr = nullptr;\n      bool* d_found;\n      CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n      table->find_or_insert(KEY_NUM, d_keys, d_vectors_ptr, d_found, nullptr,\n                            stream);\n      test_util::read_or_write_ptr(d_vectors_ptr, d_vectors, d_found,\n                                   options.dim, KEY_NUM, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      CUDA_CHECK(cudaFree(d_vectors_ptr));\n      CUDA_CHECK(cudaFree(d_found));\n    }\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    total_size = 
table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, KEY_NUM);\n\n    CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));\n    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),\n                          cudaMemcpyDeviceToHost));\n\n    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    {\n      V** d_vectors_ptr = nullptr;\n      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n      table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, nullptr, stream);\n      test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,\n                               stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      CUDA_CHECK(cudaFree(d_vectors_ptr));\n    }\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n    found_num = 0;\n    for (int i = 0; i < BUCKET_MAX_SIZE; i++) {\n      if (h_found[i]) {\n        found_num++;\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors[i * options.dim + j],\n                    static_cast<float>(h_keys[i] * 0.00001));\n        }\n      }\n    }\n    ASSERT_EQ(found_num, KEY_NUM);\n\n    K pattern = 100;\n\n    table->template export_batch_if<ExportIfPredFunctor>(\n        pattern, threshold, table->capacity(), 0, d_dump_counter, d_keys,\n        d_vectors, d_scores, stream);\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    CUDA_CHECK(cudaMemcpy(&h_dump_counter, d_dump_counter, sizeof(size_t),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),\n           
               cudaMemcpyDeviceToHost));\n\n    size_t expected_export_count = 0;\n    for (int i = 0; i < KEY_NUM; i++) {\n      if (h_scores[i] > threshold) expected_export_count++;\n    }\n    ASSERT_EQ(expected_export_count, h_dump_counter);\n\n    table->template export_batch_if<ExportIfPredFunctor>(\n        pattern, test_util::host_nano<S>(), table->capacity(), 0,\n        d_dump_counter, d_keys, d_vectors, d_scores, stream);\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    CUDA_CHECK(cudaMemcpy(&h_dump_counter, d_dump_counter, sizeof(size_t),\n                          cudaMemcpyDeviceToHost));\n\n    ASSERT_EQ(0, h_dump_counter);\n\n    CUDA_CHECK(cudaMemset(h_keys, 0, KEY_NUM * sizeof(K)));\n    CUDA_CHECK(cudaMemset(h_scores, 0, KEY_NUM * sizeof(S)));\n    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n\n    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n\n    for (int i = 0; i < h_dump_counter; i++) {\n      ASSERT_GT(h_scores[i], threshold);\n      for (int j = 0; j < options.dim; j++) {\n        ASSERT_EQ(h_vectors[i * options.dim + j],\n                  static_cast<float>(h_keys[i] * 0.00001));\n      }\n    }\n\n    table->clear(stream);\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n  }\n  CUDA_CHECK(cudaDeviceSynchronize());\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,\n                        cudaMemcpyDeviceToHost));\n\n  CUDA_CHECK(cudaFreeHost(h_keys));\n  CUDA_CHECK(cudaFreeHost(h_scores));\n  
CUDA_CHECK(cudaFreeHost(h_vectors));\n  CUDA_CHECK(cudaFreeHost(h_found));\n\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_vectors));\n  CUDA_CHECK(cudaFree(d_found));\n  CUDA_CHECK(cudaFree(d_dump_counter));\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_basic_for_cpu_io(int key_start = 0) {\n  constexpr uint64_t INIT_CAPACITY = 64 * 1024 * 1024UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t KEY_NUM = 1 * 1024 * 1024UL;\n  constexpr uint64_t TEST_TIMES = 1;\n\n  K* h_keys;\n  S* h_scores;\n  V* h_vectors;\n  bool* h_found;\n\n  TableOptions options;\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_hbm_for_vectors = nv::merlin::GB(0);\n  options.io_by_cpu = true;\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));\n\n  CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n\n  test_util::create_random_keys<K, S, V, DIM>(h_keys, h_scores, nullptr,\n                                              KEY_NUM);\n\n  K* d_keys;\n  S* d_scores = nullptr;\n  V* d_vectors;\n  V* d_def_val;\n  V** d_vectors_ptr;\n  bool* d_found;\n  size_t dump_counter = 0;\n\n  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_def_val, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n\n  
CUDA_CHECK(\n      cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),\n                        cudaMemcpyHostToDevice));\n\n  CUDA_CHECK(cudaMemset(d_vectors, 1, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMemset(d_def_val, 2, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMemset(d_vectors_ptr, 0, KEY_NUM * sizeof(V*)));\n  CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  uint64_t total_size = 0;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    {\n      V** d_vectors_ptr = nullptr;\n      bool* d_found;\n      CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n      table->find_or_insert(KEY_NUM, d_keys, d_vectors_ptr, d_found, d_scores,\n                            stream);\n      test_util::read_or_write_ptr(d_vectors_ptr, d_vectors, d_found,\n                                   options.dim, KEY_NUM, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      CUDA_CHECK(cudaFree(d_vectors_ptr));\n      CUDA_CHECK(cudaFree(d_found));\n    }\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, KEY_NUM);\n\n    CUDA_CHECK(cudaMemset(d_vectors, 2, KEY_NUM * sizeof(V) * options.dim));\n    table->assign(KEY_NUM, d_keys, d_vectors, d_scores, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, KEY_NUM);\n\n    {\n      V** d_vectors_ptr = nullptr;\n      
CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n      table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, nullptr, stream);\n      test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,\n                               stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      CUDA_CHECK(cudaFree(d_vectors_ptr));\n    }\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    int found_num = 0;\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n\n    for (int i = 0; i < KEY_NUM; i++) {\n      if (h_found[i]) found_num++;\n    }\n    ASSERT_EQ(found_num, KEY_NUM);\n\n    table->accum_or_assign(KEY_NUM, d_keys, d_vectors, d_found, d_scores,\n                           stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, KEY_NUM);\n\n    table->erase(KEY_NUM >> 1, d_keys, stream);\n    size_t total_size_after_erase = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size_after_erase, total_size >> 1);\n\n    table->clear(stream);\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    {\n      V** d_vectors_ptr = nullptr;\n      bool* d_found;\n      CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n      table->find_or_insert(KEY_NUM, d_keys, d_vectors_ptr, d_found, d_scores,\n                            stream);\n      test_util::read_or_write_ptr(d_vectors_ptr, d_vectors, d_found,\n                                   options.dim, KEY_NUM, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      CUDA_CHECK(cudaFree(d_vectors_ptr));\n      CUDA_CHECK(cudaFree(d_found));\n    }\n\n    dump_counter = table->export_batch(table->capacity(), 0, d_keys, 
d_vectors,\n                                       d_scores, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(dump_counter, KEY_NUM);\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,\n                        cudaMemcpyDeviceToHost));\n\n  CUDA_CHECK(cudaFreeHost(h_keys));\n  CUDA_CHECK(cudaFreeHost(h_scores));\n  CUDA_CHECK(cudaFreeHost(h_vectors));\n  CUDA_CHECK(cudaFreeHost(h_found));\n\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_vectors));\n  CUDA_CHECK(cudaFree(d_def_val));\n  CUDA_CHECK(cudaFree(d_vectors_ptr));\n  CUDA_CHECK(cudaFree(d_found));\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_evict_strategy_lru_basic(size_t max_hbm_for_vectors,\n                                   int key_start = 0) {\n  constexpr uint64_t BUCKET_NUM = 8UL;\n  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;\n  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;\n  constexpr uint64_t TEST_KEY_NUM = 4;\n  constexpr uint64_t TEMP_KEY_NUM =\n      (BASE_KEY_NUM > TEST_KEY_NUM) ? 
BASE_KEY_NUM : TEST_KEY_NUM;\n  constexpr uint64_t TEST_TIMES = 128;\n\n  TableOptions options;\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;\n\n  std::vector<K> h_keys_base(BASE_KEY_NUM);\n  std::vector<S> h_scores_base(BASE_KEY_NUM);\n  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_test(TEST_KEY_NUM);\n  std::vector<S> h_scores_test(TEST_KEY_NUM);\n  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_temp(TEMP_KEY_NUM);\n  std::vector<S> h_scores_temp(TEMP_KEY_NUM);\n  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);\n\n  K* d_keys_temp;\n  S* d_scores_temp = nullptr;\n  V* d_vectors_temp;\n\n  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));\n  CUDA_CHECK(\n      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));\n\n  test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),\n      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF);\n\n  test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),\n      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,\n      0xFFFFFFFFFFFFFFFD);\n\n  h_keys_test[2] = h_keys_base[72];\n  h_keys_test[3] = h_keys_base[73];\n\n  for (int i = 0; i < options.dim; i++) {\n    h_vectors_test[2 * options.dim + i] = h_vectors_base[72 * options.dim + i];\n    h_vectors_test[3 * options.dim + i] = h_vectors_base[73 * options.dim + i];\n  }\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  size_t total_size = 0;\n  size_t dump_counter = 0;\n  S 
start_ts;\n  S end_ts;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n\n      {\n        V** d_vectors_ptr = nullptr;\n        bool* d_found;\n        CUDA_CHECK(cudaMalloc(&d_found, BASE_KEY_NUM * sizeof(bool)));\n        CUDA_CHECK(cudaMalloc(&d_vectors_ptr, BASE_KEY_NUM * sizeof(V*)));\n        start_ts = test_util::host_nano<S>(stream);\n        table->find_or_insert(BASE_KEY_NUM, d_keys_temp, d_vectors_ptr, d_found,\n                              nullptr, stream);\n        test_util::read_or_write_ptr(d_vectors_ptr, d_vectors_temp, d_found,\n                                     options.dim, BASE_KEY_NUM, stream);\n        CUDA_CHECK(cudaStreamSynchronize(stream));\n        end_ts = test_util::host_nano<S>(stream);\n        CUDA_CHECK(cudaFree(d_vectors_ptr));\n        CUDA_CHECK(cudaFree(d_found));\n      }\n\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n     
                       BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      std::vector<S> h_scores_temp_sorted(h_scores_temp);\n      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());\n\n      ASSERT_GE(h_scores_temp_sorted[0], start_ts);\n      ASSERT_LE(h_scores_temp_sorted[BASE_KEY_NUM - 1], end_ts);\n      for (int i = 0; i < dump_counter; i++) {\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),\n                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),\n                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),\n                            TEST_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      start_ts = test_util::host_nano<S>(stream);\n      table->assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp, nullptr, stream);\n\n      {\n        V** d_vectors_ptr = nullptr;\n        bool* d_found;\n        CUDA_CHECK(cudaMalloc(&d_found, TEST_KEY_NUM * sizeof(bool)));\n        CUDA_CHECK(cudaMalloc(&d_vectors_ptr, TEST_KEY_NUM * sizeof(V*)));\n        table->find_or_insert(TEST_KEY_NUM, d_keys_temp, d_vectors_ptr, d_found,\n                              nullptr, stream);\n        test_util::read_or_write_ptr(d_vectors_ptr, d_vectors_temp, d_found,\n                                     
options.dim, TEST_KEY_NUM, stream);\n        CUDA_CHECK(cudaStreamSynchronize(stream));\n        end_ts = test_util::host_nano<S>(stream);\n        CUDA_CHECK(cudaFree(d_vectors_ptr));\n        CUDA_CHECK(cudaFree(d_found));\n      }\n\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            TEMP_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      std::array<S, TEMP_KEY_NUM> h_scores_temp_sorted;\n      int ctr = 0;\n      for (int i = 0; i < TEMP_KEY_NUM; i++) {\n        if (h_keys_test.end() !=\n            std::find(h_keys_test.begin(), h_keys_test.end(), h_keys_temp[i])) {\n          ASSERT_GT(h_scores_temp[i], BUCKET_MAX_SIZE);\n          h_scores_temp_sorted[ctr++] = h_scores_temp[i];\n        } else {\n          ASSERT_LE(h_scores_temp[i], start_ts);\n        }\n      }\n      std::sort(h_scores_temp_sorted.begin(),\n                h_scores_temp_sorted.begin() + ctr);\n\n      ASSERT_GE(h_scores_temp_sorted[0], start_ts);\n      ASSERT_LE(h_scores_temp_sorted[ctr - 1], end_ts);\n      for (int i = 0; i < dump_counter; i++) {\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n    
    }\n      }\n    }\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFree(d_keys_temp));\n  CUDA_CHECK(cudaFree(d_scores_temp));\n  CUDA_CHECK(cudaFree(d_vectors_temp));\n\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_evict_strategy_lfu_basic(size_t max_hbm_for_vectors,\n                                   int key_start = 0) {\n  constexpr uint64_t BUCKET_NUM = 8UL;\n  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;\n  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;\n  constexpr uint64_t TEST_KEY_NUM = 4;\n  constexpr uint64_t TEMP_KEY_NUM =\n      (BASE_KEY_NUM > TEST_KEY_NUM) ? BASE_KEY_NUM : TEST_KEY_NUM;\n  constexpr uint64_t TEST_TIMES = 128;\n\n  TableOptions options;\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLfu>;\n\n  std::vector<K> h_keys_base(BASE_KEY_NUM);\n  std::vector<S> h_scores_base(BASE_KEY_NUM);\n  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_test(TEST_KEY_NUM);\n  std::vector<S> h_scores_test(TEST_KEY_NUM);\n  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_temp(TEMP_KEY_NUM);\n  std::vector<S> h_scores_temp(TEMP_KEY_NUM);\n  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);\n\n  K* d_keys_temp;\n  S* d_scores_temp = nullptr;\n  V* d_vectors_temp;\n\n  int freq_range = 1000;\n\n  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));\n  CUDA_CHECK(\n      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));\n\n  test_util::create_keys_in_one_buckets_lfu<K, S, V, DIM>(\n    
  h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),\n      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF,\n      freq_range);\n\n  test_util::create_keys_in_one_buckets_lfu<K, S, V, DIM>(\n      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),\n      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,\n      0xFFFFFFFFFFFFFFFD, freq_range);\n\n  h_keys_test[2] = h_keys_base[72];\n  h_keys_test[3] = h_keys_base[73];\n\n  h_scores_test[2] = h_keys_base[72] % freq_range;\n  h_scores_test[3] = h_keys_base[73] % freq_range;\n\n  for (int i = 0; i < options.dim; i++) {\n    h_vectors_test[2 * options.dim + i] = h_vectors_base[72 * options.dim + i];\n    h_vectors_test[3 * options.dim + i] = h_vectors_base[73 * options.dim + i];\n  }\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  size_t total_size = 0;\n  size_t dump_counter = 0;\n  S global_epoch = 1;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      {\n        V** d_vectors_ptr = nullptr;\n        bool* d_found;\n        CUDA_CHECK(cudaMalloc(&d_found, BASE_KEY_NUM * sizeof(bool)));\n        CUDA_CHECK(cudaMalloc(&d_vectors_ptr, BASE_KEY_NUM * sizeof(V*)));\n        table->find_or_insert(BASE_KEY_NUM, d_keys_temp, d_vectors_ptr, 
d_found,\n                              d_scores_temp, stream);\n        test_util::read_or_write_ptr(d_vectors_ptr, d_vectors_temp, d_found,\n                                     options.dim, BASE_KEY_NUM, stream);\n        CUDA_CHECK(cudaStreamSynchronize(stream));\n        CUDA_CHECK(cudaFree(d_vectors_ptr));\n        CUDA_CHECK(cudaFree(d_found));\n      }\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      for (int i = 0; i < dump_counter; i++) {\n        ASSERT_EQ(h_scores_temp[i], h_keys_temp[i] % freq_range);\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n\n    {\n      global_epoch++;\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),\n                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),\n                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),\n                            TEST_KEY_NUM * sizeof(V) * options.dim,\n                            
cudaMemcpyHostToDevice));\n      table->assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp, d_scores_temp,\n                    stream);\n      {\n        V** d_vectors_ptr = nullptr;\n        bool* d_found;\n        CUDA_CHECK(cudaMalloc(&d_found, TEST_KEY_NUM * sizeof(bool)));\n        CUDA_CHECK(cudaMalloc(&d_vectors_ptr, TEST_KEY_NUM * sizeof(V*)));\n        table->find_or_insert(TEST_KEY_NUM, d_keys_temp, d_vectors_ptr, d_found,\n                              d_scores_temp, stream);\n        test_util::read_or_write_ptr(d_vectors_ptr, d_vectors_temp, d_found,\n                                     options.dim, TEST_KEY_NUM, stream);\n        CUDA_CHECK(cudaStreamSynchronize(stream));\n        CUDA_CHECK(cudaFree(d_vectors_ptr));\n        CUDA_CHECK(cudaFree(d_found));\n      }\n\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            TEMP_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      for (int i = 0; i < dump_counter; i++) {\n        bool in_base =\n            h_keys_base.end() !=\n            std::find(h_keys_base.begin(), h_keys_base.end(), h_keys_temp[i]);\n        bool in_test =\n            h_keys_test.end() !=\n            std::find(h_keys_test.begin(), h_keys_test.end(), h_keys_temp[i]);\n     
   if (in_base && in_test) {\n          ASSERT_EQ(h_scores_temp[i], (h_keys_temp[i] % freq_range) *\n                                          3);  // update score when found.\n        } else {\n          ASSERT_EQ(h_scores_temp[i], (h_keys_temp[i] % freq_range));\n        }\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFree(d_keys_temp));\n  CUDA_CHECK(cudaFree(d_scores_temp));\n  CUDA_CHECK(cudaFree(d_vectors_temp));\n\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_evict_strategy_epochlru_basic(size_t max_hbm_for_vectors,\n                                        int key_start = 0) {\n  constexpr int RSHIFT_ON_NANO = 20;\n\n  constexpr uint64_t BUCKET_NUM = 8UL;\n  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;\n  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;\n  constexpr uint64_t TEST_KEY_NUM = 4;\n  constexpr uint64_t TEMP_KEY_NUM =\n      (BASE_KEY_NUM > TEST_KEY_NUM) ? 
BASE_KEY_NUM : TEST_KEY_NUM;\n  constexpr uint64_t TEST_TIMES = 128;\n\n  TableOptions options;\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kEpochLru>;\n\n  std::vector<K> h_keys_base(BASE_KEY_NUM);\n  std::vector<S> h_scores_base(BASE_KEY_NUM);\n  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_test(TEST_KEY_NUM);\n  std::vector<S> h_scores_test(TEST_KEY_NUM);\n  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_temp(TEMP_KEY_NUM);\n  std::vector<S> h_scores_temp(TEMP_KEY_NUM);\n  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);\n\n  K* d_keys_temp;\n  S* d_scores_temp = nullptr;\n  V* d_vectors_temp;\n\n  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));\n  CUDA_CHECK(\n      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));\n\n  test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),\n      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF);\n\n  test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),\n      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,\n      0xFFFFFFFFFFFFFFFD);\n\n  h_keys_test[2] = h_keys_base[72];\n  h_keys_test[3] = h_keys_base[73];\n\n  for (int i = 0; i < options.dim; i++) {\n    h_vectors_test[2 * options.dim + i] = h_vectors_base[72 * options.dim + i];\n    h_vectors_test[3 * options.dim + i] = h_vectors_base[73 * options.dim + i];\n  }\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  size_t total_size = 0;\n  size_t dump_counter = 0;\n  S 
start_ts;\n  S end_ts;\n\n  S global_epoch = 1;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      {\n        V** d_vectors_ptr = nullptr;\n        bool* d_found;\n        CUDA_CHECK(cudaMalloc(&d_found, BASE_KEY_NUM * sizeof(bool)));\n        CUDA_CHECK(cudaMalloc(&d_vectors_ptr, BASE_KEY_NUM * sizeof(V*)));\n        start_ts =\n            (test_util::host_nano<S>(stream) >> RSHIFT_ON_NANO) & 0xFFFFFFFF;\n        table->set_global_epoch(global_epoch);\n        table->find_or_insert(BASE_KEY_NUM, d_keys_temp, d_vectors_ptr, d_found,\n                              nullptr, stream);\n        test_util::read_or_write_ptr(d_vectors_ptr, d_vectors_temp, d_found,\n                                     options.dim, BASE_KEY_NUM, stream);\n        CUDA_CHECK(cudaStreamSynchronize(stream));\n        end_ts =\n            (test_util::host_nano<S>(stream) >> RSHIFT_ON_NANO) & 0xFFFFFFFF;\n        CUDA_CHECK(cudaFree(d_vectors_ptr));\n        CUDA_CHECK(cudaFree(d_found));\n      }\n\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                     
    d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      std::vector<S> h_scores_temp_sorted(h_scores_temp);\n      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());\n\n      ASSERT_GE(h_scores_temp_sorted[0], (global_epoch << 32 | start_ts));\n      ASSERT_LE(h_scores_temp_sorted[BASE_KEY_NUM - 1],\n                (global_epoch << 32 | end_ts));\n      for (int i = 0; i < dump_counter; i++) {\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n\n    {\n      global_epoch++;\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),\n                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),\n                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),\n                            TEST_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      start_ts =\n          (test_util::host_nano<S>(stream) >> RSHIFT_ON_NANO) & 0xFFFFFFFF;\n      table->set_global_epoch(global_epoch);\n      table->assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp, nullptr, stream);\n      {\n        V** d_vectors_ptr = nullptr;\n        bool* d_found;\n        CUDA_CHECK(cudaMalloc(&d_found, TEST_KEY_NUM * 
sizeof(bool)));\n        CUDA_CHECK(cudaMalloc(&d_vectors_ptr, TEST_KEY_NUM * sizeof(V*)));\n        table->find_or_insert(TEST_KEY_NUM, d_keys_temp, d_vectors_ptr, d_found,\n                              nullptr, stream);\n        test_util::read_or_write_ptr(d_vectors_ptr, d_vectors_temp, d_found,\n                                     options.dim, TEST_KEY_NUM, stream);\n        CUDA_CHECK(cudaStreamSynchronize(stream));\n\n        end_ts =\n            (test_util::host_nano<S>(stream) >> RSHIFT_ON_NANO) & 0xFFFFFFFF;\n        CUDA_CHECK(cudaFree(d_vectors_ptr));\n        CUDA_CHECK(cudaFree(d_found));\n      }\n\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            TEMP_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      std::vector<S> h_scores_temp_sorted;\n      h_scores_temp_sorted.reserve(TEMP_KEY_NUM);\n      for (int i = 0; i < TEMP_KEY_NUM; i++) {\n        if (h_keys_test.end() !=\n            std::find(h_keys_test.begin(), h_keys_test.end(), h_keys_temp[i])) {\n          ASSERT_GE(h_scores_temp[i], (global_epoch << 32 | start_ts));\n          h_scores_temp_sorted.push_back(h_scores_temp[i]);\n        } else {\n          ASSERT_LE(h_scores_temp[i], (global_epoch << 32 | start_ts));\n        }\n   
   }\n      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());\n\n      if (!h_scores_temp_sorted.empty()) {\n        ASSERT_GE(h_scores_temp_sorted[0], (global_epoch << 32 | start_ts));\n        ASSERT_LE(h_scores_temp_sorted[h_scores_temp_sorted.size() - 1],\n                  (global_epoch << 32 | end_ts));\n      }\n      for (int i = 0; i < dump_counter; i++) {\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFree(d_keys_temp));\n  CUDA_CHECK(cudaFree(d_scores_temp));\n  CUDA_CHECK(cudaFree(d_vectors_temp));\n\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors,\n                                        int key_start = 0) {\n  constexpr uint64_t BUCKET_NUM = 8UL;\n  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;\n  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;\n  constexpr uint64_t TEST_KEY_NUM = 4;\n  constexpr uint64_t TEMP_KEY_NUM =\n      (BASE_KEY_NUM > TEST_KEY_NUM) ? 
BASE_KEY_NUM : TEST_KEY_NUM;\n  constexpr uint64_t TEST_TIMES = 128;\n\n  TableOptions options;\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kEpochLfu>;\n\n  std::vector<K> h_keys_base(BASE_KEY_NUM);\n  std::vector<S> h_scores_base(BASE_KEY_NUM);\n  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_test(TEST_KEY_NUM);\n  std::vector<S> h_scores_test(TEST_KEY_NUM);\n  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_temp(TEMP_KEY_NUM);\n  std::vector<S> h_scores_temp(TEMP_KEY_NUM);\n  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);\n\n  K* d_keys_temp;\n  S* d_scores_temp = nullptr;\n  V* d_vectors_temp;\n\n  int freq_range = 1000;\n\n  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));\n  CUDA_CHECK(\n      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));\n\n  test_util::create_keys_in_one_buckets_lfu<K, S, V, DIM>(\n      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),\n      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF,\n      freq_range);\n\n  test_util::create_keys_in_one_buckets_lfu<K, S, V, DIM>(\n      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),\n      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,\n      0xFFFFFFFFFFFFFFFD, freq_range);\n\n  // Simulate overflow of low 32bits.\n  h_scores_base[71] = static_cast<S>(std::numeric_limits<uint32_t>::max() -\n                                     static_cast<uint32_t>(1));\n\n  h_keys_test[1] = h_keys_base[71];\n  h_keys_test[2] = h_keys_base[72];\n  h_keys_test[3] = h_keys_base[73];\n\n  h_scores_test[1] = h_scores_base[71];\n  
h_scores_test[2] = h_keys_base[72] % freq_range;\n  h_scores_test[3] = h_keys_base[73] % freq_range;\n\n  for (int i = 0; i < options.dim; i++) {\n    h_vectors_test[1 * options.dim + i] = h_vectors_base[71 * options.dim + i];\n    h_vectors_test[2 * options.dim + i] = h_vectors_base[72 * options.dim + i];\n    h_vectors_test[3 * options.dim + i] = h_vectors_base[73 * options.dim + i];\n  }\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  size_t total_size = 0;\n  size_t dump_counter = 0;\n  S global_epoch = 1;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      {\n        V** d_vectors_ptr = nullptr;\n        bool* d_found;\n        CUDA_CHECK(cudaMalloc(&d_found, BASE_KEY_NUM * sizeof(bool)));\n        CUDA_CHECK(cudaMalloc(&d_vectors_ptr, BASE_KEY_NUM * sizeof(V*)));\n        table->set_global_epoch(global_epoch);\n        table->find_or_insert(BASE_KEY_NUM, d_keys_temp, d_vectors_ptr, d_found,\n                              d_scores_temp, stream);\n        test_util::read_or_write_ptr(d_vectors_ptr, d_vectors_temp, d_found,\n                                     options.dim, BASE_KEY_NUM, stream);\n        CUDA_CHECK(cudaStreamSynchronize(stream));\n        CUDA_CHECK(cudaFree(d_vectors_ptr));\n        CUDA_CHECK(cudaFree(d_found));\n      }\n\n      
CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      for (int i = 0; i < dump_counter; i++) {\n        if (h_keys_temp[i] == h_keys_base[71]) {\n          S expected_score = test_util::make_expected_score_for_epochlfu<S>(\n              global_epoch, h_scores_base[71]);\n          ASSERT_EQ(h_scores_temp[i], expected_score);\n        } else {\n          S expected_score = test_util::make_expected_score_for_epochlfu<S>(\n              global_epoch, (h_keys_temp[i] % freq_range));\n          ASSERT_EQ(h_scores_temp[i], expected_score);\n        }\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n\n    {\n      global_epoch++;\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),\n                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),\n                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),\n                            
TEST_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      table->set_global_epoch(global_epoch);\n      table->assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp, d_scores_temp,\n                    stream);\n      {\n        V** d_vectors_ptr = nullptr;\n        bool* d_found;\n        CUDA_CHECK(cudaMalloc(&d_found, TEST_KEY_NUM * sizeof(bool)));\n        CUDA_CHECK(cudaMalloc(&d_vectors_ptr, TEST_KEY_NUM * sizeof(V*)));\n        table->find_or_insert(TEST_KEY_NUM, d_keys_temp, d_vectors_ptr, d_found,\n                              d_scores_temp, stream);\n        test_util::read_or_write_ptr(d_vectors_ptr, d_vectors_temp, d_found,\n                                     options.dim, TEST_KEY_NUM, stream);\n        CUDA_CHECK(cudaStreamSynchronize(stream));\n        CUDA_CHECK(cudaFree(d_vectors_ptr));\n        CUDA_CHECK(cudaFree(d_found));\n      }\n\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            TEMP_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      ASSERT_TRUE(h_keys_temp.end() != std::find(h_keys_temp.begin(),\n                                                 h_keys_temp.end(),\n                                                 h_keys_base[71]));\n\n   
   for (int i = 0; i < dump_counter; i++) {\n        bool in_base =\n            h_keys_base.end() !=\n            std::find(h_keys_base.begin(), h_keys_base.end(), h_keys_temp[i]);\n        bool in_test =\n            h_keys_test.end() !=\n            std::find(h_keys_test.begin(), h_keys_test.end(), h_keys_temp[i]);\n\n        if (in_base && in_test) {\n          if (h_keys_temp[i] == h_keys_base[71]) {\n            S expected_score = test_util::make_expected_score_for_epochlfu<S>(\n                global_epoch, h_scores_base[71] * 2);\n            ASSERT_EQ(h_scores_temp[i], expected_score);\n          } else {\n            S expected_score = test_util::make_expected_score_for_epochlfu<S>(\n                global_epoch,\n                (h_keys_temp[i] % freq_range) * 3);  // update score when found.\n            ASSERT_EQ(h_scores_temp[i], expected_score);\n          }\n        } else {\n          if (h_keys_temp[i] == h_keys_base[71]) {\n            S expected_score = test_util::make_expected_score_for_epochlfu<S>(\n                global_epoch - static_cast<S>(in_base), h_scores_base[71]);\n            ASSERT_EQ(h_scores_temp[i], expected_score);\n          } else {\n            S expected_score = test_util::make_expected_score_for_epochlfu<S>(\n                global_epoch - static_cast<S>(in_base),\n                (h_keys_temp[i] % freq_range));\n\n            ASSERT_EQ(h_scores_temp[i], expected_score);\n          }\n        }\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFree(d_keys_temp));\n  CUDA_CHECK(cudaFree(d_scores_temp));\n  CUDA_CHECK(cudaFree(d_vectors_temp));\n\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_evict_strategy_customized_basic(size_t max_hbm_for_vectors,\n                           
               int key_start = 0) {\n  constexpr uint64_t BUCKET_NUM = 8UL;\n  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;\n  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;\n  constexpr uint64_t TEST_KEY_NUM = 128;\n  constexpr uint64_t TEMP_KEY_NUM =\n      (BASE_KEY_NUM > TEST_KEY_NUM) ? BASE_KEY_NUM : TEST_KEY_NUM;\n  constexpr uint64_t TEST_TIMES = 128;\n\n  TableOptions options;\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  std::vector<K> h_keys_base(BASE_KEY_NUM);\n  std::vector<S> h_scores_base(BASE_KEY_NUM);\n  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_test(TEST_KEY_NUM);\n  std::vector<S> h_scores_test(TEST_KEY_NUM);\n  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_temp(TEMP_KEY_NUM);\n  std::vector<S> h_scores_temp(TEMP_KEY_NUM);\n  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);\n\n  K* d_keys_temp;\n  S* d_scores_temp = nullptr;\n  V* d_vectors_temp;\n\n  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));\n  CUDA_CHECK(\n      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));\n\n  test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),\n      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF);\n\n  const S base_score_start = 1000;\n  for (int i = 0; i < BASE_KEY_NUM; i++) {\n    h_scores_base[i] = base_score_start + i;\n  }\n\n  test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n      h_keys_test.data(), 
h_scores_test.data(), h_vectors_test.data(),\n      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,\n      0xFFFFFFFFFFFFFFFD);\n  const S test_score_start = base_score_start + BASE_KEY_NUM;\n  for (int i = 0; i < TEST_KEY_NUM; i++) {\n    h_scores_test[i] = test_score_start + i;\n  }\n  for (int i = 64; i < TEST_KEY_NUM; i++) {\n    h_keys_test[i] = h_keys_base[i];\n    for (int j = 0; j < options.dim; j++) {\n      h_vectors_test[i * options.dim + j] = h_vectors_base[i * options.dim + j];\n    }\n  }\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  size_t total_size = 0;\n  size_t dump_counter = 0;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      {\n        V** d_vectors_ptr = nullptr;\n        bool* d_found;\n        CUDA_CHECK(cudaMalloc(&d_found, BASE_KEY_NUM * sizeof(bool)));\n        CUDA_CHECK(cudaMalloc(&d_vectors_ptr, BASE_KEY_NUM * sizeof(V*)));\n        table->find_or_insert(BASE_KEY_NUM, d_keys_temp, d_vectors_ptr, d_found,\n                              d_scores_temp, stream);\n        test_util::read_or_write_ptr(d_vectors_ptr, d_vectors_temp, d_found,\n                                     options.dim, BASE_KEY_NUM, stream);\n        CUDA_CHECK(cudaStreamSynchronize(stream));\n        
CUDA_CHECK(cudaFree(d_vectors_ptr));\n        CUDA_CHECK(cudaFree(d_found));\n      }\n\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      std::vector<S> h_scores_temp_sorted(h_scores_temp);\n      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());\n\n      auto expected_range = test_util::range<S, TEMP_KEY_NUM>(base_score_start);\n      ASSERT_TRUE(std::equal(h_scores_temp_sorted.begin(),\n                             h_scores_temp_sorted.end(),\n                             expected_range.begin()));\n      for (int i = 0; i < dump_counter; i++) {\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),\n                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),\n                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),\n                       
     TEST_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      table->assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp, d_scores_temp,\n                    stream);\n      {\n        V** d_vectors_ptr = nullptr;\n        bool* d_found;\n        CUDA_CHECK(cudaMalloc(&d_found, TEST_KEY_NUM * sizeof(bool)));\n        CUDA_CHECK(cudaMalloc(&d_vectors_ptr, TEST_KEY_NUM * sizeof(V*)));\n        table->find_or_insert(TEST_KEY_NUM, d_keys_temp, d_vectors_ptr, d_found,\n                              d_scores_temp, stream);\n        test_util::read_or_write_ptr(d_vectors_ptr, d_vectors_temp, d_found,\n                                     options.dim, TEST_KEY_NUM, stream);\n        CUDA_CHECK(cudaStreamSynchronize(stream));\n        CUDA_CHECK(cudaFree(d_vectors_ptr));\n        CUDA_CHECK(cudaFree(d_found));\n      }\n\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            TEMP_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      std::vector<S> h_scores_temp_sorted(h_scores_temp);\n      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());\n\n      auto expected_range_test =\n          test_util::range<S, TEST_KEY_NUM>(test_score_start);\n      
ASSERT_TRUE(std::equal(h_scores_temp_sorted.begin(),\n                             h_scores_temp_sorted.end(),\n                             expected_range_test.begin()));\n      for (int i = 0; i < dump_counter; i++) {\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFree(d_keys_temp));\n  CUDA_CHECK(cudaFree(d_scores_temp));\n  CUDA_CHECK(cudaFree(d_vectors_temp));\n\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_evict_strategy_customized_advanced(size_t max_hbm_for_vectors,\n                                             int key_start = 0) {\n  constexpr uint64_t BUCKET_NUM = 8UL;\n  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;\n  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;\n  constexpr uint64_t TEST_KEY_NUM = 8;\n  constexpr uint64_t TEMP_KEY_NUM =\n      (BASE_KEY_NUM > TEST_KEY_NUM) ? 
BASE_KEY_NUM : TEST_KEY_NUM;\n  constexpr uint64_t TEST_TIMES = 256;\n\n  TableOptions options;\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  std::vector<K> h_keys_base(BASE_KEY_NUM);\n  std::vector<S> h_scores_base(BASE_KEY_NUM);\n  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_test(TEST_KEY_NUM);\n  std::vector<S> h_scores_test(TEST_KEY_NUM);\n  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_temp(TEMP_KEY_NUM);\n  std::vector<S> h_scores_temp(TEMP_KEY_NUM);\n  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);\n\n  K* d_keys_temp;\n  S* d_scores_temp = nullptr;\n  V* d_vectors_temp;\n\n  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));\n  CUDA_CHECK(\n      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));\n\n  test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),\n      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF);\n\n  const S base_score_start = 1000;\n  for (int i = 0; i < BASE_KEY_NUM; i++) {\n    h_scores_base[i] = base_score_start + i;\n  }\n\n  test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),\n      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,\n      0xFFFFFFFFFFFFFFFD);\n\n  h_keys_test[4] = h_keys_base[72];\n  h_keys_test[5] = h_keys_base[73];\n  h_keys_test[6] = h_keys_base[74];\n  h_keys_test[7] = h_keys_base[75];\n\n  // give the four new keys lower scores so that they will not be inserted.\n  h_scores_test[0] = 20;\n  h_scores_test[1] = 78;\n  
h_scores_test[2] = 97;\n  h_scores_test[3] = 98;\n\n  // give four existing keys new scores; this only refreshes their scores.\n  h_scores_test[4] = 99;\n  h_scores_test[5] = 1010;\n  h_scores_test[6] = 1020;\n  h_scores_test[7] = 1035;\n\n  for (int i = 4; i < TEST_KEY_NUM; i++) {\n    for (int j = 0; j < options.dim; j++) {\n      h_vectors_test[i * options.dim + j] =\n          static_cast<V>(h_keys_test[i] * 0.00001);\n    }\n  }\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  size_t total_size = 0;\n  size_t dump_counter = 0;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      {\n        V** d_vectors_ptr = nullptr;\n        bool* d_found;\n        CUDA_CHECK(cudaMalloc(&d_found, BASE_KEY_NUM * sizeof(bool)));\n        CUDA_CHECK(cudaMalloc(&d_vectors_ptr, BASE_KEY_NUM * sizeof(V*)));\n        table->find_or_insert(BASE_KEY_NUM, d_keys_temp, d_vectors_ptr, d_found,\n                              d_scores_temp, stream);\n        test_util::read_or_write_ptr(d_vectors_ptr, d_vectors_temp, d_found,\n                                     options.dim, BASE_KEY_NUM, stream);\n        CUDA_CHECK(cudaStreamSynchronize(stream));\n        CUDA_CHECK(cudaFree(d_vectors_ptr));\n        CUDA_CHECK(cudaFree(d_found));\n      }\n\n      
CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      std::vector<S> h_scores_temp_sorted(h_scores_temp);\n      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());\n\n      auto expected_range = test_util::range<S, TEMP_KEY_NUM>(base_score_start);\n      ASSERT_TRUE(std::equal(h_scores_temp_sorted.begin(),\n                             h_scores_temp_sorted.end(),\n                             expected_range.begin()));\n      for (int i = 0; i < dump_counter; i++) {\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),\n                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),\n                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),\n                            TEST_KEY_NUM * sizeof(V) * options.dim,\n                            
cudaMemcpyHostToDevice));\n      table->assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp, d_scores_temp,\n                    stream);\n      {\n        V** d_vectors_ptr = nullptr;\n        bool* d_found;\n        CUDA_CHECK(cudaMalloc(&d_found, TEST_KEY_NUM * sizeof(bool)));\n        CUDA_CHECK(cudaMalloc(&d_vectors_ptr, TEST_KEY_NUM * sizeof(V*)));\n        table->find_or_insert(TEST_KEY_NUM, d_keys_temp, d_vectors_ptr, d_found,\n                              d_scores_temp, stream);\n        test_util::read_or_write_ptr(d_vectors_ptr, d_vectors_temp, d_found,\n                                     options.dim, TEST_KEY_NUM, stream);\n        CUDA_CHECK(cudaStreamSynchronize(stream));\n        CUDA_CHECK(cudaFree(d_vectors_ptr));\n        CUDA_CHECK(cudaFree(d_found));\n      }\n\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            TEMP_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      for (int i = 0; i < TEST_KEY_NUM; i++) {\n        if (i < 4) {\n          ASSERT_EQ(h_keys_temp.end(),\n                    std::find(h_keys_temp.begin(), h_keys_temp.end(),\n                              h_keys_test[i]));\n        } else {\n          ASSERT_NE(h_keys_temp.end(),\n                    
std::find(h_keys_temp.begin(), h_keys_temp.end(),\n                              h_keys_test[i]));\n        }\n      }\n      for (int i = 0; i < TEMP_KEY_NUM; i++) {\n        if (h_keys_temp[i] == h_keys_test[4])\n          ASSERT_EQ(h_scores_temp[i], h_scores_test[4]);\n        if (h_keys_temp[i] == h_keys_test[5])\n          ASSERT_EQ(h_scores_temp[i], h_scores_test[5]);\n        if (h_keys_temp[i] == h_keys_test[6])\n          ASSERT_EQ(h_scores_temp[i], h_scores_test[6]);\n        if (h_keys_temp[i] == h_keys_test[7])\n          ASSERT_EQ(h_scores_temp[i], h_scores_test[7]);\n\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFree(d_keys_temp));\n  CUDA_CHECK(cudaFree(d_scores_temp));\n  CUDA_CHECK(cudaFree(d_vectors_temp));\n\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_evict_strategy_customized_correct_rate(size_t max_hbm_for_vectors,\n                                                 int key_start = 0) {\n  constexpr uint64_t BATCH_SIZE = 1024 * 1024ul;\n  constexpr uint64_t STEPS = 128;\n  constexpr uint64_t MAX_BUCKET_SIZE = 128;\n  constexpr uint64_t INIT_CAPACITY = BATCH_SIZE * STEPS;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t TEST_TIMES = 1;\n  float expected_correct_rate = 0.964;\n  const int rounds = 12;\n\n  TableOptions options;\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_bucket_size = MAX_BUCKET_SIZE;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  K* h_keys_base = test_util::HostBuffer<K>(BATCH_SIZE).ptr();\n  S* h_scores_base = 
test_util::HostBuffer<S>(BATCH_SIZE).ptr();\n  V* h_vectors_base = test_util::HostBuffer<V>(BATCH_SIZE * options.dim).ptr();\n\n  K* h_keys_temp = test_util::HostBuffer<K>(MAX_CAPACITY).ptr();\n  S* h_scores_temp = test_util::HostBuffer<S>(MAX_CAPACITY).ptr();\n  V* h_vectors_temp =\n      test_util::HostBuffer<V>(MAX_CAPACITY * options.dim).ptr();\n\n  K* d_keys_temp;\n  S* d_scores_temp = nullptr;\n  V* d_vectors_temp;\n\n  CUDA_CHECK(cudaMalloc(&d_keys_temp, MAX_CAPACITY * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores_temp, MAX_CAPACITY * sizeof(S)));\n  CUDA_CHECK(\n      cudaMalloc(&d_vectors_temp, MAX_CAPACITY * sizeof(V) * options.dim));\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  size_t total_size = 0;\n  size_t global_start_key = 100000;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n    size_t start_key = global_start_key;\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    for (int r = 0; r < rounds; r++) {\n      size_t expected_min_key = global_start_key + INIT_CAPACITY * r;\n      size_t expected_max_key = global_start_key + INIT_CAPACITY * (r + 1) - 1;\n      size_t expected_table_size =\n          (r == 0) ? 
size_t(expected_correct_rate * INIT_CAPACITY)\n                   : INIT_CAPACITY;\n\n      for (int s = 0; s < STEPS; s++) {\n        test_util::create_continuous_keys<K, S, V, DIM>(\n            h_keys_base, h_scores_base, h_vectors_base, BATCH_SIZE, start_key);\n        start_key += BATCH_SIZE;\n\n        CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base, BATCH_SIZE * sizeof(K),\n                              cudaMemcpyHostToDevice));\n        CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base,\n                              BATCH_SIZE * sizeof(S), cudaMemcpyHostToDevice));\n        CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base,\n                              BATCH_SIZE * sizeof(V) * options.dim,\n                              cudaMemcpyHostToDevice));\n        table->assign(BATCH_SIZE, d_keys_temp, d_vectors_temp, d_scores_temp,\n                      stream);\n        {\n          V** d_vectors_ptr = nullptr;\n          bool* d_found;\n          CUDA_CHECK(cudaMalloc(&d_found, BATCH_SIZE * sizeof(bool)));\n          CUDA_CHECK(cudaMalloc(&d_vectors_ptr, BATCH_SIZE * sizeof(V*)));\n          table->find_or_insert(BATCH_SIZE, d_keys_temp, d_vectors_ptr, d_found,\n                                d_scores_temp, stream);\n          test_util::read_or_write_ptr(d_vectors_ptr, d_vectors_temp, d_found,\n                                       options.dim, BATCH_SIZE, stream);\n          CUDA_CHECK(cudaStreamSynchronize(stream));\n          CUDA_CHECK(cudaFree(d_vectors_ptr));\n          CUDA_CHECK(cudaFree(d_found));\n        }\n        CUDA_CHECK(cudaStreamSynchronize(stream));\n      }\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_GE(total_size, expected_table_size);\n      ASSERT_EQ(MAX_CAPACITY, table->capacity());\n\n      size_t dump_counter = table->export_batch(\n          MAX_CAPACITY, 0, d_keys_temp, d_vectors_temp, d_scores_temp, stream);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp, 
d_keys_temp, MAX_CAPACITY * sizeof(K),\n                            cudaMemcpyDeviceToHost));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp, d_scores_temp,\n                            MAX_CAPACITY * sizeof(S), cudaMemcpyDeviceToHost));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp, d_vectors_temp,\n                            MAX_CAPACITY * sizeof(V) * options.dim,\n                            cudaMemcpyDeviceToHost));\n\n      size_t bigger_score_counter = 0;\n      K max_key = 0;\n      size_t values_error_counter = 0;\n      for (int i = 0; i < dump_counter; i++) {\n        ASSERT_EQ(h_keys_temp[i], h_scores_temp[i]);\n        max_key = std::max(max_key, h_keys_temp[i]);\n        if (h_scores_temp[i] >= expected_min_key) bigger_score_counter++;\n        for (int j = 0; j < options.dim; j++) {\n          if (h_vectors_temp[i * options.dim + j] !=\n              static_cast<float>(h_keys_temp[i] * 0.00001)) {\n            values_error_counter++;\n          }\n        }\n      }\n\n      ASSERT_EQ(values_error_counter, 0);\n      float correct_rate = (bigger_score_counter * 1.0) / MAX_CAPACITY;\n      std::cout << std::setprecision(3) << \"[Round \" << r << \"]\"\n                << \"correct_rate=\" << correct_rate << std::endl;\n      ASSERT_GE(max_key, expected_max_key);\n      ASSERT_GE(correct_rate, expected_correct_rate);\n    }\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFree(d_keys_temp));\n  CUDA_CHECK(cudaFree(d_scores_temp));\n  CUDA_CHECK(cudaFree(d_vectors_temp));\n\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_find_or_insert_multi_threads(size_t max_hbm_for_vectors,\n                                       const float BATCH_0_RATIO,\n                                       const float BATCH_1_RATIO,\n                                       bool capacity_silent = true) {\n  constexpr uint64_t THREAD_N = 64UL;\n  const uint64_t BATCH_0_SIZE = static_cast<uint64_t>(THREAD_N * BATCH_0_RATIO);\n  const 
uint64_t BATCH_1_SIZE = static_cast<uint64_t>(THREAD_N * BATCH_1_RATIO);\n  const uint64_t BATCH_2_SIZE = THREAD_N - BATCH_0_SIZE - BATCH_1_SIZE;\n\n  constexpr uint64_t INIT_CAPACITY = 32 * 1024 * 1024UL;\n  constexpr uint64_t MAX_CAPACITY = 128 * 1024 * 1024UL;\n  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;\n  constexpr uint64_t KEY_NUM = 1 * 1024 * 1024UL;\n\n  std::vector<std::thread> threads;\n\n  TableOptions options;\n\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_load_factor = 0.50f;\n  options.max_bucket_size = BUCKET_MAX_SIZE;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;\n\n  std::shared_ptr<Table> table = std::make_shared<Table>();\n  table->init(options);\n  // assert every key is different\n  auto worker1 = [&table, KEY_NUM, options, capacity_silent](int batch,\n                                                             int task_n) {\n    K* h_keys;\n    V* h_vectors;\n    bool* h_found;\n\n    size_t current_capacity = table->capacity();\n\n    CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));\n    CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));\n\n    K* d_keys;\n    V* d_vectors;\n    bool* d_found;\n\n    CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));\n    CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n\n    cudaStream_t stream;\n    CUDA_CHECK(cudaStreamCreate(&stream));\n\n    test_util::create_random_keys<K, S, V, DIM>(h_keys, nullptr, h_vectors,\n                                                KEY_NUM);\n    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,\n                  
        KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n\n    table->assign(KEY_NUM, d_keys, d_vectors, nullptr, stream);\n    {\n      V** d_vectors_ptr = nullptr;\n      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n      table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, nullptr, stream);\n      test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,\n                               stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      CUDA_CHECK(cudaFree(d_vectors_ptr));\n    }\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    {\n      int found_num = 0;\n      CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));\n      CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                            cudaMemcpyDeviceToHost));\n      for (int i = 0; i < KEY_NUM; i++) {\n        if (h_found[i]) {\n          found_num++;\n        }\n      }\n      ASSERT_EQ(found_num, 0);\n    }\n\n    {\n      V** d_vectors_ptr = nullptr;\n      bool* d_found;\n      CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n      table->find_or_insert(KEY_NUM, d_keys, d_vectors_ptr, d_found, nullptr,\n                            stream);\n      test_util::read_or_write_ptr(d_vectors_ptr, d_vectors, d_found,\n                                   options.dim, KEY_NUM, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      CUDA_CHECK(cudaFree(d_vectors_ptr));\n      CUDA_CHECK(cudaFree(d_found));\n    }\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    {\n      V** d_vectors_ptr = nullptr;\n      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n      table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, nullptr, stream);\n      
test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,\n                               stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      CUDA_CHECK(cudaFree(d_vectors_ptr));\n    }\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    int found_num = 0;\n\n    CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));\n    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n    thread_local bool print_unequal{false};\n    thread_local uint64_t err_times{0};\n    for (int i = 0; i < KEY_NUM; i++) {\n      if (h_found[i]) {\n        found_num++;\n        for (int j = 0; j < options.dim; j++) {\n          if (batch == 2) {\n            if (h_vectors[i * options.dim + j] !=\n                static_cast<float>(h_keys[i] * 0.00001)) {\n              if (!print_unequal) {\n                std::cout << \" [Thread \" << task_n << \"]\\t\";\n                UNEQUAL_EXPR(h_vectors[i * options.dim + j],\n                             static_cast<float>(h_keys[i] * 0.00001));\n                print_unequal = true;\n              }\n              err_times += 1;\n            }\n          } else {\n            ASSERT_EQ(h_vectors[i * options.dim + j],\n                      static_cast<float>(h_keys[i] * 0.00001));\n          }\n        }\n      }\n    }\n\n    bool print_thread_id{false};\n    if (batch == 0 || batch == 1) {\n      ASSERT_EQ(found_num, KEY_NUM);\n      ASSERT_EQ(err_times, 0);\n    } else {\n      if (found_num != KEY_NUM or err_times != 0) {\n        std::cout << \" [Thread \" << task_n << \"]\\t\"\n           
       << \"Number of keys(insert/found/error) : \" << \"(\" << KEY_NUM\n                  << \"/\" << found_num << \"/\" << err_times << \") \\t\";\n        print_thread_id = true;\n      }\n    }\n    if (current_capacity != table->capacity() && !capacity_silent) {\n      if (!print_thread_id) std::cout << \" [Thread \" << task_n << \"]\\t\";\n\n      std::cout << \"The capacity changed from \" << current_capacity << \" to \"\n                << table->capacity() << std::endl;\n    } else if (print_thread_id) {\n      std::cout << std::endl;\n    }\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    CUDA_CHECK(cudaStreamDestroy(stream));\n\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n\n    CUDA_CHECK(cudaFreeHost(h_keys));\n    CUDA_CHECK(cudaFreeHost(h_found));\n    CUDA_CHECK(cudaFreeHost(h_vectors));\n\n    CUDA_CHECK(cudaFree(d_keys));\n    CUDA_CHECK(cudaFree(d_vectors));\n    CUDA_CHECK(cudaFree(d_found));\n    CUDA_CHECK(cudaDeviceSynchronize());\n\n    CudaCheckError();\n  };\n  auto worker2 = [&table, KEY_NUM, options, capacity_silent](int batch,\n                                                             int task_n) {\n    K* h_keys;\n    V* h_vectors;\n    bool* h_found;\n\n    size_t current_capacity = table->capacity();\n\n    CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));\n    CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));\n\n    K* d_keys;\n    V* d_vectors;\n    V* d_new_vectors;\n    bool* d_found;\n\n    CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));\n    CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMalloc(&d_new_vectors, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n\n    cudaStream_t stream;\n    
CUDA_CHECK(cudaStreamCreate(&stream));\n\n    test_util::create_random_keys<K, S, V, DIM>(h_keys, nullptr, h_vectors,\n                                                KEY_NUM);\n    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n    CUDA_CHECK(cudaMemset(d_new_vectors, 2, KEY_NUM * sizeof(V) * options.dim));\n\n    {\n      V** d_vectors_ptr = nullptr;\n      bool* d_found;\n      CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n      table->find_or_insert(KEY_NUM, d_keys, d_vectors_ptr, d_found, nullptr,\n                            stream);\n      test_util::read_or_write_ptr(d_vectors_ptr, d_vectors, d_found,\n                                   options.dim, KEY_NUM, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      CUDA_CHECK(cudaFree(d_vectors_ptr));\n      CUDA_CHECK(cudaFree(d_found));\n    }\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    table->assign(KEY_NUM, d_keys, d_new_vectors, nullptr, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    {\n      V** d_vectors_ptr = nullptr;\n      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n      table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, nullptr, stream);\n      test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,\n                               stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      CUDA_CHECK(cudaFree(d_vectors_ptr));\n    }\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    int found_num = 0;\n\n    CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));\n 
   CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n    thread_local bool print_unequal{false};\n    thread_local uint64_t err_times{0};\n    uint32_t i_value = 0x2020202;\n    for (int i = 0; i < KEY_NUM; i++) {\n      if (h_found[i]) {\n        found_num++;\n        for (int j = 0; j < options.dim; j++) {\n          if (batch == 2) {\n            if (h_vectors[i * options.dim + j] !=\n                *(reinterpret_cast<float*>(&i_value))) {\n              if (!print_unequal) {\n                std::cout << \" [Thread \" << task_n << \"]\\t\";\n                UNEQUAL_EXPR(h_vectors[i * options.dim + j],\n                             *(reinterpret_cast<float*>(&i_value)));\n                print_unequal = true;\n              }\n              err_times += 1;\n            }\n          } else {\n            ASSERT_EQ(h_vectors[i * options.dim + j],\n                      *(reinterpret_cast<float*>(&i_value)));\n          }\n        }\n      }\n    }\n\n    bool print_thread_id{false};\n    if (batch == 0 || batch == 1) {\n      ASSERT_EQ(found_num, KEY_NUM);\n      ASSERT_EQ(err_times, 0);\n    } else {\n      if (found_num != KEY_NUM or err_times != 0) {\n        std::cout << \" [Thread \" << task_n << \"]\\t\"\n                  << \"Number of keys(insert/found/error) : \" << \"(\" << KEY_NUM\n                  << \"/\" << found_num << \"/\" << err_times << \") \\t\";\n        print_thread_id = true;\n      }\n    }\n    if (current_capacity != table->capacity() && !capacity_silent) {\n      if (!print_thread_id) std::cout << \" [Thread \" << 
task_n << \"]\\t\";\n\n      std::cout << \"The capacity changed from \" << current_capacity << \" to \"\n                << table->capacity() << std::endl;\n    } else if (print_thread_id) {\n      std::cout << std::endl;\n    }\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    CUDA_CHECK(cudaStreamDestroy(stream));\n\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n\n    CUDA_CHECK(cudaFreeHost(h_keys));\n    CUDA_CHECK(cudaFreeHost(h_found));\n    CUDA_CHECK(cudaFreeHost(h_vectors));\n\n    CUDA_CHECK(cudaFree(d_keys));\n    CUDA_CHECK(cudaFree(d_vectors));\n    CUDA_CHECK(cudaFree(d_new_vectors));\n    CUDA_CHECK(cudaFree(d_found));\n    CUDA_CHECK(cudaDeviceSynchronize());\n\n    CudaCheckError();\n  };\n\n  /* the table is relative idle, and assume there is no eviction */\n  int batch = 0;\n  std::cout << \"[Batch 0] \" << BATCH_0_SIZE << \" threads\\n\";\n  for (int i = 0; i < BATCH_0_SIZE; i += 2) {\n    threads.emplace_back(std::thread(worker1, batch, i));\n    threads.emplace_back(std::thread(worker2, batch, i + 1));\n  }\n  for (auto& th : threads) {\n    th.join();\n  }\n  threads.clear();\n\n  /* test the correct of APIs serially */\n  batch = 1;\n  std::cout << \"[Batch 1] \" << BATCH_1_SIZE << \" threads\\n\";\n  for (int i = BATCH_0_SIZE; i < BATCH_0_SIZE + BATCH_1_SIZE; i += 2) {\n    auto th = std::thread(worker1, batch, i);\n    th.join();\n    th = std::thread(worker2, batch, i + 1);\n    th.join();\n  }\n\n  /* eviction may occur */\n  batch = 2;\n  std::cout << \"[Batch 2] \" << BATCH_2_SIZE << \" threads\\n\";\n  for (int i = BATCH_0_SIZE + BATCH_1_SIZE; i < THREAD_N; i += 2) {\n    threads.emplace_back(std::thread(worker1, batch, i));\n    threads.emplace_back(std::thread(worker2, batch, i + 1));\n  }\n  for (auto& th : threads) {\n    th.join();\n  }\n  ASSERT_EQ(table->capacity(), MAX_CAPACITY);\n}\n\ntemplate 
<typename K, typename V, typename S, typename Table, size_t dim = 64>\nvoid CheckFindOrInsertValues(Table* table, K* keys, V* values, S* scores,\n                             size_t len, cudaStream_t stream) {\n  std::map<K, test_util::ValueArray<V, dim>> map_before_insert;\n  std::map<K, test_util::ValueArray<V, dim>> map_after_insert;\n  K* h_tmp_keys = nullptr;\n  V* h_tmp_values = nullptr;\n  S* h_tmp_scores = nullptr;\n\n  K* d_tmp_keys = nullptr;\n  V* d_tmp_values = nullptr;\n  S* d_tmp_scores = nullptr;\n\n  size_t table_size_before = table->size(stream);\n  size_t cap = table_size_before + len;\n\n  CUDA_CHECK(cudaMallocAsync(&d_tmp_keys, cap * sizeof(K), stream));\n  CUDA_CHECK(cudaMemsetAsync(d_tmp_keys, 0, cap * sizeof(K), stream));\n  CUDA_CHECK(cudaMallocAsync(&d_tmp_values, cap * dim * sizeof(V), stream));\n  CUDA_CHECK(cudaMemsetAsync(d_tmp_values, 0, cap * dim * sizeof(V), stream));\n  CUDA_CHECK(cudaMallocAsync(&d_tmp_scores, cap * sizeof(S), stream));\n  CUDA_CHECK(cudaMemsetAsync(d_tmp_scores, 0, cap * sizeof(S), stream));\n  h_tmp_keys = (K*)malloc(cap * sizeof(K));\n  h_tmp_values = (V*)malloc(cap * dim * sizeof(V));\n  h_tmp_scores = (S*)malloc(cap * sizeof(S));\n\n  size_t table_size_verify0 = table->export_batch(\n      table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);\n  ASSERT_EQ(table_size_before, table_size_verify0);\n\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,\n                             table_size_before * sizeof(K),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,\n                             table_size_before * dim * sizeof(V),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,\n                             table_size_before * sizeof(S),\n                             cudaMemcpyDeviceToHost, stream));\n\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n\n  for 
(size_t i = 0; i < table_size_verify0; i++) {\n    test_util::ValueArray<V, dim>* vec =\n        reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +\n                                                         i * dim);\n    map_before_insert[h_tmp_keys[i]] = *vec;\n  }\n\n  auto start = std::chrono::steady_clock::now();\n\n  {\n    V** d_vectors_ptr = nullptr;\n    bool* d_found;\n    CUDA_CHECK(cudaMalloc(&d_found, len * sizeof(bool)));\n    CUDA_CHECK(cudaMalloc(&d_vectors_ptr, len * sizeof(V*)));\n    table->find_or_insert(len, keys, d_vectors_ptr, d_found, nullptr, stream);\n    test_util::read_or_write_ptr(d_vectors_ptr, values, d_found, dim, len,\n                                 stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    CUDA_CHECK(cudaFree(d_vectors_ptr));\n    CUDA_CHECK(cudaFree(d_found));\n  }\n\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  auto end = std::chrono::steady_clock::now();\n  auto diff = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);\n\n  float dur = diff.count();\n\n  size_t table_size_after = table->size(stream);\n  size_t table_size_verify1 = table->export_batch(\n      table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);\n\n  ASSERT_EQ(table_size_verify1, table_size_after);\n\n  size_t new_cap = table_size_after;\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,\n                             table_size_after * sizeof(K),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,\n                             table_size_after * dim * sizeof(V),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,\n                             table_size_after * sizeof(S),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  int64_t new_cap_K = (int64_t)new_cap;\n  for (int64_t i = new_cap_K - 1; 
i >= 0; i--) {\n    test_util::ValueArray<V, dim>* vec =\n        reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +\n                                                         i * dim);\n    map_after_insert[h_tmp_keys[i]] = *vec;\n  }\n\n  size_t value_diff_cnt = 0;\n  for (auto& it : map_after_insert) {\n    test_util::ValueArray<V, dim>& vec = map_after_insert.at(it.first);\n    for (size_t j = 0; j < dim; j++) {\n      if (vec[j] != static_cast<float>(it.first * 0.00001)) {\n        ++value_diff_cnt;\n        break;\n      }\n    }\n  }\n  ASSERT_EQ(value_diff_cnt, 0);\n  std::cout << \"Check find_or_insert behavior got \"\n            << \"value_diff_cnt: \" << value_diff_cnt\n            << \", while table_size_before: \" << table_size_before\n            << \", while table_size_after: \" << table_size_after\n            << \", while len: \" << len << std::endl;\n\n  CUDA_CHECK(cudaFreeAsync(d_tmp_keys, stream));\n  CUDA_CHECK(cudaFreeAsync(d_tmp_values, stream));\n  CUDA_CHECK(cudaFreeAsync(d_tmp_scores, stream));\n  free(h_tmp_keys);\n  free(h_tmp_values);\n  free(h_tmp_scores);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n}\n\nvoid test_find_or_insert_values_check(size_t max_hbm_for_vectors) {\n  const size_t U = 524288;\n  const size_t init_capacity = 1024;\n  const size_t B = 524288 + 13;\n  constexpr size_t dim = 64;\n\n  TableOptions opt;\n\n  opt.max_capacity = U;\n  opt.init_capacity = init_capacity;\n  opt.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;\n  opt.dim = 64;\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  std::unique_ptr<Table> table = std::make_unique<Table>();\n  table->init(opt);\n\n  test_util::KVMSBuffer<K, V, S> data_buffer;\n  data_buffer.Reserve(B, dim, stream);\n\n  size_t offset = 0;\n  S score = 0;\n  for (int i = 0; i < 20; i++) {\n    test_util::create_random_keys<K, S, V, dim>(\n        
data_buffer.keys_ptr(false), data_buffer.scores_ptr(false),\n        data_buffer.values_ptr(false), (int)B, B * 16);\n    data_buffer.SyncData(true, stream);\n\n    CheckFindOrInsertValues<K, V, S, Table, dim>(\n        table.get(), data_buffer.keys_ptr(), data_buffer.values_ptr(),\n        data_buffer.scores_ptr(), B, stream);\n\n    offset += B;\n    score += 1;\n  }\n}\n\nvoid test_duplicated_keys(size_t max_hbm_for_vectors, size_t key_start = 0) {\n  constexpr uint64_t INIT_CAPACITY = 64 * 1024 * 1024UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t KEY_NUM = 1024UL;\n  constexpr uint64_t TEST_TIMES = 3;\n\n  K* h_keys;\n  S* h_scores;\n  V* h_vectors;\n  bool* h_found;\n\n  TableOptions options;\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));\n\n  CUDA_CHECK(cudaMemset(h_keys, 1, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n\n  K* d_keys;\n  S* d_scores = nullptr;\n  V* d_vectors;\n  V* d_new_vectors;\n  bool* d_found;\n\n  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_new_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n\n  CUDA_CHECK(\n      cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n  
CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),\n                        cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors, KEY_NUM * sizeof(V) * options.dim,\n                        cudaMemcpyHostToDevice));\n\n  CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  uint64_t total_size = 0;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    {\n      V** d_vectors_ptr = nullptr;\n      bool* d_found;\n      CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n\n      table->find_or_insert(KEY_NUM, d_keys, d_vectors_ptr, d_found, d_scores,\n                            stream, false);\n      test_util::read_or_write_ptr(d_vectors_ptr, d_vectors, d_found,\n                                   options.dim, KEY_NUM, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      CUDA_CHECK(cudaFree(d_vectors_ptr));\n      CUDA_CHECK(cudaFree(d_found));\n    }\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 1);\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFreeHost(h_keys));\n  CUDA_CHECK(cudaFreeHost(h_scores));\n  CUDA_CHECK(cudaFreeHost(h_found));\n\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_vectors));\n  CUDA_CHECK(cudaFree(d_new_vectors));\n  CUDA_CHECK(cudaFree(d_found));\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nTEST(FindOrInsertPtrTest, test_export_batch_if) {\n  test_export_batch_if(16);\n  test_export_batch_if(0, 33);\n}\nTEST(FindOrInsertPtrTest, 
test_find_or_insert_multi_threads) {\n  test_find_or_insert_multi_threads(16, 0.25f, 0.125f);\n  test_find_or_insert_multi_threads(16, 0.375f, 0.125f);\n  test_find_or_insert_multi_threads(0, 0.25f, 0.125f);\n  test_find_or_insert_multi_threads(0, 0.375f, 0.125f);\n}\nTEST(FindOrInsertPtrTest, test_basic) {\n  test_basic(16, 3);\n  test_basic(0);\n}\nTEST(FindOrInsertPtrTest, test_basic_when_full) {\n  test_basic_when_full(16, 4);\n  test_basic_when_full(0);\n}\nTEST(FindOrInsertPtrTest, test_erase_if_pred) {\n  test_erase_if_pred(16);\n  test_erase_if_pred(0, 18);\n}\nTEST(FindOrInsertPtrTest, test_rehash) {\n  test_rehash(16);\n  test_rehash(0, 44);\n}\nTEST(FindOrInsertPtrTest, test_rehash_on_big_batch) {\n  test_rehash_on_big_batch(16, 23);\n  test_rehash_on_big_batch(0);\n}\nTEST(FindOrInsertPtrTest, test_dynamic_rehash_on_multi_threads) {\n  test_dynamic_rehash_on_multi_threads(16);\n  test_dynamic_rehash_on_multi_threads(0, 19);\n}\nTEST(FindOrInsertPtrTest, test_basic_for_cpu_io) {\n  test_basic_for_cpu_io();\n  test_basic_for_cpu_io(52);\n}\nTEST(FindOrInsertPtrTest, test_evict_strategy_lru_basic) {\n  test_evict_strategy_lru_basic(16);\n  test_evict_strategy_lru_basic(0, 18);\n}\nTEST(FindOrInsertPtrTest, test_evict_strategy_lfu_basic) {\n  test_evict_strategy_lfu_basic(16, 29);\n  test_evict_strategy_lfu_basic(0);\n}\nTEST(FindOrInsertPtrTest, test_evict_strategy_epochlru_basic) {\n  test_evict_strategy_epochlru_basic(16, 45);\n  test_evict_strategy_epochlru_basic(0);\n}\nTEST(FindOrInsertPtrTest, test_evict_strategy_epochlfu_basic) {\n  test_evict_strategy_epochlfu_basic(16);\n  test_evict_strategy_epochlfu_basic(0, 59);\n}\nTEST(FindOrInsertPtrTest, test_evict_strategy_customized_basic) {\n  test_evict_strategy_customized_basic(16, 38);\n  test_evict_strategy_customized_basic(0);\n}\nTEST(FindOrInsertPtrTest, test_evict_strategy_customized_advanced) {\n  test_evict_strategy_customized_advanced(16);\n  test_evict_strategy_customized_advanced(0, 
25);\n}\nTEST(FindOrInsertPtrTest, test_evict_strategy_customized_correct_rate) {\n  // TODO(rhdong): after blossom CI issue is resolved, the skip logic.\n  const bool skip_hmem_check = (nullptr != std::getenv(\"IS_BLOSSOM_CI\"));\n  test_evict_strategy_customized_correct_rate(16, 16);\n  if (!skip_hmem_check) {\n    test_evict_strategy_customized_correct_rate(0);\n  } else {\n    std::cout << \"The HMEM check is skipped in blossom CI!\" << std::endl;\n  }\n}\nTEST(FindOrInsertPtrTest, test_find_or_insert_values_check) {\n  test_find_or_insert_values_check(16);\n  // TODO(rhdong): Add back when diff error issue fixed in hybrid mode.\n  test_find_or_insert_values_check(0);\n}\nTEST(FindOrInsertPtrTest, test_duplicated_keys) {\n  test_duplicated_keys(16, 39);\n  test_duplicated_keys(0);\n}\n"
  },
  {
    "path": "tests/find_or_insert_test.cc.cu",
    "content": "/*\n * Copyright (c) 2023, NVIDIA CORPORATION.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n/*\n * test APIs: find_or_insert and assign,\n * move insert operation from `insert_or_assign` to `find`.\n */\n\n#include <gtest/gtest.h>\n#include <math.h>\n#include <stdio.h>\n#include <stdlib.h>\n#include <algorithm>\n#include <array>\n#include <iostream>\n#include <thread>\n#include <vector>\n#include \"merlin_hashtable.cuh\"\n#include \"test_util.cuh\"\n\nconstexpr size_t DIM = 16;\nusing K = uint64_t;\nusing V = float;\nusing S = uint64_t;\nusing EvictStrategy = nv::merlin::EvictStrategy;\nusing TableOptions = nv::merlin::HashTableOptions;\n\ntemplate <class K, class S>\nstruct EraseIfPredFunctor {\n  __forceinline__ __device__ bool operator()(const K& key, S& score,\n                                             const K& pattern,\n                                             const S& threshold) {\n    return ((key & 0x7f > pattern) && (score > threshold));\n  }\n};\n\ntemplate <class K, class S>\nstruct ExportIfPredFunctor {\n  __forceinline__ __device__ bool operator()(const K& key, S& score,\n                                             const K& pattern,\n                                             const S& threshold) {\n    return score > threshold;\n  }\n};\n\nvoid test_basic(size_t max_hbm_for_vectors, int key_start = 0) {\n  constexpr uint64_t INIT_CAPACITY = 64 * 1024 * 1024UL;\n  constexpr uint64_t MAX_CAPACITY = 
INIT_CAPACITY;\n  constexpr uint64_t KEY_NUM = 1 * 1024 * 1024UL;\n  constexpr uint64_t TEST_TIMES = 1;\n\n  K* h_keys;\n  S* h_scores;\n  V* h_vectors;\n  bool* h_found;\n\n  TableOptions options;\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));\n\n  CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n\n  test_util::create_random_keys<K, S, V, DIM>(h_keys, h_scores, h_vectors,\n                                              KEY_NUM);\n\n  K* d_keys;\n  S* d_scores = nullptr;\n  V* d_vectors;\n  V* d_new_vectors;\n  bool* d_found;\n  size_t dump_counter = 0;\n\n  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_new_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n\n  CUDA_CHECK(\n      cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),\n                        cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors, KEY_NUM * sizeof(V) * options.dim,\n                        cudaMemcpyHostToDevice));\n\n  CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  uint64_t total_size = 0;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    
std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    table->find_or_insert(KEY_NUM, d_keys, d_vectors, d_scores, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, KEY_NUM);\n\n    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n    CUDA_CHECK(cudaMemset(d_scores, 0, KEY_NUM * sizeof(S)));\n    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n\n    {\n      V** d_vectors_ptr = nullptr;\n      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n      table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, d_scores, stream);\n      test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,\n                               stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      CUDA_CHECK(cudaFree(d_vectors_ptr));\n    }\n\n    int found_num = 0;\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n\n    for (int i = 0; i < KEY_NUM; i++) {\n      if (h_found[i]) found_num++;\n      ASSERT_EQ(h_scores[i], h_keys[i]);\n      for (int j = 0; j < options.dim; j++) {\n        ASSERT_EQ(h_vectors[i * options.dim + j],\n                  static_cast<float>(h_keys[i] * 0.00001));\n      }\n    }\n    ASSERT_EQ(found_num, KEY_NUM);\n\n    CUDA_CHECK(cudaMemset(d_new_vectors, 2, KEY_NUM * sizeof(V) * options.dim));\n    table->assign(KEY_NUM, reinterpret_cast<const K*>(d_keys),\n            
      reinterpret_cast<const float*>(d_new_vectors),\n                  reinterpret_cast<const S*>(d_scores), stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, KEY_NUM);\n\n    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n    CUDA_CHECK(cudaMemset(d_new_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    {\n      V** d_vectors_ptr = nullptr;\n      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n      test_util::array2ptr(d_vectors_ptr, d_new_vectors, options.dim, KEY_NUM,\n                           stream);\n      table->find(KEY_NUM, d_keys, d_new_vectors, d_found, d_scores, stream);\n      test_util::read_from_ptr(d_vectors_ptr, d_new_vectors, options.dim,\n                               KEY_NUM, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      CUDA_CHECK(cudaFree(d_vectors_ptr));\n    }\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_new_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n    found_num = 0;\n    uint32_t i_value = 0x2020202;\n    for (int i = 0; i < KEY_NUM; i++) {\n      if (h_found[i]) found_num++;\n      for (int j = 0; j < options.dim; j++) {\n        ASSERT_EQ(h_vectors[i * options.dim + j],\n                  *(reinterpret_cast<float*>(&i_value)));\n      }\n    }\n    ASSERT_EQ(found_num, KEY_NUM);\n\n    table->accum_or_assign(KEY_NUM, d_keys, d_vectors, d_found, d_scores,\n                           stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, KEY_NUM);\n\n    table->erase(KEY_NUM >> 1, d_keys, stream);\n    
size_t total_size_after_erase = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size_after_erase, total_size >> 1);\n\n    table->clear(stream);\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    table->find_or_insert(KEY_NUM, d_keys, d_vectors, d_scores, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, KEY_NUM);\n\n    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n    CUDA_CHECK(cudaMemset(d_scores, 0, KEY_NUM * sizeof(S)));\n    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n\n    {\n      V** d_vectors_ptr = nullptr;\n      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n      table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, d_scores, stream);\n      test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,\n                               stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      CUDA_CHECK(cudaFree(d_vectors_ptr));\n    }\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    found_num = 0;\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n\n    for (int i = 0; i < KEY_NUM; i++) {\n      if (h_found[i]) found_num++;\n      ASSERT_EQ(h_scores[i], h_keys[i]);\n      for (int j = 0; j < options.dim; j++) {\n        ASSERT_EQ(h_vectors[i * options.dim + j],\n                  static_cast<float>(h_keys[i] * 0.00001));\n      }\n    }\n    ASSERT_EQ(found_num, KEY_NUM);\n\n    
CUDA_CHECK(cudaMemset(d_keys, 0, KEY_NUM * sizeof(K)));\n    CUDA_CHECK(cudaMemset(d_scores, 0, KEY_NUM * sizeof(S)));\n    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    dump_counter = table->export_batch(table->capacity(), 0, d_keys, d_vectors,\n                                       d_scores, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    ASSERT_EQ(dump_counter, KEY_NUM);\n    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyDeviceToHost));\n    for (int i = 0; i < KEY_NUM; i++) {\n      ASSERT_EQ(h_scores[i], h_keys[i]);\n      for (int j = 0; j < options.dim; j++) {\n        ASSERT_EQ(h_vectors[i * options.dim + j],\n                  static_cast<float>(h_keys[i] * 0.00001));\n      }\n    }\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFreeHost(h_keys));\n  CUDA_CHECK(cudaFreeHost(h_scores));\n  CUDA_CHECK(cudaFreeHost(h_vectors));\n  CUDA_CHECK(cudaFreeHost(h_found));\n\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_vectors));\n  CUDA_CHECK(cudaFree(d_new_vectors));\n  CUDA_CHECK(cudaFree(d_found));\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_basic_when_full(size_t max_hbm_for_vectors, int key_start = 0) {\n  constexpr uint64_t INIT_CAPACITY = 1 * 1024 * 1024UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t KEY_NUM = INIT_CAPACITY;\n  constexpr uint64_t TEST_TIMES = 1;\n\n  K* h_keys;\n  S* h_scores;\n  V* h_vectors;\n  bool* h_found;\n\n  TableOptions options;\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  
options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));\n\n  CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n\n  test_util::create_random_keys<K, S, V, DIM>(h_keys, h_scores, h_vectors,\n                                              KEY_NUM);\n\n  K* d_keys;\n  S* d_scores = nullptr;\n  V* d_vectors;\n  V* d_def_val;\n  bool* d_found;\n\n  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_def_val, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n\n  CUDA_CHECK(\n      cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),\n                        cudaMemcpyHostToDevice));\n\n  CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors, KEY_NUM * sizeof(V) * options.dim,\n                        cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemset(d_def_val, 2, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  uint64_t total_size = 0;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    table->find_or_insert(KEY_NUM, d_keys, 
d_vectors, d_scores, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    uint64_t total_size_after_insert = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    CUDA_CHECK(cudaMemset(d_def_val, 0, KEY_NUM * sizeof(V) * options.dim));\n    table->find(KEY_NUM, d_keys, d_def_val, d_found, nullptr, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    int found_num = 0;\n\n    CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));\n    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_def_val,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n    for (int i = 0; i < KEY_NUM; i++) {\n      if (h_found[i]) {\n        found_num++;\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors[i * options.dim + j],\n                    static_cast<float>(h_keys[i] * 0.00001));\n        }\n      }\n    }\n    ASSERT_EQ(total_size_after_insert, found_num);\n\n    table->erase(KEY_NUM, d_keys, stream);\n    size_t total_size_after_erase = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size_after_erase, 0);\n\n    table->find_or_insert(KEY_NUM, d_keys, d_vectors, d_scores, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    uint64_t total_size_after_reinsert = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size_after_insert, total_size_after_reinsert);\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,\n                        cudaMemcpyDeviceToHost));\n\n  CUDA_CHECK(cudaFreeHost(h_keys));\n  CUDA_CHECK(cudaFreeHost(h_scores));\n  CUDA_CHECK(cudaFreeHost(h_vectors));\n  
CUDA_CHECK(cudaFreeHost(h_found));\n\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_vectors));\n  CUDA_CHECK(cudaFree(d_def_val));\n  CUDA_CHECK(cudaFree(d_found));\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_erase_if_pred(size_t max_hbm_for_vectors, int key_start = 0) {\n  constexpr uint64_t INIT_CAPACITY = 256UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t KEY_NUM = 128UL;\n  constexpr uint64_t TEST_TIMES = 1;\n  constexpr uint64_t BUCKET_MAX_SIZE = 128;\n\n  K* h_keys;\n  S* h_scores;\n  V* h_vectors;\n  bool* h_found;\n\n  TableOptions options;\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  std::unique_ptr<Table> table = std::make_unique<Table>();\n  table->init(options);\n\n  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));\n\n  K* d_keys;\n  S* d_scores = nullptr;\n  V* d_vectors;\n  bool* d_found;\n\n  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  uint64_t total_size = 0;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n        h_keys, h_scores, h_vectors, KEY_NUM, INIT_CAPACITY);\n    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),\n                       
   cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    table->find_or_insert(KEY_NUM, d_keys, d_vectors, d_scores, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n    K pattern = 100;\n    S threshold = 0;\n    size_t erase_num = table->template erase_if<EraseIfPredFunctor>(\n        pattern, threshold, stream);\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ((erase_num + total_size), BUCKET_MAX_SIZE);\n\n    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n\n    {\n      V** d_vectors_ptr = nullptr;\n      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n      table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, d_scores, stream);\n      test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,\n                               stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      CUDA_CHECK(cudaFree(d_vectors_ptr));\n    }\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    int found_num = 0;\n\n    CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));\n    CUDA_CHECK(cudaMemset(h_scores, 0, KEY_NUM * sizeof(S)));\n    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_scores, 
d_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n    for (int i = 0; i < KEY_NUM; i++) {\n      if (h_found[i]) {\n        found_num++;\n        ASSERT_EQ(h_scores[i], h_keys[i]);\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors[i * options.dim + j],\n                    static_cast<float>(h_keys[i] * 0.00001));\n        }\n      }\n    }\n    ASSERT_EQ(found_num, (BUCKET_MAX_SIZE - erase_num));\n\n    table->clear(stream);\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,\n                        cudaMemcpyDeviceToHost));\n\n  CUDA_CHECK(cudaFreeHost(h_keys));\n  CUDA_CHECK(cudaFreeHost(h_scores));\n  CUDA_CHECK(cudaFreeHost(h_vectors));\n  CUDA_CHECK(cudaFreeHost(h_found));\n\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_vectors));\n  CUDA_CHECK(cudaFree(d_found));\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_rehash(size_t max_hbm_for_vectors, int key_start = 0) {\n  constexpr uint64_t BUCKET_MAX_SIZE = 128ul;\n  constexpr uint64_t INIT_CAPACITY = BUCKET_MAX_SIZE;\n  constexpr uint64_t MAX_CAPACITY = 4 * INIT_CAPACITY;\n  constexpr uint64_t KEY_NUM = BUCKET_MAX_SIZE * 2;\n  constexpr uint64_t TEST_TIMES = 100;\n  K* h_keys;\n  S* h_scores;\n  V* h_vectors;\n  bool* h_found;\n\n  TableOptions options;\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_bucket_size = BUCKET_MAX_SIZE;\n  options.max_hbm_for_vectors = 
nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));\n\n  K* d_keys;\n  S* d_scores = nullptr;\n  V* d_vectors;\n  bool* d_found;\n  size_t dump_counter = 0;\n\n  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  uint64_t total_size = 0;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n    test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n        h_keys, h_scores, h_vectors, KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE);\n    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    table->find_or_insert(KEY_NUM, d_keys, d_vectors, d_scores, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaDeviceSynchronize());\n    ASSERT_EQ(total_size, KEY_NUM);\n\n    dump_counter = 
table->export_batch(table->capacity(), 0, d_keys, d_vectors,\n                                       d_scores, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(dump_counter, KEY_NUM);\n\n    table->reserve(MAX_CAPACITY, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(table->capacity(), MAX_CAPACITY);\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, KEY_NUM);\n\n    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    {\n      V** d_vectors_ptr = nullptr;\n      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, BUCKET_MAX_SIZE * sizeof(V*)));\n      table->find(BUCKET_MAX_SIZE, d_keys, d_vectors_ptr, d_found, d_scores,\n                  stream);\n      test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim,\n                               BUCKET_MAX_SIZE, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      CUDA_CHECK(cudaFree(d_vectors_ptr));\n    }\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    int found_num = 0;\n\n    CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));\n    CUDA_CHECK(cudaMemset(h_scores, 0, KEY_NUM * sizeof(S)));\n    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n    for (int i = 0; i < BUCKET_MAX_SIZE; i++) {\n      if (h_found[i]) {\n        found_num++;\n        ASSERT_EQ(h_scores[i], h_keys[i]);\n        for (int j = 0; j < options.dim; 
j++) {\n          ASSERT_EQ(h_vectors[i * options.dim + j],\n                    static_cast<float>(h_keys[i] * 0.00001));\n        }\n      }\n    }\n    ASSERT_EQ(found_num, BUCKET_MAX_SIZE);\n\n    table->clear(stream);\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,\n                        cudaMemcpyDeviceToHost));\n\n  CUDA_CHECK(cudaFreeHost(h_keys));\n  CUDA_CHECK(cudaFreeHost(h_scores));\n  CUDA_CHECK(cudaFreeHost(h_vectors));\n  CUDA_CHECK(cudaFreeHost(h_found));\n\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_vectors));\n  CUDA_CHECK(cudaFree(d_found));\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_rehash_on_big_batch(size_t max_hbm_for_vectors, int key_start = 0) {\n  constexpr uint64_t INIT_CAPACITY = 1024;\n  constexpr uint64_t MAX_CAPACITY = 16 * 1024;\n  constexpr uint64_t INIT_KEY_NUM = 1024;\n  constexpr uint64_t KEY_NUM = 2048;\n  K* h_keys;\n  S* h_scores;\n  V* h_vectors;\n  bool* h_found;\n\n  TableOptions options;\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_bucket_size = 128;\n  options.max_load_factor = 0.6;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));\n\n  K* d_keys;\n  S* d_scores = nullptr;\n  V* d_vectors;\n  bool* d_found;\n  size_t dump_counter = 0;\n\n  
CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  uint64_t total_size = 0;\n  uint64_t expected_size = 0;\n  std::unique_ptr<Table> table = std::make_unique<Table>();\n  table->init(options);\n\n  test_util::create_random_keys<K, S, V, DIM>(h_keys, h_scores, h_vectors,\n                                              KEY_NUM);\n\n  CUDA_CHECK(\n      cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),\n                        cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors, KEY_NUM * sizeof(V) * options.dim,\n                        cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n\n  total_size = table->size(stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  ASSERT_EQ(total_size, 0);\n\n  table->find_or_insert(INIT_KEY_NUM, d_keys, d_vectors, d_scores, stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  expected_size = INIT_KEY_NUM;\n\n  total_size = table->size(stream);\n  CUDA_CHECK(cudaDeviceSynchronize());\n  ASSERT_EQ(total_size, expected_size);\n  ASSERT_EQ(table->capacity(), (INIT_CAPACITY * 2));\n\n  table->find_or_insert(KEY_NUM, d_keys, d_vectors, d_scores, stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  expected_size = KEY_NUM;\n\n  total_size = table->size(stream);\n  CUDA_CHECK(cudaDeviceSynchronize());\n  ASSERT_EQ(total_size, expected_size);\n  ASSERT_EQ(table->capacity(), KEY_NUM * 4);\n\n  dump_counter = table->export_batch(table->capacity(), 0, d_keys, d_vectors,\n                                     d_scores, stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  ASSERT_EQ(dump_counter, expected_size);\n\n 
 CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMemset(d_scores, 0, KEY_NUM * sizeof(S)));\n  {\n    V** d_vectors_ptr = nullptr;\n    CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n    table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, d_scores, stream);\n    test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,\n                             stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    CUDA_CHECK(cudaFree(d_vectors_ptr));\n  }\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  int found_num = 0;\n\n  CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));\n  CUDA_CHECK(cudaMemset(h_scores, 0, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(\n      cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K), cudaMemcpyDeviceToHost));\n  CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                        cudaMemcpyDeviceToHost));\n  CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),\n                        cudaMemcpyDeviceToHost));\n  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,\n                        cudaMemcpyDeviceToHost));\n  for (int i = 0; i < KEY_NUM; i++) {\n    if (h_found[i]) {\n      found_num++;\n      ASSERT_EQ(h_scores[i], h_keys[i]);\n      for (int j = 0; j < options.dim; j++) {\n        ASSERT_EQ(h_vectors[i * options.dim + j],\n                  static_cast<float>(h_keys[i] * 0.00001));\n      }\n    }\n  }\n  ASSERT_EQ(found_num, KEY_NUM);\n\n  table->clear(stream);\n  total_size = table->size(stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  ASSERT_EQ(total_size, 0);\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,\n                        cudaMemcpyDeviceToHost));\n\n  CUDA_CHECK(cudaFreeHost(h_keys));\n  CUDA_CHECK(cudaFreeHost(h_scores));\n  
CUDA_CHECK(cudaFreeHost(h_vectors));\n  CUDA_CHECK(cudaFreeHost(h_found));\n\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_vectors));\n  CUDA_CHECK(cudaFree(d_found));\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_dynamic_rehash_on_multi_threads(size_t max_hbm_for_vectors,\n                                          int key_start = 0) {\n  constexpr uint64_t BUCKET_MAX_SIZE = 128ul;\n  constexpr uint64_t INIT_CAPACITY = 4 * 1024;\n  constexpr uint64_t MAX_CAPACITY = 16 * 1024 * INIT_CAPACITY;\n  constexpr uint64_t KEY_NUM = 256;\n  constexpr uint64_t THREAD_N = 8;\n\n  std::vector<std::thread> threads;\n\n  TableOptions options;\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_load_factor = 0.50f;\n  options.max_bucket_size = BUCKET_MAX_SIZE;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;\n\n  std::shared_ptr<Table> table = std::make_shared<Table>();\n  table->init(options);\n\n  auto worker_function = [&table, KEY_NUM, options](int task_n) {\n    K* h_keys;\n    V* h_vectors;\n    bool* h_found;\n\n    size_t current_capacity = table->capacity();\n\n    CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));\n    CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));\n\n    K* d_keys;\n    V* d_vectors;\n    bool* d_found;\n\n    CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));\n    CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n\n    cudaStream_t stream;\n    CUDA_CHECK(cudaStreamCreate(&stream));\n\n    while (table->capacity() < MAX_CAPACITY) {\n      test_util::create_random_keys<K, S, V, 
DIM>(h_keys, nullptr, h_vectors,\n                                                  KEY_NUM);\n      CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),\n                            cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,\n                            KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n\n      table->find_or_insert(KEY_NUM, d_keys, d_vectors, nullptr, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n      {\n        V** d_vectors_ptr = nullptr;\n        CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n        table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, nullptr, stream);\n        test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,\n                                 stream);\n        CUDA_CHECK(cudaStreamSynchronize(stream));\n        CUDA_CHECK(cudaFree(d_vectors_ptr));\n      }\n\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      int found_num = 0;\n\n      CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));\n      CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n      CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),\n                            cudaMemcpyDeviceToHost));\n      CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                            cudaMemcpyDeviceToHost));\n\n      CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                            KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDeviceToHost));\n      for (int i = 0; i < KEY_NUM; i++) {\n        if (h_found[i]) {\n          found_num++;\n          for (int j = 0; j < options.dim; j++) {\n            ASSERT_EQ(h_vectors[i * options.dim + j],\n                      static_cast<float>(h_keys[i] * 
0.00001));\n          }\n        }\n      }\n      ASSERT_EQ(found_num, KEY_NUM);\n      if (task_n == 0 && current_capacity != table->capacity()) {\n        std::cout << \"[test_dynamic_rehash_on_multi_threads] The capacity \"\n                     \"changed from \"\n                  << current_capacity << \" to \" << table->capacity()\n                  << std::endl;\n        current_capacity = table->capacity();\n      }\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n    }\n    CUDA_CHECK(cudaStreamDestroy(stream));\n\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n\n    CUDA_CHECK(cudaFreeHost(h_keys));\n    CUDA_CHECK(cudaFreeHost(h_found));\n    CUDA_CHECK(cudaFreeHost(h_vectors));\n\n    CUDA_CHECK(cudaFree(d_keys));\n    CUDA_CHECK(cudaFree(d_vectors));\n    CUDA_CHECK(cudaFree(d_found));\n    CUDA_CHECK(cudaDeviceSynchronize());\n\n    CudaCheckError();\n  };\n\n  for (int i = 0; i < THREAD_N; ++i)\n    threads.emplace_back(std::thread(worker_function, i));\n\n  for (auto& th : threads) {\n    th.join();\n  }\n  ASSERT_EQ(table->capacity(), MAX_CAPACITY);\n}\n\nvoid test_export_batch_if(size_t max_hbm_for_vectors, int key_start = 0) {\n  constexpr uint64_t INIT_CAPACITY = 256UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t KEY_NUM = 128UL;\n  constexpr uint64_t TEST_TIMES = 1;\n  constexpr uint64_t BUCKET_MAX_SIZE = 128ul;\n\n  K* h_keys;\n  S* h_scores;\n  V* h_vectors;\n  size_t h_dump_counter = 0;\n\n  TableOptions options;\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;\n\n  std::unique_ptr<Table> table = std::make_unique<Table>();\n  table->init(options);\n\n  
CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));\n\n  K* d_keys;\n  S* d_scores = nullptr;\n  V* d_vectors;\n  bool* d_found;\n  size_t* d_dump_counter;\n  int found_num = 0;\n  bool* h_found;\n\n  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n  CUDA_CHECK(cudaMalloc(&d_dump_counter, sizeof(size_t)));\n\n  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  uint64_t total_size = 0;\n\n  S threshold = test_util::host_nano<S>();\n  for (int i = 0; i < TEST_TIMES; i++) {\n    test_util::create_random_keys<K, S, V, DIM>(h_keys, h_scores, h_vectors,\n                                                KEY_NUM);\n\n    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyHostToDevice));\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    table->find_or_insert(KEY_NUM, d_keys, d_vectors, nullptr, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, KEY_NUM);\n\n    CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));\n    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMemcpy(h_keys, 
d_keys, KEY_NUM * sizeof(K),\n                          cudaMemcpyDeviceToHost));\n\n    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    {\n      V** d_vectors_ptr = nullptr;\n      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n      table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, nullptr, stream);\n      test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,\n                               stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      CUDA_CHECK(cudaFree(d_vectors_ptr));\n    }\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n    found_num = 0;\n    for (int i = 0; i < BUCKET_MAX_SIZE; i++) {\n      if (h_found[i]) {\n        found_num++;\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors[i * options.dim + j],\n                    static_cast<float>(h_keys[i] * 0.00001));\n        }\n      }\n    }\n    ASSERT_EQ(found_num, KEY_NUM);\n\n    K pattern = 100;\n\n    table->template export_batch_if<ExportIfPredFunctor>(\n        pattern, threshold, table->capacity(), 0, d_dump_counter, d_keys,\n        d_vectors, d_scores, stream);\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    CUDA_CHECK(cudaMemcpy(&h_dump_counter, d_dump_counter, sizeof(size_t),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyDeviceToHost));\n\n    size_t expected_export_count = 0;\n    for (int i = 0; i < KEY_NUM; i++) {\n      if (h_scores[i] > threshold) expected_export_count++;\n    }\n    ASSERT_EQ(expected_export_count, h_dump_counter);\n\n    table->template 
export_batch_if<ExportIfPredFunctor>(\n        pattern, test_util::host_nano<S>(), table->capacity(), 0,\n        d_dump_counter, d_keys, d_vectors, d_scores, stream);\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    CUDA_CHECK(cudaMemcpy(&h_dump_counter, d_dump_counter, sizeof(size_t),\n                          cudaMemcpyDeviceToHost));\n    ASSERT_EQ(0, h_dump_counter);\n\n    CUDA_CHECK(cudaMemset(h_keys, 0, KEY_NUM * sizeof(K)));\n    CUDA_CHECK(cudaMemset(h_scores, 0, KEY_NUM * sizeof(S)));\n    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n\n    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n\n    for (int i = 0; i < h_dump_counter; i++) {\n      ASSERT_GT(h_scores[i], threshold);\n      for (int j = 0; j < options.dim; j++) {\n        ASSERT_EQ(h_vectors[i * options.dim + j],\n                  static_cast<float>(h_keys[i] * 0.00001));\n      }\n    }\n\n    table->clear(stream);\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n  }\n  CUDA_CHECK(cudaDeviceSynchronize());\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,\n                        cudaMemcpyDeviceToHost));\n\n  CUDA_CHECK(cudaFreeHost(h_keys));\n  CUDA_CHECK(cudaFreeHost(h_scores));\n  CUDA_CHECK(cudaFreeHost(h_vectors));\n  CUDA_CHECK(cudaFreeHost(h_found));\n\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_vectors));\n  CUDA_CHECK(cudaFree(d_found));\n  CUDA_CHECK(cudaFree(d_dump_counter));\n  
CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_basic_for_cpu_io(int key_start = 0) {\n  constexpr uint64_t INIT_CAPACITY = 64 * 1024 * 1024UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t KEY_NUM = 1 * 1024 * 1024UL;\n  constexpr uint64_t TEST_TIMES = 1;\n\n  K* h_keys;\n  S* h_scores;\n  V* h_vectors;\n  bool* h_found;\n\n  TableOptions options;\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_hbm_for_vectors = nv::merlin::GB(0);\n  options.io_by_cpu = true;\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));\n\n  CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n\n  test_util::create_random_keys<K, S, V, DIM>(h_keys, h_scores, nullptr,\n                                              KEY_NUM);\n\n  K* d_keys;\n  S* d_scores = nullptr;\n  V* d_vectors;\n  V* d_def_val;\n  V** d_vectors_ptr;\n  bool* d_found;\n  size_t dump_counter = 0;\n\n  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_def_val, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n\n  CUDA_CHECK(\n      cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),\n                        cudaMemcpyHostToDevice));\n\n  CUDA_CHECK(cudaMemset(d_vectors, 1, KEY_NUM * 
sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMemset(d_def_val, 2, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMemset(d_vectors_ptr, 0, KEY_NUM * sizeof(V*)));\n  CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  uint64_t total_size = 0;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    table->find_or_insert(KEY_NUM, d_keys, d_vectors, d_scores, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, KEY_NUM);\n\n    CUDA_CHECK(cudaMemset(d_vectors, 2, KEY_NUM * sizeof(V) * options.dim));\n    table->assign(KEY_NUM, d_keys, d_vectors, d_scores, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, KEY_NUM);\n\n    {\n      V** d_vectors_ptr = nullptr;\n      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n      table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, nullptr, stream);\n      test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,\n                               stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      CUDA_CHECK(cudaFree(d_vectors_ptr));\n    }\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    int found_num = 0;\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n\n    for (int i = 0; i < KEY_NUM; i++) {\n      if (h_found[i]) found_num++;\n    }\n    ASSERT_EQ(found_num, KEY_NUM);\n\n    table->accum_or_assign(KEY_NUM, d_keys, d_vectors, d_found, d_scores,\n                           stream);\n  
  CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, KEY_NUM);\n\n    table->erase(KEY_NUM >> 1, d_keys, stream);\n    size_t total_size_after_erase = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size_after_erase, total_size >> 1);\n\n    table->clear(stream);\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    table->find_or_insert(KEY_NUM, d_keys, d_vectors, d_scores, stream);\n\n    dump_counter = table->export_batch(table->capacity(), 0, d_keys, d_vectors,\n                                       d_scores, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(dump_counter, KEY_NUM);\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,\n                        cudaMemcpyDeviceToHost));\n\n  CUDA_CHECK(cudaFreeHost(h_keys));\n  CUDA_CHECK(cudaFreeHost(h_scores));\n  CUDA_CHECK(cudaFreeHost(h_vectors));\n  CUDA_CHECK(cudaFreeHost(h_found));\n\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_vectors));\n  CUDA_CHECK(cudaFree(d_def_val));\n  CUDA_CHECK(cudaFree(d_vectors_ptr));\n  CUDA_CHECK(cudaFree(d_found));\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_evict_strategy_lru_basic(size_t max_hbm_for_vectors,\n                                   int key_start = 0) {\n  constexpr uint64_t BUCKET_NUM = 8UL;\n  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;\n  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;\n  constexpr uint64_t TEST_KEY_NUM = 4;\n  constexpr uint64_t TEMP_KEY_NUM =\n      (BASE_KEY_NUM > TEST_KEY_NUM) ? 
BASE_KEY_NUM : TEST_KEY_NUM;\n  constexpr uint64_t TEST_TIMES = 128;\n\n  TableOptions options;\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;\n\n  std::vector<K> h_keys_base(BASE_KEY_NUM);\n  std::vector<S> h_scores_base(BASE_KEY_NUM);\n  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_test(TEST_KEY_NUM);\n  std::vector<S> h_scores_test(TEST_KEY_NUM);\n  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_temp(TEMP_KEY_NUM);\n  std::vector<S> h_scores_temp(TEMP_KEY_NUM);\n  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);\n\n  K* d_keys_temp;\n  S* d_scores_temp = nullptr;\n  V* d_vectors_temp;\n\n  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));\n  CUDA_CHECK(\n      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));\n\n  test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),\n      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF);\n\n  test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),\n      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,\n      0xFFFFFFFFFFFFFFFD);\n\n  h_keys_test[2] = h_keys_base[72];\n  h_keys_test[3] = h_keys_base[73];\n\n  for (int i = 0; i < options.dim; i++) {\n    h_vectors_test[2 * options.dim + i] = h_vectors_base[72 * options.dim + i];\n    h_vectors_test[3 * options.dim + i] = h_vectors_base[73 * options.dim + i];\n  }\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  size_t total_size = 0;\n  size_t dump_counter = 0;\n  for 
(int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      S start_ts = test_util::host_nano<S>(stream);\n      table->find_or_insert(BASE_KEY_NUM, d_keys_temp, d_vectors_temp, nullptr,\n                            stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      S end_ts = test_util::host_nano<S>(stream);\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      std::vector<S> h_scores_temp_sorted(h_scores_temp);\n      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());\n\n      
ASSERT_GE(h_scores_temp_sorted[0], start_ts);\n      ASSERT_LE(h_scores_temp_sorted[BASE_KEY_NUM - 1], end_ts);\n      for (int i = 0; i < dump_counter; i++) {\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),\n                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),\n                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),\n                            TEST_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      S start_ts = test_util::host_nano<S>(stream);\n      table->assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp, nullptr, stream);\n      table->find_or_insert(TEST_KEY_NUM, d_keys_temp, d_vectors_temp, nullptr,\n                            stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      S end_ts = test_util::host_nano<S>(stream);\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            TEMP_KEY_NUM * sizeof(V) * options.dim,\n               
             cudaMemcpyDefault));\n\n      std::array<S, TEMP_KEY_NUM> h_scores_temp_sorted;\n      int ctr = 0;\n      for (int i = 0; i < TEMP_KEY_NUM; i++) {\n        if (h_keys_test.end() !=\n            std::find(h_keys_test.begin(), h_keys_test.end(), h_keys_temp[i])) {\n          ASSERT_GT(h_scores_temp[i], BUCKET_MAX_SIZE);\n          h_scores_temp_sorted[ctr++] = h_scores_temp[i];\n        } else {\n          ASSERT_LE(h_scores_temp[i], start_ts);\n        }\n      }\n      std::sort(h_scores_temp_sorted.begin(),\n                h_scores_temp_sorted.begin() + ctr);\n\n      ASSERT_GE(h_scores_temp_sorted[0], start_ts);\n      ASSERT_LE(h_scores_temp_sorted[ctr - 1], end_ts);\n      for (int i = 0; i < dump_counter; i++) {\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFree(d_keys_temp));\n  CUDA_CHECK(cudaFree(d_scores_temp));\n  CUDA_CHECK(cudaFree(d_vectors_temp));\n\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_evict_strategy_lfu_basic(size_t max_hbm_for_vectors,\n                                   int key_start = 0) {\n  constexpr uint64_t BUCKET_NUM = 8UL;\n  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;\n  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;\n  constexpr uint64_t TEST_KEY_NUM = 4;\n  constexpr uint64_t TEMP_KEY_NUM =\n      (BASE_KEY_NUM > TEST_KEY_NUM) ? 
BASE_KEY_NUM : TEST_KEY_NUM;\n  constexpr uint64_t TEST_TIMES = 128;\n\n  TableOptions options;\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLfu>;\n\n  std::vector<K> h_keys_base(BASE_KEY_NUM);\n  std::vector<S> h_scores_base(BASE_KEY_NUM);\n  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_test(TEST_KEY_NUM);\n  std::vector<S> h_scores_test(TEST_KEY_NUM);\n  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_temp(TEMP_KEY_NUM);\n  std::vector<S> h_scores_temp(TEMP_KEY_NUM);\n  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);\n\n  K* d_keys_temp;\n  S* d_scores_temp = nullptr;\n  V* d_vectors_temp;\n\n  int freq_range = 1000;\n\n  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));\n  CUDA_CHECK(\n      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));\n\n  test_util::create_keys_in_one_buckets_lfu<K, S, V, DIM>(\n      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),\n      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF,\n      freq_range);\n\n  test_util::create_keys_in_one_buckets_lfu<K, S, V, DIM>(\n      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),\n      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,\n      0xFFFFFFFFFFFFFFFD, freq_range);\n\n  h_keys_test[2] = h_keys_base[72];\n  h_keys_test[3] = h_keys_base[73];\n\n  h_scores_test[2] = h_keys_base[72] % freq_range;\n  h_scores_test[3] = h_keys_base[73] % freq_range;\n\n  for (int i = 0; i < options.dim; i++) {\n    h_vectors_test[2 * options.dim + i] = h_vectors_base[72 * options.dim + i];\n    h_vectors_test[3 * options.dim + i] = 
h_vectors_base[73 * options.dim + i];\n  }\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  size_t total_size = 0;\n  size_t dump_counter = 0;\n  S global_epoch = 1;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      table->find_or_insert(BASE_KEY_NUM, d_keys_temp, d_vectors_temp,\n                            d_scores_temp, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      for (int i = 0; i < dump_counter; i++) {\n        
ASSERT_EQ(h_scores_temp[i], h_keys_temp[i] % freq_range);\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n\n    {\n      global_epoch++;\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),\n                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),\n                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),\n                            TEST_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      table->assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp, d_scores_temp,\n                    stream);\n      table->find_or_insert(TEST_KEY_NUM, d_keys_temp, d_vectors_temp,\n                            d_scores_temp, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            TEMP_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      for (int i = 0; i < dump_counter; i++) {\n        bool in_base =\n            h_keys_base.end() !=\n       
     std::find(h_keys_base.begin(), h_keys_base.end(), h_keys_temp[i]);\n        bool in_test =\n            h_keys_test.end() !=\n            std::find(h_keys_test.begin(), h_keys_test.end(), h_keys_temp[i]);\n        if (in_base && in_test) {\n          ASSERT_EQ(h_scores_temp[i], (h_keys_temp[i] % freq_range) * 3);\n        } else {\n          ASSERT_EQ(h_scores_temp[i], (h_keys_temp[i] % freq_range));\n        }\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFree(d_keys_temp));\n  CUDA_CHECK(cudaFree(d_scores_temp));\n  CUDA_CHECK(cudaFree(d_vectors_temp));\n\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_evict_strategy_epochlru_basic(size_t max_hbm_for_vectors,\n                                        int key_start = 0) {\n  constexpr int RSHIFT_ON_NANO = 20;\n\n  constexpr uint64_t BUCKET_NUM = 8UL;\n  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;\n  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;\n  constexpr uint64_t TEST_KEY_NUM = 4;\n  constexpr uint64_t TEMP_KEY_NUM =\n      (BASE_KEY_NUM > TEST_KEY_NUM) ? 
BASE_KEY_NUM : TEST_KEY_NUM;\n  constexpr uint64_t TEST_TIMES = 128;\n\n  TableOptions options;\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kEpochLru>;\n\n  std::vector<K> h_keys_base(BASE_KEY_NUM);\n  std::vector<S> h_scores_base(BASE_KEY_NUM);\n  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_test(TEST_KEY_NUM);\n  std::vector<S> h_scores_test(TEST_KEY_NUM);\n  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_temp(TEMP_KEY_NUM);\n  std::vector<S> h_scores_temp(TEMP_KEY_NUM);\n  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);\n\n  K* d_keys_temp;\n  S* d_scores_temp = nullptr;\n  V* d_vectors_temp;\n\n  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));\n  CUDA_CHECK(\n      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));\n\n  test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),\n      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF);\n\n  test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),\n      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,\n      0xFFFFFFFFFFFFFFFD);\n\n  h_keys_test[2] = h_keys_base[72];\n  h_keys_test[3] = h_keys_base[73];\n\n  for (int i = 0; i < options.dim; i++) {\n    h_vectors_test[2 * options.dim + i] = h_vectors_base[72 * options.dim + i];\n    h_vectors_test[3 * options.dim + i] = h_vectors_base[73 * options.dim + i];\n  }\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  size_t total_size = 0;\n  size_t dump_counter = 0;\n  S 
global_epoch = 1;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      S start_ts =\n          (test_util::host_nano<S>(stream) >> RSHIFT_ON_NANO) & 0xFFFFFFFF;\n      table->set_global_epoch(global_epoch);\n      table->find_or_insert(BASE_KEY_NUM, d_keys_temp, d_vectors_temp, nullptr,\n                            stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      S end_ts =\n          (test_util::host_nano<S>(stream) >> RSHIFT_ON_NANO) & 0xFFFFFFFF;\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n   
   std::vector<S> h_scores_temp_sorted(h_scores_temp);\n      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());\n\n      ASSERT_GE(h_scores_temp_sorted[0], (global_epoch << 32 | start_ts));\n      ASSERT_LE(h_scores_temp_sorted[BASE_KEY_NUM - 1],\n                (global_epoch << 32 | end_ts));\n      for (int i = 0; i < dump_counter; i++) {\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n\n    {\n      global_epoch++;\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),\n                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),\n                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),\n                            TEST_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      S start_ts =\n          (test_util::host_nano<S>(stream) >> RSHIFT_ON_NANO) & 0xFFFFFFFF;\n      table->set_global_epoch(global_epoch);\n      table->assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp, nullptr, stream);\n      table->find_or_insert(TEST_KEY_NUM, d_keys_temp, d_vectors_temp, nullptr,\n                            stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      S end_ts =\n          (test_util::host_nano<S>(stream) >> RSHIFT_ON_NANO) & 0xFFFFFFFF;\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n          
                  TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            TEMP_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      std::vector<S> h_scores_temp_sorted;\n      h_scores_temp_sorted.reserve(TEMP_KEY_NUM);\n      for (int i = 0; i < TEMP_KEY_NUM; i++) {\n        if (h_keys_test.end() !=\n            std::find(h_keys_test.begin(), h_keys_test.end(), h_keys_temp[i])) {\n          ASSERT_GE(h_scores_temp[i], (global_epoch << 32 | start_ts));\n          h_scores_temp_sorted.push_back(h_scores_temp[i]);\n        } else {\n          ASSERT_LE(h_scores_temp[i], (global_epoch << 32 | start_ts));\n        }\n      }\n      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());\n\n      if (!h_scores_temp_sorted.empty()) {\n        ASSERT_GE(h_scores_temp_sorted[0], (global_epoch << 32 | start_ts));\n        ASSERT_LE(h_scores_temp_sorted[h_scores_temp_sorted.size() - 1],\n                  (global_epoch << 32 | end_ts));\n      }\n      for (int i = 0; i < dump_counter; i++) {\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFree(d_keys_temp));\n  CUDA_CHECK(cudaFree(d_scores_temp));\n  CUDA_CHECK(cudaFree(d_vectors_temp));\n\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors,\n                                        int key_start = 0) {\n  constexpr uint64_t BUCKET_NUM = 8UL;\n  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;\n  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * 
BUCKET_MAX_SIZE;  // 1024UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;\n  constexpr uint64_t TEST_KEY_NUM = 4;\n  constexpr uint64_t TEMP_KEY_NUM =\n      (BASE_KEY_NUM > TEST_KEY_NUM) ? BASE_KEY_NUM : TEST_KEY_NUM;\n  constexpr uint64_t TEST_TIMES = 128;\n\n  TableOptions options;\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kEpochLfu>;\n\n  std::vector<K> h_keys_base(BASE_KEY_NUM);\n  std::vector<S> h_scores_base(BASE_KEY_NUM);\n  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_test(TEST_KEY_NUM);\n  std::vector<S> h_scores_test(TEST_KEY_NUM);\n  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_temp(TEMP_KEY_NUM);\n  std::vector<S> h_scores_temp(TEMP_KEY_NUM);\n  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);\n\n  K* d_keys_temp;\n  S* d_scores_temp = nullptr;\n  V* d_vectors_temp;\n\n  int freq_range = 1000;\n\n  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));\n  CUDA_CHECK(\n      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));\n\n  test_util::create_keys_in_one_buckets_lfu<K, S, V, DIM>(\n      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),\n      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF,\n      freq_range);\n\n  test_util::create_keys_in_one_buckets_lfu<K, S, V, DIM>(\n      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),\n      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,\n      0xFFFFFFFFFFFFFFFD, freq_range);\n\n  // Simulate overflow of low 32bits.\n  h_scores_base[71] = 
static_cast<S>(std::numeric_limits<uint32_t>::max() -\n                                     static_cast<uint32_t>(1));\n\n  h_keys_test[1] = h_keys_base[71];\n  h_keys_test[2] = h_keys_base[72];\n  h_keys_test[3] = h_keys_base[73];\n\n  h_scores_test[1] = h_scores_base[71];\n  h_scores_test[2] = h_keys_base[72] % freq_range;\n  h_scores_test[3] = h_keys_base[73] % freq_range;\n\n  for (int i = 0; i < options.dim; i++) {\n    h_vectors_test[1 * options.dim + i] = h_vectors_base[71 * options.dim + i];\n    h_vectors_test[2 * options.dim + i] = h_vectors_base[72 * options.dim + i];\n    h_vectors_test[3 * options.dim + i] = h_vectors_base[73 * options.dim + i];\n  }\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  size_t total_size = 0;\n  size_t dump_counter = 0;\n  S global_epoch = 1;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      table->set_global_epoch(global_epoch);\n      table->find_or_insert(BASE_KEY_NUM, d_keys_temp, d_vectors_temp,\n                            d_scores_temp, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, 
d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      for (int i = 0; i < dump_counter; i++) {\n        if (h_keys_temp[i] == h_keys_base[71]) {\n          S expected_score = test_util::make_expected_score_for_epochlfu<S>(\n              global_epoch, h_scores_base[71]);\n          ASSERT_EQ(h_scores_temp[i], expected_score);\n        } else {\n          S expected_score = test_util::make_expected_score_for_epochlfu<S>(\n              global_epoch, (h_keys_temp[i] % freq_range));\n          ASSERT_EQ(h_scores_temp[i], expected_score);\n        }\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n\n    {\n      global_epoch++;\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),\n                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),\n                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),\n                            TEST_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      table->set_global_epoch(global_epoch);\n      table->assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp, d_scores_temp,\n                    stream);\n      
table->find_or_insert(TEST_KEY_NUM, d_keys_temp, d_vectors_temp,\n                            d_scores_temp, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            TEMP_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      ASSERT_TRUE(h_keys_temp.end() != std::find(h_keys_temp.begin(),\n                                                 h_keys_temp.end(),\n                                                 h_keys_base[71]));\n\n      for (int i = 0; i < dump_counter; i++) {\n        bool in_base =\n            h_keys_base.end() !=\n            std::find(h_keys_base.begin(), h_keys_base.end(), h_keys_temp[i]);\n        bool in_test =\n            h_keys_test.end() !=\n            std::find(h_keys_test.begin(), h_keys_test.end(), h_keys_temp[i]);\n\n        if (in_base && in_test) {\n          if (h_keys_temp[i] == h_keys_base[71]) {\n            S expected_score = test_util::make_expected_score_for_epochlfu<S>(\n                global_epoch, h_scores_base[71] * 2);\n            ASSERT_EQ(h_scores_temp[i], expected_score);\n          } else {\n            S expected_score = test_util::make_expected_score_for_epochlfu<S>(\n                global_epoch,\n                (h_keys_temp[i] % freq_range) * 3);  
// update score when found.\n            ASSERT_EQ(h_scores_temp[i], expected_score);\n          }\n        } else {\n          if (h_keys_temp[i] == h_keys_base[71]) {\n            S expected_score = test_util::make_expected_score_for_epochlfu<S>(\n                global_epoch - static_cast<S>(in_base), h_scores_base[71]);\n            ASSERT_EQ(h_scores_temp[i], expected_score);\n          } else {\n            S expected_score = test_util::make_expected_score_for_epochlfu<S>(\n                global_epoch - static_cast<S>(in_base),\n                (h_keys_temp[i] % freq_range));\n\n            ASSERT_EQ(h_scores_temp[i], expected_score);\n          }\n        }\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFree(d_keys_temp));\n  CUDA_CHECK(cudaFree(d_scores_temp));\n  CUDA_CHECK(cudaFree(d_vectors_temp));\n\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_evict_strategy_customized_basic(size_t max_hbm_for_vectors,\n                                          int key_start = 0) {\n  constexpr uint64_t BUCKET_NUM = 8UL;\n  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;\n  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;\n  constexpr uint64_t TEST_KEY_NUM = 128;\n  constexpr uint64_t TEMP_KEY_NUM =\n      (BASE_KEY_NUM > TEST_KEY_NUM) ? 
BASE_KEY_NUM : TEST_KEY_NUM;\n  constexpr uint64_t TEST_TIMES = 128;\n\n  TableOptions options;\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  std::vector<K> h_keys_base(BASE_KEY_NUM);\n  std::vector<S> h_scores_base(BASE_KEY_NUM);\n  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_test(TEST_KEY_NUM);\n  std::vector<S> h_scores_test(TEST_KEY_NUM);\n  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_temp(TEMP_KEY_NUM);\n  std::vector<S> h_scores_temp(TEMP_KEY_NUM);\n  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);\n\n  K* d_keys_temp;\n  S* d_scores_temp = nullptr;\n  V* d_vectors_temp;\n\n  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));\n  CUDA_CHECK(\n      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));\n\n  test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),\n      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF);\n\n  const S base_score_start = 1000;\n  for (int i = 0; i < BASE_KEY_NUM; i++) {\n    h_scores_base[i] = base_score_start + i;\n  }\n\n  test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),\n      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,\n      0xFFFFFFFFFFFFFFFD);\n  const S test_score_start = base_score_start + BASE_KEY_NUM;\n  for (int i = 0; i < TEST_KEY_NUM; i++) {\n    h_scores_test[i] = test_score_start + i;\n  }\n  for (int i = 64; i < TEST_KEY_NUM; i++) {\n    h_keys_test[i] = h_keys_base[i];\n    for (int j = 0; j < options.dim; 
j++) {\n      h_vectors_test[i * options.dim + j] = h_vectors_base[i * options.dim + j];\n    }\n  }\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  size_t total_size = 0;\n  size_t dump_counter = 0;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      table->find_or_insert(BASE_KEY_NUM, d_keys_temp, d_vectors_temp,\n                            d_scores_temp, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      std::vector<S> 
h_scores_temp_sorted(h_scores_temp);\n      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());\n\n      auto expected_range = test_util::range<S, TEMP_KEY_NUM>(base_score_start);\n      ASSERT_TRUE(std::equal(h_scores_temp_sorted.begin(),\n                             h_scores_temp_sorted.end(),\n                             expected_range.begin()));\n      for (int i = 0; i < dump_counter; i++) {\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),\n                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),\n                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),\n                            TEST_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      table->assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp, d_scores_temp,\n                    stream);\n      table->find_or_insert(TEST_KEY_NUM, d_keys_temp, d_vectors_temp,\n                            d_scores_temp, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            
TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            TEMP_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      std::vector<S> h_scores_temp_sorted(h_scores_temp);\n      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());\n\n      auto expected_range_test =\n          test_util::range<S, TEST_KEY_NUM>(test_score_start);\n      ASSERT_TRUE(std::equal(h_scores_temp_sorted.begin(),\n                             h_scores_temp_sorted.end(),\n                             expected_range_test.begin()));\n      for (int i = 0; i < dump_counter; i++) {\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFree(d_keys_temp));\n  CUDA_CHECK(cudaFree(d_scores_temp));\n  CUDA_CHECK(cudaFree(d_vectors_temp));\n\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_evict_strategy_customized_advanced(size_t max_hbm_for_vectors,\n                                             int key_start = 0) {\n  constexpr uint64_t BUCKET_NUM = 8UL;\n  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;\n  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;\n  constexpr uint64_t TEST_KEY_NUM = 8;\n  constexpr uint64_t TEMP_KEY_NUM =\n      (BASE_KEY_NUM > TEST_KEY_NUM) ? 
BASE_KEY_NUM : TEST_KEY_NUM;\n  constexpr uint64_t TEST_TIMES = 256;\n\n  TableOptions options;\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  std::vector<K> h_keys_base(BASE_KEY_NUM);\n  std::vector<S> h_scores_base(BASE_KEY_NUM);\n  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_test(TEST_KEY_NUM);\n  std::vector<S> h_scores_test(TEST_KEY_NUM);\n  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_temp(TEMP_KEY_NUM);\n  std::vector<S> h_scores_temp(TEMP_KEY_NUM);\n  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);\n\n  K* d_keys_temp;\n  S* d_scores_temp = nullptr;\n  V* d_vectors_temp;\n\n  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));\n  CUDA_CHECK(\n      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));\n\n  test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),\n      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF);\n\n  const S base_score_start = 1000;\n  for (int i = 0; i < BASE_KEY_NUM; i++) {\n    h_scores_base[i] = base_score_start + i;\n  }\n\n  test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),\n      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,\n      0xFFFFFFFFFFFFFFFD);\n\n  h_keys_test[4] = h_keys_base[72];\n  h_keys_test[5] = h_keys_base[73];\n  h_keys_test[6] = h_keys_base[74];\n  h_keys_test[7] = h_keys_base[75];\n\n  // replace four new keys to lower scores, would not be inserted.\n  h_scores_test[0] = 20;\n  h_scores_test[1] = 78;\n  
h_scores_test[2] = 97;\n  h_scores_test[3] = 98;\n\n  // replace three exist keys to new scores, just refresh the score for them.\n  h_scores_test[4] = 99;\n  h_scores_test[5] = 1010;\n  h_scores_test[6] = 1020;\n  h_scores_test[7] = 1035;\n\n  for (int i = 4; i < TEST_KEY_NUM; i++) {\n    for (int j = 0; j < options.dim; j++) {\n      h_vectors_test[i * options.dim + j] =\n          static_cast<V>(h_keys_test[i] * 0.00001);\n    }\n  }\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  size_t total_size = 0;\n  size_t dump_counter = 0;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      table->find_or_insert(BASE_KEY_NUM, d_keys_temp, d_vectors_temp,\n                            d_scores_temp, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      
CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      std::vector<S> h_scores_temp_sorted(h_scores_temp);\n      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());\n\n      auto expected_range = test_util::range<S, TEMP_KEY_NUM>(base_score_start);\n      ASSERT_TRUE(std::equal(h_scores_temp_sorted.begin(),\n                             h_scores_temp_sorted.end(),\n                             expected_range.begin()));\n      for (int i = 0; i < dump_counter; i++) {\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),\n                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),\n                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),\n                            TEST_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      table->assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp, d_scores_temp,\n                    stream);\n      table->find_or_insert(TEST_KEY_NUM, d_keys_temp, d_vectors_temp,\n                            d_scores_temp, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                               
          d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            TEMP_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      for (int i = 0; i < TEST_KEY_NUM; i++) {\n        if (i < 4) {\n          ASSERT_EQ(h_keys_temp.end(),\n                    std::find(h_keys_temp.begin(), h_keys_temp.end(),\n                              h_keys_test[i]));\n        } else {\n          ASSERT_NE(h_keys_temp.end(),\n                    std::find(h_keys_temp.begin(), h_keys_temp.end(),\n                              h_keys_test[i]));\n        }\n      }\n      for (int i = 0; i < TEMP_KEY_NUM; i++) {\n        if (h_keys_temp[i] == h_keys_test[4])\n          ASSERT_EQ(h_scores_temp[i], h_scores_test[4]);\n        if (h_keys_temp[i] == h_keys_test[5])\n          ASSERT_EQ(h_scores_temp[i], h_scores_test[5]);\n        if (h_keys_temp[i] == h_keys_test[6])\n          ASSERT_EQ(h_scores_temp[i], h_scores_test[6]);\n        if (h_keys_temp[i] == h_keys_test[7])\n          ASSERT_EQ(h_scores_temp[i], h_scores_test[7]);\n\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFree(d_keys_temp));\n  CUDA_CHECK(cudaFree(d_scores_temp));\n  CUDA_CHECK(cudaFree(d_vectors_temp));\n\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\ntemplate <typename K, typename V, typename S, typename Table, size_t dim = 64>\nvoid 
CheckAssignOnEpochLfu(Table* table,\n                           test_util::KVMSBuffer<K, V, S>* data_buffer,\n                           test_util::KVMSBuffer<K, V, S>* evict_buffer,\n                           test_util::KVMSBuffer<K, V, S>* pre_data_buffer,\n                           size_t len, cudaStream_t stream, TableOptions& opt,\n                           unsigned int global_epoch) {\n  std::map<K, test_util::ValueArray<V, dim>> values_map_before_insert;\n  std::map<K, test_util::ValueArray<V, dim>> values_map_after_insert;\n\n  std::unordered_map<K, S> scores_map_before_insert;\n  std::map<K, S> scores_map_after_insert;\n\n  std::map<K, S> scores_map_current_batch;\n  std::map<K, S> scores_map_current_evict;\n\n  K* keys = data_buffer->keys_ptr();\n  V* values = data_buffer->values_ptr();\n  S* scores = data_buffer->scores_ptr();\n\n  K* evicted_keys = evict_buffer->keys_ptr();\n  V* evicted_values = evict_buffer->values_ptr();\n  S* evicted_scores = evict_buffer->scores_ptr();\n\n  for (size_t i = 0; i < len; i++) {\n    scores_map_current_batch[data_buffer->keys_ptr(false)[i]] =\n        data_buffer->scores_ptr(false)[i];\n  }\n\n  K* h_tmp_keys = nullptr;\n  V* h_tmp_values = nullptr;\n  S* h_tmp_scores = nullptr;\n  bool* h_tmp_founds = nullptr;\n\n  K* d_tmp_keys = nullptr;\n  V* d_tmp_values = nullptr;\n  S* d_tmp_scores = nullptr;\n  bool* d_tmp_founds = nullptr;\n\n  size_t table_size_before = table->size(stream);\n  size_t cap = table_size_before + len;\n\n  CUDA_CHECK(cudaMallocAsync(&d_tmp_keys, cap * sizeof(K), stream));\n  CUDA_CHECK(cudaMemsetAsync(d_tmp_keys, 0, cap * sizeof(K), stream));\n  CUDA_CHECK(cudaMallocAsync(&d_tmp_values, cap * dim * sizeof(V), stream));\n  CUDA_CHECK(cudaMemsetAsync(d_tmp_values, 0, cap * dim * sizeof(V), stream));\n  CUDA_CHECK(cudaMallocAsync(&d_tmp_scores, cap * sizeof(S), stream));\n  CUDA_CHECK(cudaMemsetAsync(d_tmp_scores, 0, cap * sizeof(S), stream));\n  CUDA_CHECK(cudaMallocAsync(&d_tmp_founds, cap * 
sizeof(bool), stream));\n  CUDA_CHECK(cudaMemsetAsync(d_tmp_founds, 0, cap * sizeof(bool), stream));\n  h_tmp_keys = (K*)malloc(cap * sizeof(K));\n  h_tmp_values = (V*)malloc(cap * dim * sizeof(V));\n  h_tmp_scores = (S*)malloc(cap * sizeof(S));\n  h_tmp_founds = (bool*)malloc(cap * sizeof(bool));\n\n  size_t table_size_verify0 = table->export_batch(\n      table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);\n  ASSERT_EQ(table_size_before, table_size_verify0);\n\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,\n                             table_size_before * sizeof(K),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,\n                             table_size_before * dim * sizeof(V),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,\n                             table_size_before * sizeof(S),\n                             cudaMemcpyDeviceToHost, stream));\n\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys + table_size_before, keys,\n                             len * sizeof(K), cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values + table_size_before * dim, values,\n                             len * dim * sizeof(V), cudaMemcpyDeviceToHost,\n                             stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores + table_size_before, scores,\n                             len * sizeof(S), cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n\n  for (size_t i = 0; i < cap; i++) {\n    test_util::ValueArray<V, dim>* vec =\n        reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +\n                                                         i * dim);\n    values_map_before_insert[h_tmp_keys[i]] = *vec;\n  }\n\n  for (size_t i = 0; i < table_size_before; i++) {\n    scores_map_before_insert[h_tmp_keys[i]] = h_tmp_scores[i];\n  }\n\n  
table->set_global_epoch(global_epoch);\n  table->assign(len, keys, values, scores, stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n\n  {\n    size_t table_size_verify1 = table->export_batch(\n        table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);\n\n    CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,\n                               table_size_before * sizeof(K),\n                               cudaMemcpyDeviceToHost, stream));\n    CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,\n                               table_size_before * dim * sizeof(V),\n                               cudaMemcpyDeviceToHost, stream));\n    CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,\n                               table_size_before * sizeof(S),\n                               cudaMemcpyDeviceToHost, stream));\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(table_size_verify1, table_size_before);\n\n    size_t score_error_cnt = 0;\n\n    for (int64_t i = table_size_before - 1; i >= 0; i--) {\n      test_util::ValueArray<V, dim>* vec =\n          reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +\n                                                           i * dim);\n      values_map_after_insert[h_tmp_keys[i]] = *vec;\n      scores_map_after_insert[h_tmp_keys[i]] = h_tmp_scores[i];\n    }\n\n    for (auto it : scores_map_current_batch) {\n      const K key = it.first;\n      const K score = it.second;\n      S current_score = scores_map_after_insert[key];\n      S score_before_insert = 0;\n      if (scores_map_before_insert.find(key) !=\n          scores_map_before_insert.end()) {\n        score_before_insert = scores_map_before_insert[key];\n        bool valid =\n            ((current_score >> 32) == global_epoch) &&\n            ((current_score & 0xFFFFFFFF) ==\n             ((0xFFFFFFFF & score_before_insert) + (0xFFFFFFFF & score)));\n\n        if (!valid) {\n          score_error_cnt++;\n        }\n      
}\n    }\n    std::cout << \"Check assign behavior got \"\n              << \", score_error_cnt: \" << score_error_cnt\n              << \", while len: \" << len << std::endl;\n    ASSERT_EQ(score_error_cnt, 0);\n  }\n\n  for (int64_t i = 0; i < table_size_before; i++) {\n    values_map_before_insert[h_tmp_keys[i]] =\n        values_map_after_insert[h_tmp_keys[i]];\n    scores_map_before_insert[h_tmp_keys[i]] =\n        scores_map_after_insert[h_tmp_keys[i]];\n  }\n  values_map_after_insert.clear();\n  scores_map_after_insert.clear();\n\n  auto start = std::chrono::steady_clock::now();\n  size_t filtered_len = table->insert_and_evict(\n      len, keys, values,\n      (Table::evict_strategy == EvictStrategy::kLru ||\n       Table::evict_strategy == EvictStrategy::kEpochLru)\n          ? nullptr\n          : scores,\n      evicted_keys, evicted_values, evicted_scores, stream);\n  evict_buffer->SyncData(false, stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  auto end = std::chrono::steady_clock::now();\n  auto diff = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);\n\n  for (size_t i = 0; i < filtered_len; i++) {\n    scores_map_current_evict[evict_buffer->keys_ptr(false)[i]] =\n        evict_buffer->scores_ptr(false)[i];\n  }\n\n  float dur = diff.count();\n\n  size_t table_size_after = table->size(stream);\n  size_t table_size_verify1 = table->export_batch(\n      table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);\n\n  ASSERT_EQ(table_size_verify1, table_size_after);\n\n  size_t new_cap = table_size_after + filtered_len;\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,\n                             table_size_after * sizeof(K),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,\n                             table_size_after * dim * sizeof(V),\n                             cudaMemcpyDeviceToHost, stream));\n  
CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,\n                             table_size_after * sizeof(S),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys + table_size_after, evicted_keys,\n                             filtered_len * sizeof(K), cudaMemcpyDeviceToHost,\n                             stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values + table_size_after * dim,\n                             evicted_values, filtered_len * dim * sizeof(V),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores + table_size_after, evicted_scores,\n                             filtered_len * sizeof(S), cudaMemcpyDeviceToHost,\n                             stream));\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n\n  size_t key_miss_cnt = 0;\n  size_t value_diff_cnt = 0;\n  size_t score_error_cnt1 = 0;\n  size_t score_error_cnt2 = 0;\n\n  for (int64_t i = new_cap - 1; i >= 0; i--) {\n    test_util::ValueArray<V, dim>* vec =\n        reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +\n                                                         i * dim);\n    values_map_after_insert[h_tmp_keys[i]] = *vec;\n    scores_map_after_insert[h_tmp_keys[i]] = h_tmp_scores[i];\n    if (i >= (new_cap - filtered_len)) {\n      bool valid = ((h_tmp_scores[i] >> 32) < (global_epoch - 2));\n      if (!valid) {\n        score_error_cnt1++;\n      }\n    }\n  }\n\n  for (auto it : scores_map_current_batch) {\n    const K key = it.first;\n    const K score = it.second;\n    S current_score = scores_map_after_insert[key];\n    S score_before_insert = 0;\n    if (values_map_after_insert.find(key) != values_map_after_insert.end() &&\n        scores_map_current_evict.find(key) == scores_map_current_evict.end()) {\n      score_before_insert = scores_map_before_insert[key];\n    }\n    bool valid = ((current_score >> 32) == global_epoch) &&\n                 ((current_score 
& 0xFFFFFFFF) ==\n                  ((0xFFFFFFFF & score_before_insert) + (0xFFFFFFFF & score)));\n\n    if (!valid) {\n      score_error_cnt2++;\n    }\n  }\n\n  for (auto& it : values_map_before_insert) {\n    if (values_map_after_insert.find(it.first) ==\n        values_map_after_insert.end()) {\n      ++key_miss_cnt;\n      continue;\n    }\n    test_util::ValueArray<V, dim>& vec0 = it.second;\n    test_util::ValueArray<V, dim>& vec1 = values_map_after_insert.at(it.first);\n    for (size_t j = 0; j < dim; j++) {\n      if (vec0[j] != vec1[j]) {\n        ++value_diff_cnt;\n        break;\n      }\n    }\n  }\n\n  std::cout << \"Check insert_and_evict behavior got \"\n            << \"key_miss_cnt: \" << key_miss_cnt\n            << \", value_diff_cnt: \" << value_diff_cnt\n            << \", score_error_cnt1: \" << score_error_cnt1\n            << \", score_error_cnt2: \" << score_error_cnt2\n            << \", while table_size_before: \" << table_size_before\n            << \", while table_size_after: \" << table_size_after\n            << \", while len: \" << len << std::endl;\n\n  ASSERT_EQ(key_miss_cnt, 0);\n  ASSERT_EQ(value_diff_cnt, 0);\n  ASSERT_EQ(score_error_cnt1, 0);\n  ASSERT_EQ(score_error_cnt2, 0);\n\n  CUDA_CHECK(cudaFreeAsync(d_tmp_keys, stream));\n  CUDA_CHECK(cudaFreeAsync(d_tmp_values, stream));\n  CUDA_CHECK(cudaFreeAsync(d_tmp_scores, stream));\n  CUDA_CHECK(cudaFreeAsync(d_tmp_founds, stream));\n  free(h_tmp_keys);\n  free(h_tmp_values);\n  free(h_tmp_scores);\n  free(h_tmp_founds);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n}\n\nvoid test_assign_advanced_on_epochlfu(size_t max_hbm_for_vectors) {\n  const size_t U = 1024 * 1024;\n  const size_t B = 100000;\n  constexpr size_t dim = 16;\n\n  TableOptions opt;\n\n  opt.max_capacity = U;\n  opt.init_capacity = U;\n  opt.max_hbm_for_vectors = U * dim * sizeof(V);\n  opt.max_bucket_size = 128;\n  opt.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = 
nv::merlin::HashTable<K, V, S, EvictStrategy::kEpochLfu>;\n  opt.dim = dim;\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  std::unique_ptr<Table> table = std::make_unique<Table>();\n  table->init(opt);\n\n  test_util::KVMSBuffer<K, V, S> evict_buffer;\n  evict_buffer.Reserve(B, dim, stream);\n  evict_buffer.ToZeros(stream);\n\n  test_util::KVMSBuffer<K, V, S> data_buffer;\n  test_util::KVMSBuffer<K, V, S> pre_data_buffer;\n  data_buffer.Reserve(B, dim, stream);\n  pre_data_buffer.Reserve(B, dim, stream);\n\n  size_t offset = 0;\n  int freq_range = 100;\n  float repeat_rate = 0.9;\n  for (unsigned int global_epoch = 1; global_epoch <= 20; global_epoch++) {\n    repeat_rate = global_epoch <= 1 ? 0.0 : 0.1;\n    if (global_epoch <= 1) {\n      test_util::create_random_keys_advanced<K, S, V>(\n          dim, data_buffer.keys_ptr(false), data_buffer.scores_ptr(false),\n          data_buffer.values_ptr(false), (int)B, B * 32, freq_range);\n    } else {\n      test_util::create_random_keys_advanced<K, S, V>(\n          dim, data_buffer.keys_ptr(false), pre_data_buffer.keys_ptr(false),\n          data_buffer.scores_ptr(false), data_buffer.values_ptr(false), (int)B,\n          B * 32, freq_range, repeat_rate);\n    }\n    data_buffer.SyncData(true, stream);\n    if (global_epoch <= 1) {\n      pre_data_buffer.CopyFrom(data_buffer, stream);\n    }\n\n    CheckAssignOnEpochLfu<K, V, S, Table, dim>(table.get(), &data_buffer,\n                                               &evict_buffer, &pre_data_buffer,\n                                               B, stream, opt, global_epoch);\n\n    pre_data_buffer.CopyFrom(data_buffer, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    offset += B;\n  }\n}\n\nvoid test_evict_strategy_customized_correct_rate(size_t max_hbm_for_vectors) {\n  constexpr uint64_t BATCH_SIZE = 1024 * 1024ul;\n  constexpr uint64_t STEPS = 128;\n  constexpr uint64_t MAX_BUCKET_SIZE = 128;\n  constexpr uint64_t 
INIT_CAPACITY = BATCH_SIZE * STEPS;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t TEST_TIMES = 1;\n  float expected_correct_rate = 0.964;\n  const int rounds = 12;\n\n  TableOptions options;\n\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_bucket_size = MAX_BUCKET_SIZE;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  K* h_keys_base = test_util::HostBuffer<K>(BATCH_SIZE).ptr();\n  S* h_scores_base = test_util::HostBuffer<S>(BATCH_SIZE).ptr();\n  V* h_vectors_base = test_util::HostBuffer<V>(BATCH_SIZE * options.dim).ptr();\n\n  K* h_keys_temp = test_util::HostBuffer<K>(MAX_CAPACITY).ptr();\n  S* h_scores_temp = test_util::HostBuffer<S>(MAX_CAPACITY).ptr();\n  V* h_vectors_temp =\n      test_util::HostBuffer<V>(MAX_CAPACITY * options.dim).ptr();\n\n  K* d_keys_temp;\n  S* d_scores_temp = nullptr;\n  V* d_vectors_temp;\n\n  CUDA_CHECK(cudaMalloc(&d_keys_temp, MAX_CAPACITY * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores_temp, MAX_CAPACITY * sizeof(S)));\n  CUDA_CHECK(\n      cudaMalloc(&d_vectors_temp, MAX_CAPACITY * sizeof(V) * options.dim));\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  size_t total_size = 0;\n  size_t global_start_key = 100000;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n    size_t start_key = global_start_key;\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    for (int r = 0; r < rounds; r++) {\n      size_t expected_min_key = global_start_key + INIT_CAPACITY * r;\n      size_t expected_max_key = global_start_key + INIT_CAPACITY * (r + 1) - 1;\n      size_t expected_table_size =\n          (r == 0) ? 
size_t(expected_correct_rate * INIT_CAPACITY)\n                   : INIT_CAPACITY;\n\n      for (int s = 0; s < STEPS; s++) {\n        test_util::create_continuous_keys<K, S, V, DIM>(\n            h_keys_base, h_scores_base, h_vectors_base, BATCH_SIZE, start_key);\n        start_key += BATCH_SIZE;\n\n        CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base, BATCH_SIZE * sizeof(K),\n                              cudaMemcpyHostToDevice));\n        CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base,\n                              BATCH_SIZE * sizeof(S), cudaMemcpyHostToDevice));\n        CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base,\n                              BATCH_SIZE * sizeof(V) * options.dim,\n                              cudaMemcpyHostToDevice));\n        table->assign(BATCH_SIZE, d_keys_temp, d_vectors_temp, d_scores_temp,\n                      stream);\n        table->find_or_insert(BATCH_SIZE, d_keys_temp, d_vectors_temp,\n                              d_scores_temp, stream);\n        CUDA_CHECK(cudaStreamSynchronize(stream));\n      }\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_GE(total_size, expected_table_size);\n      ASSERT_EQ(MAX_CAPACITY, table->capacity());\n\n      size_t dump_counter = table->export_batch(\n          MAX_CAPACITY, 0, d_keys_temp, d_vectors_temp, d_scores_temp, stream);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp, d_keys_temp, MAX_CAPACITY * sizeof(K),\n                            cudaMemcpyDeviceToHost));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp, d_scores_temp,\n                            MAX_CAPACITY * sizeof(S), cudaMemcpyDeviceToHost));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp, d_vectors_temp,\n                            MAX_CAPACITY * sizeof(V) * options.dim,\n                            cudaMemcpyDeviceToHost));\n\n      size_t bigger_score_counter = 0;\n      K max_key = 0;\n      size_t values_error_counter = 0;\n      for (int i = 0; i < 
dump_counter; i++) {\n        ASSERT_EQ(h_keys_temp[i], h_scores_temp[i]);\n        max_key = std::max(max_key, h_keys_temp[i]);\n        if (h_scores_temp[i] >= expected_min_key) bigger_score_counter++;\n        for (int j = 0; j < options.dim; j++) {\n          if (h_vectors_temp[i * options.dim + j] !=\n              static_cast<float>(h_keys_temp[i] * 0.00001)) {\n            values_error_counter++;\n          }\n        }\n      }\n\n      ASSERT_EQ(values_error_counter, 0);\n      float correct_rate = (bigger_score_counter * 1.0) / MAX_CAPACITY;\n      std::cout << std::setprecision(3) << \"[Round \" << r << \"]\"\n                << \"correct_rate=\" << correct_rate << std::endl;\n      ASSERT_GE(max_key, expected_max_key);\n      ASSERT_GE(correct_rate, expected_correct_rate);\n    }\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFree(d_keys_temp));\n  CUDA_CHECK(cudaFree(d_scores_temp));\n  CUDA_CHECK(cudaFree(d_vectors_temp));\n\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_find_or_insert_multi_threads(size_t max_hbm_for_vectors,\n                                       const float BATCH_0_RATIO,\n                                       const float BATCH_1_RATIO,\n                                       bool capacity_silent = true) {\n  constexpr uint64_t THREAD_N = 64UL;\n  const uint64_t BATCH_0_SIZE = static_cast<uint64_t>(THREAD_N * BATCH_0_RATIO);\n  const uint64_t BATCH_1_SIZE = static_cast<uint64_t>(THREAD_N * BATCH_1_RATIO);\n  const uint64_t BATCH_2_SIZE = THREAD_N - BATCH_0_SIZE - BATCH_1_SIZE;\n\n  constexpr uint64_t INIT_CAPACITY = 32 * 1024 * 1024UL;\n  constexpr uint64_t MAX_CAPACITY = 128 * 1024 * 1024UL;\n  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;\n  constexpr uint64_t KEY_NUM = 1 * 1024 * 1024UL;\n\n  std::vector<std::thread> threads;\n\n  TableOptions options;\n\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  
options.max_load_factor = 0.50f;\n  options.max_bucket_size = BUCKET_MAX_SIZE;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;\n\n  std::shared_ptr<Table> table = std::make_shared<Table>();\n  table->init(options);\n  // assert every key is different\n  auto worker1 = [&table, KEY_NUM, options, capacity_silent](int batch,\n                                                             int task_n) {\n    K* h_keys;\n    V* h_vectors;\n    bool* h_found;\n\n    size_t current_capacity = table->capacity();\n\n    CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));\n    CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));\n\n    K* d_keys;\n    V* d_vectors;\n    bool* d_found;\n\n    CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));\n    CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n\n    cudaStream_t stream;\n    CUDA_CHECK(cudaStreamCreate(&stream));\n\n    test_util::create_random_keys<K, S, V, DIM>(h_keys, nullptr, h_vectors,\n                                                KEY_NUM);\n    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n\n    table->assign(KEY_NUM, d_keys, d_vectors, nullptr, stream);\n    {\n      V** d_vectors_ptr = nullptr;\n      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n      table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, nullptr, stream);\n      test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,\n                               
stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      CUDA_CHECK(cudaFree(d_vectors_ptr));\n    }\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    {\n      int found_num = 0;\n      CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));\n      CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                            cudaMemcpyDeviceToHost));\n      for (int i = 0; i < KEY_NUM; i++) {\n        if (h_found[i]) {\n          found_num++;\n        }\n      }\n      ASSERT_EQ(found_num, 0);\n    }\n\n    table->find_or_insert(KEY_NUM, d_keys, d_vectors, nullptr, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    {\n      V** d_vectors_ptr = nullptr;\n      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n      table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, nullptr, stream);\n      test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,\n                               stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      CUDA_CHECK(cudaFree(d_vectors_ptr));\n    }\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    int found_num = 0;\n\n    CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));\n    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n    thread_local bool print_unequal{false};\n    thread_local uint64_t err_times{0};\n    for (int i = 0; i < KEY_NUM; i++) {\n      if (h_found[i]) {\n        found_num++;\n        for (int j = 0; j < options.dim; j++) 
{\n          if (batch == 2) {\n            if (h_vectors[i * options.dim + j] !=\n                static_cast<float>(h_keys[i] * 0.00001)) {\n              if (!print_unequal) {\n                std::cout << \" [Thread \" << task_n << \"]\\t\";\n                UNEQUAL_EXPR(h_vectors[i * options.dim + j],\n                             static_cast<float>(h_keys[i] * 0.00001));\n                print_unequal = true;\n              }\n              err_times += 1;\n            }\n          } else {\n            ASSERT_EQ(h_vectors[i * options.dim + j],\n                      static_cast<float>(h_keys[i] * 0.00001));\n          }\n        }\n      }\n    }\n\n    bool print_thread_id{false};\n    if (batch == 0 || batch == 1) {\n      ASSERT_EQ(found_num, KEY_NUM);\n      ASSERT_EQ(err_times, 0);\n    } else {\n      if (found_num != KEY_NUM or err_times != 0) {\n        std::cout << \" [Thread \" << task_n << \"]\\t\"\n                  << \"Number of keys(insert/found/error) : \" << \"(\" << KEY_NUM\n                  << \"/\" << found_num << \"/\" << err_times << \") \\t\";\n        print_thread_id = true;\n      }\n    }\n    if (current_capacity != table->capacity() && !capacity_silent) {\n      if (!print_thread_id) std::cout << \" [Thread \" << task_n << \"]\\t\";\n\n      std::cout << \"The capacity changed from \" << current_capacity << \" to \"\n                << table->capacity() << std::endl;\n    } else if (print_thread_id) {\n      std::cout << std::endl;\n    }\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    CUDA_CHECK(cudaStreamDestroy(stream));\n\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n\n    CUDA_CHECK(cudaFreeHost(h_keys));\n    CUDA_CHECK(cudaFreeHost(h_found));\n    CUDA_CHECK(cudaFreeHost(h_vectors));\n\n    CUDA_CHECK(cudaFree(d_keys));\n    CUDA_CHECK(cudaFree(d_vectors));\n    CUDA_CHECK(cudaFree(d_found));\n    
CUDA_CHECK(cudaDeviceSynchronize());\n\n    CudaCheckError();\n  };\n  auto worker2 = [&table, KEY_NUM, options, capacity_silent](int batch,\n                                                             int task_n) {\n    K* h_keys;\n    V* h_vectors;\n    bool* h_found;\n\n    size_t current_capacity = table->capacity();\n\n    CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));\n    CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));\n\n    K* d_keys;\n    V* d_vectors;\n    V* d_new_vectors;\n    bool* d_found;\n\n    CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));\n    CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMalloc(&d_new_vectors, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n\n    cudaStream_t stream;\n    CUDA_CHECK(cudaStreamCreate(&stream));\n\n    test_util::create_random_keys<K, S, V, DIM>(h_keys, nullptr, h_vectors,\n                                                KEY_NUM);\n    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n    CUDA_CHECK(cudaMemset(d_new_vectors, 2, KEY_NUM * sizeof(V) * options.dim));\n\n    table->find_or_insert(KEY_NUM, d_keys, d_vectors, nullptr, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    table->assign(KEY_NUM, d_keys, d_new_vectors, nullptr, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    {\n      V** d_vectors_ptr = nullptr;\n      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n      table->find(KEY_NUM, 
d_keys, d_vectors_ptr, d_found, nullptr, stream);\n      test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,\n                               stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      CUDA_CHECK(cudaFree(d_vectors_ptr));\n    }\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    int found_num = 0;\n\n    CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));\n    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n    thread_local bool print_unequal{false};\n    thread_local uint64_t err_times{0};\n    uint32_t i_value = 0x2020202;\n    for (int i = 0; i < KEY_NUM; i++) {\n      if (h_found[i]) {\n        found_num++;\n        for (int j = 0; j < options.dim; j++) {\n          if (batch == 2) {\n            if (h_vectors[i * options.dim + j] !=\n                *(reinterpret_cast<float*>(&i_value))) {\n              if (!print_unequal) {\n                std::cout << \" [Thread \" << task_n << \"]\\t\";\n                UNEQUAL_EXPR(h_vectors[i * options.dim + j],\n                             *(reinterpret_cast<float*>(&i_value)));\n                print_unequal = true;\n              }\n              err_times += 1;\n            }\n          } else {\n            ASSERT_EQ(h_vectors[i * options.dim + j],\n                      *(reinterpret_cast<float*>(&i_value)));\n          }\n        }\n      }\n    }\n\n    bool print_thread_id{false};\n    if (batch == 0 || batch == 1) {\n      ASSERT_EQ(found_num, KEY_NUM);\n      ASSERT_EQ(err_times, 0);\n    } else {\n      if (found_num != KEY_NUM or 
err_times != 0) {\n        std::cout << \" [Thread \" << task_n << \"]\\t\"\n                  << \"Number of keys(insert/found/error) : \" << \"(\" << KEY_NUM\n                  << \"/\" << found_num << \"/\" << err_times << \") \\t\";\n        print_thread_id = true;\n      }\n    }\n    if (current_capacity != table->capacity() && !capacity_silent) {\n      if (!print_thread_id) std::cout << \" [Thread \" << task_n << \"]\\t\";\n\n      std::cout << \"The capacity changed from \" << current_capacity << \" to \"\n                << table->capacity() << std::endl;\n    } else if (print_thread_id) {\n      std::cout << std::endl;\n    }\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    CUDA_CHECK(cudaStreamDestroy(stream));\n\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n\n    CUDA_CHECK(cudaFreeHost(h_keys));\n    CUDA_CHECK(cudaFreeHost(h_found));\n    CUDA_CHECK(cudaFreeHost(h_vectors));\n\n    CUDA_CHECK(cudaFree(d_keys));\n    CUDA_CHECK(cudaFree(d_vectors));\n    CUDA_CHECK(cudaFree(d_new_vectors));\n    CUDA_CHECK(cudaFree(d_found));\n    CUDA_CHECK(cudaDeviceSynchronize());\n\n    CudaCheckError();\n  };\n\n  /* the table is relative idle, and assume there is no eviction */\n  int batch = 0;\n  std::cout << \"[Batch 0] \" << BATCH_0_SIZE << \" threads\\n\";\n  for (int i = 0; i < BATCH_0_SIZE; i += 2) {\n    threads.emplace_back(std::thread(worker1, batch, i));\n    threads.emplace_back(std::thread(worker2, batch, i + 1));\n  }\n  for (auto& th : threads) {\n    th.join();\n  }\n  threads.clear();\n\n  /* test the correct of APIs serially */\n  batch = 1;\n  std::cout << \"[Batch 1] \" << BATCH_1_SIZE << \" threads\\n\";\n  for (int i = BATCH_0_SIZE; i < BATCH_0_SIZE + BATCH_1_SIZE; i += 2) {\n    auto th = std::thread(worker1, batch, i);\n    th.join();\n    th = std::thread(worker2, batch, i + 1);\n    th.join();\n  }\n\n  /* 
eviction may occur */\n  batch = 2;\n  std::cout << \"[Batch 2] \" << BATCH_2_SIZE << \" threads\\n\";\n  for (int i = BATCH_0_SIZE + BATCH_1_SIZE; i < THREAD_N; i += 2) {\n    threads.emplace_back(std::thread(worker1, batch, i));\n    threads.emplace_back(std::thread(worker2, batch, i + 1));\n  }\n  for (auto& th : threads) {\n    th.join();\n  }\n  ASSERT_EQ(table->capacity(), MAX_CAPACITY);\n}\n\n/*\n * This test focus on the compatibility of the value type.\n * In each batch, the batch size is less than the bucket capacity, so it's\n *   always true that the keys inserted in the last batch must exist in HKV.\n * Each kernel only be launched on one SM,\n *   therefore exclude the check of consistency across SMs.\n */\ntemplate <typename V, int Dim>\nvoid test_value_type_hbm_mode() {\n  std::cout << \"size of V: \" << sizeof(V) << \", dim: \" << Dim << std::endl;\n  using Table =\n      nv::merlin::HashTable<K, V, S, nv::merlin::EvictStrategy::kCustomized>;\n  using TableOptions = nv::merlin::HashTableOptions;\n  constexpr uint64_t BUCKET_MAX_SIZE = 128;\n  constexpr uint64_t INIT_CAPACITY = 256UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t KEY_NUM = BUCKET_MAX_SIZE;\n  constexpr uint64_t TEST_TIMES = 2UL;\n\n  K* h_keys;\n  S* h_scores;\n  V* h_vectors;\n  bool* h_found;\n\n  TableOptions options;\n\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = Dim;\n  options.max_bucket_size = BUCKET_MAX_SIZE;\n  options.max_hbm_for_vectors = nv::merlin::GB(16);\n\n  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));\n\n  K* d_keys;\n  S* d_scores;\n  V* d_vectors;\n  bool* d_found;\n\n  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * 
sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  std::unique_ptr<Table> table = std::make_unique<Table>();\n  table->init(options);\n\n  uint64_t table_size_before = 0;\n  uint64_t table_size_after = 0;\n  uint64_t found_num = 0;\n  uint64_t value_diff_cnt = 0;\n  uint64_t table_size_verify = 0;\n\n  table_size_verify = table->size(stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  ASSERT_EQ(table_size_verify, 0);\n\n  K start_key = 1;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    for (K i = 0; i < KEY_NUM; i++) {\n      h_keys[i] = start_key + static_cast<K>(i);\n      h_scores[i] = h_keys[i];\n      for (size_t j = 0; j < options.dim; j++) {\n        h_vectors[i * options.dim + j] = static_cast<V>(h_keys[i] * 0.1);\n      }\n    }\n    start_key += KEY_NUM;\n\n    // Step1 : insert new Keys.\n    table_size_before = table->size(stream);\n    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyHostToDevice));\n    table->find_or_insert(KEY_NUM, d_keys, d_vectors, d_scores, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    table_size_after = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_LE(table_size_after, table_size_before + KEY_NUM);\n\n    // Step2 : find new keys.\n    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMemset(d_scores, 0, KEY_NUM * sizeof(S)));\n    table->find_or_insert(KEY_NUM, d_keys, d_vectors, d_scores, stream);\n    
CUDA_CHECK(cudaStreamSynchronize(stream));\n    table_size_verify = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(table_size_verify, table_size_after);\n    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n    found_num = 0;\n    value_diff_cnt = 0;\n    for (int i = 0; i < KEY_NUM; i++) {\n      if (h_vectors[i * options.dim] == static_cast<V>(h_keys[i] * 0.1))\n        ++found_num;\n      for (int j = 0; j < options.dim; j++) {\n        if (h_vectors[i * options.dim + j] != static_cast<V>(h_keys[i] * 0.1)) {\n          ++value_diff_cnt;\n          break;\n        }\n      }\n    }\n    ASSERT_EQ(found_num, KEY_NUM);\n    ASSERT_EQ(value_diff_cnt, 0);\n    std::cout << \"Check find_or_insert behavior got \"\n              << \" key_miss_cnt: \" << KEY_NUM - found_num\n              << \" value_diff_cnt: \" << value_diff_cnt\n              << \" while table_size_before: \" << table_size_before\n              << \", while table_size_after: \" << table_size_after\n              << \", while len: \" << KEY_NUM << std::endl;\n\n    // Step3 : update old keys.\n    for (int i = 0; i < KEY_NUM; i++) {\n      h_scores[i] = h_keys[i];\n      for (int j = 0; j < options.dim; j++) {\n        h_vectors[i * options.dim + j] = static_cast<V>(h_keys[i] * 0.2);\n      }\n    }\n    table_size_before = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyHostToDevice));\n    table->assign(KEY_NUM, d_keys, d_vectors, d_scores, stream);\n 
   CUDA_CHECK(cudaStreamSynchronize(stream));\n    table_size_after = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(table_size_before, table_size_after);\n\n    // Step4 : find old keys.\n    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMemset(d_scores, 0, KEY_NUM * sizeof(S)));\n    table->find_or_insert(KEY_NUM, d_keys, d_vectors, d_scores, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    table_size_verify = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(table_size_verify, table_size_after);\n    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n    found_num = 0;\n    value_diff_cnt = 0;\n    for (int i = 0; i < KEY_NUM; i++) {\n      if (h_vectors[i * options.dim] == static_cast<V>(h_keys[i] * 0.2))\n        ++found_num;\n      for (int j = 0; j < options.dim; j++) {\n        if (h_vectors[i * options.dim + j] != static_cast<V>(h_keys[i] * 0.2)) {\n          ++value_diff_cnt;\n          break;\n        }\n      }\n    }\n    ASSERT_EQ(found_num, KEY_NUM);\n    ASSERT_EQ(value_diff_cnt, 0);\n    std::cout << \"Check  assign        behavior got \"\n              << \" key_miss_cnt: \" << KEY_NUM - found_num\n              << \" value_diff_cnt: \" << value_diff_cnt\n              << \" while table_size_before: \" << table_size_before\n              << \", while table_size_after: \" << table_size_after\n              << \", while len: \" << KEY_NUM << std::endl;\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFreeHost(h_keys));\n  CUDA_CHECK(cudaFreeHost(h_scores));\n  CUDA_CHECK(cudaFreeHost(h_vectors));\n  CUDA_CHECK(cudaFreeHost(h_found));\n\n  CUDA_CHECK(cudaFree(d_keys));\n  
CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_vectors));\n  CUDA_CHECK(cudaFree(d_found));\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\ntemplate <typename K, typename V, typename S, typename Table, size_t dim = 64>\nvoid CheckFindOrInsertValues(Table* table, K* keys, V* values, S* scores,\n                             size_t len, cudaStream_t stream) {\n  std::map<K, test_util::ValueArray<V, dim>> map_before_insert;\n  std::map<K, test_util::ValueArray<V, dim>> map_after_insert;\n  K* h_tmp_keys = nullptr;\n  V* h_tmp_values = nullptr;\n  S* h_tmp_scores = nullptr;\n\n  K* d_tmp_keys = nullptr;\n  V* d_tmp_values = nullptr;\n  S* d_tmp_scores = nullptr;\n\n  size_t table_size_before = table->size(stream);\n  size_t cap = table_size_before + len;\n\n  CUDA_CHECK(cudaMallocAsync(&d_tmp_keys, cap * sizeof(K), stream));\n  CUDA_CHECK(cudaMemsetAsync(d_tmp_keys, 0, cap * sizeof(K), stream));\n  CUDA_CHECK(cudaMallocAsync(&d_tmp_values, cap * dim * sizeof(V), stream));\n  CUDA_CHECK(cudaMemsetAsync(d_tmp_values, 0, cap * dim * sizeof(V), stream));\n  CUDA_CHECK(cudaMallocAsync(&d_tmp_scores, cap * sizeof(S), stream));\n  CUDA_CHECK(cudaMemsetAsync(d_tmp_scores, 0, cap * sizeof(S), stream));\n  h_tmp_keys = (K*)malloc(cap * sizeof(K));\n  h_tmp_values = (V*)malloc(cap * dim * sizeof(V));\n  h_tmp_scores = (S*)malloc(cap * sizeof(S));\n\n  size_t table_size_verify0 = table->export_batch(\n      table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);\n  ASSERT_EQ(table_size_before, table_size_verify0);\n\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,\n                             table_size_before * sizeof(K),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,\n                             table_size_before * dim * sizeof(V),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,\n  
                           table_size_before * sizeof(S),\n                             cudaMemcpyDeviceToHost, stream));\n\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n\n  for (size_t i = 0; i < table_size_verify0; i++) {\n    test_util::ValueArray<V, dim>* vec =\n        reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +\n                                                         i * dim);\n    map_before_insert[h_tmp_keys[i]] = *vec;\n  }\n\n  auto start = std::chrono::steady_clock::now();\n  table->find_or_insert(len, keys, values, nullptr, stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  auto end = std::chrono::steady_clock::now();\n  auto diff = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);\n\n  float dur = diff.count();\n\n  size_t table_size_after = table->size(stream);\n  size_t table_size_verify1 = table->export_batch(\n      table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);\n\n  ASSERT_EQ(table_size_verify1, table_size_after);\n\n  size_t new_cap = table_size_after;\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,\n                             table_size_after * sizeof(K),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,\n                             table_size_after * dim * sizeof(V),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,\n                             table_size_after * sizeof(S),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  int64_t new_cap_K = (int64_t)new_cap;\n  for (int64_t i = new_cap_K - 1; i >= 0; i--) {\n    test_util::ValueArray<V, dim>* vec =\n        reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +\n                                                         i * dim);\n    map_after_insert[h_tmp_keys[i]] = *vec;\n  }\n\n  size_t 
value_diff_cnt = 0;\n  for (auto& it : map_after_insert) {\n    test_util::ValueArray<V, dim>& vec = map_after_insert.at(it.first);\n    for (size_t j = 0; j < dim; j++) {\n      if (vec[j] != static_cast<float>(it.first * 0.00001)) {\n        ++value_diff_cnt;\n        break;\n      }\n    }\n  }\n  ASSERT_EQ(value_diff_cnt, 0);\n  std::cout << \"Check find_or_insert behavior got \"\n            << \"value_diff_cnt: \" << value_diff_cnt\n            << \", while table_size_before: \" << table_size_before\n            << \", while table_size_after: \" << table_size_after\n            << \", while len: \" << len << std::endl;\n\n  CUDA_CHECK(cudaFreeAsync(d_tmp_keys, stream));\n  CUDA_CHECK(cudaFreeAsync(d_tmp_values, stream));\n  CUDA_CHECK(cudaFreeAsync(d_tmp_scores, stream));\n  free(h_tmp_keys);\n  free(h_tmp_values);\n  free(h_tmp_scores);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n}\n\nvoid test_find_or_insert_values_check(size_t max_hbm_for_vectors) {\n  const size_t U = 524288;\n  const size_t init_capacity = 1024;\n  const size_t B = 524288 + 13;\n  constexpr size_t dim = 64;\n\n  TableOptions opt;\n\n  opt.max_capacity = U;\n  opt.init_capacity = init_capacity;\n  opt.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;\n  opt.dim = 64;\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  std::unique_ptr<Table> table = std::make_unique<Table>();\n  table->init(opt);\n\n  test_util::KVMSBuffer<K, V, S> data_buffer;\n  data_buffer.Reserve(B, dim, stream);\n\n  size_t offset = 0;\n  S score = 0;\n  for (int i = 0; i < 20; i++) {\n    test_util::create_random_keys<K, S, V, dim>(\n        data_buffer.keys_ptr(false), data_buffer.scores_ptr(false),\n        data_buffer.values_ptr(false), (int)B, B * 16);\n    data_buffer.SyncData(true, stream);\n\n    CheckFindOrInsertValues<K, V, S, Table, dim>(\n        table.get(), data_buffer.keys_ptr(), 
data_buffer.values_ptr(),\n        data_buffer.scores_ptr(), B, stream);\n\n    offset += B;\n    score += 1;\n  }\n}\n\nTEST(FindOrInsertTest, test_export_batch_if) {\n  test_export_batch_if(16);\n  test_export_batch_if(0, 31);\n}\nTEST(FindOrInsertTest, test_find_or_insert_multi_threads) {\n  test_find_or_insert_multi_threads(16, 0.25f, 0.125f);\n  test_find_or_insert_multi_threads(16, 0.375f, 0.125f);\n  test_find_or_insert_multi_threads(0, 0.25f, 0.125f);\n  test_find_or_insert_multi_threads(0, 0.375f, 0.125f);\n}\nTEST(FindOrInsertTest, test_value_type_hbm_mode) {\n  test_value_type_hbm_mode<int8_t, 64>();\n  test_value_type_hbm_mode<int8_t, 256>();\n  test_value_type_hbm_mode<int8_t, 512>();\n\n  test_value_type_hbm_mode<uint8_t, 63>();\n  test_value_type_hbm_mode<uint8_t, 255>();\n  test_value_type_hbm_mode<uint8_t, 511>();\n\n  test_value_type_hbm_mode<int16_t, 32>();\n  test_value_type_hbm_mode<int16_t, 128>();\n  test_value_type_hbm_mode<int16_t, 256>();\n\n  test_value_type_hbm_mode<int, 16>();\n  test_value_type_hbm_mode<int, 64>();\n  test_value_type_hbm_mode<float, 128>();\n\n  test_value_type_hbm_mode<int64_t, 31>();\n  test_value_type_hbm_mode<double, 63>();\n}\nTEST(FindOrInsertTest, test_basic) {\n  test_basic(16, 61);\n  test_basic(0);\n}\nTEST(FindOrInsertTest, test_basic_when_full) {\n  test_basic_when_full(16);\n  test_basic_when_full(0, 41);\n}\nTEST(FindOrInsertTest, test_erase_if_pred) {\n  test_erase_if_pred(16);\n  test_erase_if_pred(0, 17);\n}\nTEST(FindOrInsertTest, test_rehash) {\n  test_rehash(16);\n  test_rehash(0, 22);\n}\nTEST(FindOrInsertTest, test_rehash_on_big_batch) {\n  test_rehash_on_big_batch(16, 37);\n  test_rehash_on_big_batch(0);\n}\nTEST(FindOrInsertTest, test_dynamic_rehash_on_multi_threads) {\n  test_dynamic_rehash_on_multi_threads(16, 22);\n  test_dynamic_rehash_on_multi_threads(0);\n}\nTEST(FindOrInsertTest, test_basic_for_cpu_io) {\n  test_basic_for_cpu_io(45);\n  test_basic_for_cpu_io();\n}\nTEST(FindOrInsertTest, 
test_evict_strategy_lru_basic) {\n  test_evict_strategy_lru_basic(16);\n  test_evict_strategy_lru_basic(0, 44);\n}\nTEST(FindOrInsertTest, test_evict_strategy_lfu_basic) {\n  test_evict_strategy_lfu_basic(16, 34);\n  test_evict_strategy_lfu_basic(0);\n}\nTEST(FindOrInsertTest, test_evict_strategy_epochlru_basic) {\n  test_evict_strategy_epochlru_basic(16, 41);\n  test_evict_strategy_epochlru_basic(0);\n}\nTEST(FindOrInsertTest, test_evict_strategy_epochlfu_basic) {\n  test_evict_strategy_epochlfu_basic(16, 42);\n  test_evict_strategy_epochlfu_basic(0);\n}\nTEST(FindOrInsertTest, test_evict_strategy_customized_basic) {\n  test_evict_strategy_customized_basic(16);\n  test_evict_strategy_customized_basic(0, 43);\n}\nTEST(FindOrInsertTest, test_evict_strategy_customized_advanced) {\n  test_evict_strategy_customized_advanced(16, 54);\n  test_evict_strategy_customized_advanced(0);\n}\nTEST(FindOrInsertTest, test_assign_advanced_on_epochlfu) {\n  test_assign_advanced_on_epochlfu(16);\n}\nTEST(FindOrInsertTest, test_evict_strategy_customized_correct_rate) {\n  // TODO(rhdong): remove the skip logic after the blossom CI issue is resolved.\n  const bool skip_hmem_check = (nullptr != std::getenv(\"IS_BLOSSOM_CI\"));\n  test_evict_strategy_customized_correct_rate(16);\n  if (!skip_hmem_check) {\n    test_evict_strategy_customized_correct_rate(0);\n  } else {\n    std::cout << \"The HMEM check is skipped in blossom CI!\" << std::endl;\n  }\n}\n\nTEST(FindOrInsertTest, test_find_or_insert_values_check) {\n  test_find_or_insert_values_check(16);\n  // TODO(rhdong): Add back when diff error issue fixed in hybrid mode.\n  test_find_or_insert_values_check(0);\n}\n"
  },
  {
    "path": "tests/find_with_missed_keys_test.cc.cu",
    "content": "/*\n * Copyright (c) 2024, NVIDIA CORPORATION.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n#include <gtest/gtest.h>\n#include <inttypes.h>\n#include <stdint.h>\n#include <stdio.h>\n#include <vector>\n#include \"merlin_hashtable.cuh\"\n#include \"test_util.cuh\"\n\nconstexpr size_t DIM = 16;\nusing K = uint64_t;\nusing V = float;\nusing S = uint64_t;\nusing EvictStrategy = nv::merlin::EvictStrategy;\nusing TableOptions = nv::merlin::HashTableOptions;\n\nvoid test_find(size_t max_hbm_for_vectors, size_t max_bucket_size,\n               double load_factor, bool pipeline_lookup, int key_start = 0) {\n  MERLIN_CHECK(load_factor >= 0.0 && load_factor <= 1.0,\n               \"Invalid `load_factor`\");\n\n  constexpr uint64_t INIT_CAPACITY = 1024 * 1024UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t KEY_NUM = INIT_CAPACITY;\n  constexpr uint64_t TEST_TIMES = 1;\n\n  K* h_keys;\n  S* h_scores;\n  V* h_vectors;\n  K* h_missed_keys;\n  int* h_missed_indices;\n\n  TableOptions options;\n  options.reserved_key_start_bit = key_start;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_hbm_for_vectors = nv::merlin::MB(max_hbm_for_vectors);\n  if (pipeline_lookup) {\n    options.max_bucket_size = 128;\n  } else {\n    options.max_bucket_size = 256;\n  }\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  
CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMallocHost(&h_missed_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMallocHost(&h_missed_indices, KEY_NUM * sizeof(int)));\n\n  K* d_keys;\n  S* d_scores;\n  V* d_vectors;\n  K* d_missed_keys;\n  int* d_missed_indices;\n  int* d_missed_size;\n\n  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_missed_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_missed_indices, KEY_NUM * sizeof(int)));\n  CUDA_CHECK(cudaMalloc(&d_missed_size, sizeof(int)));\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  int missed_size;\n  for (int i = 0; i < TEST_TIMES; ++i) {\n    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    test_util::create_random_keys<K, S, V, DIM>(h_keys, h_scores, h_vectors,\n                                                KEY_NUM);\n    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyHostToDevice));\n\n    Table table;\n    table.init(options);\n    size_t size = table.size(stream);\n    ASSERT_EQ(size, 0);\n\n    size_t insert_num = (double)KEY_NUM * load_factor;\n    table.insert_or_assign(insert_num, d_keys, d_vectors, d_scores, stream);\n    table.find(KEY_NUM, d_keys, d_vectors, d_missed_keys, d_missed_indices,\n               d_missed_size, d_scores, stream);\n    
CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    CUDA_CHECK(cudaMemcpy(&missed_size, d_missed_size, sizeof(int),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_missed_keys, d_missed_keys, missed_size * sizeof(K),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_missed_indices, d_missed_indices,\n                          missed_size * sizeof(int), cudaMemcpyDeviceToHost));\n\n    if (insert_num == 0) {\n      ASSERT_EQ(missed_size, KEY_NUM);\n    } else {\n      CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                            KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDeviceToHost));\n\n      ASSERT_TRUE(missed_size > 0 && missed_size < KEY_NUM);\n      std::vector<bool> founds(KEY_NUM, true);\n      // Check missed\n      for (int j = 0; j < missed_size; ++j) {\n        int idx = h_missed_indices[j];\n        ASSERT_TRUE(idx >= 0 && idx < KEY_NUM);\n        ASSERT_EQ(h_keys[idx], h_missed_keys[j]);\n        founds[idx] = false;\n      }\n      // Check hits\n      for (uint64_t j = 0; j < KEY_NUM; ++j) {\n        if (founds[j]) {\n          for (int k = 0; k < options.dim; ++k) {\n            ASSERT_EQ(h_vectors[j * options.dim + k],\n                      static_cast<float>(h_keys[j] * 0.00001));\n          }\n        }\n      }\n    }\n  }\n\n  CUDA_CHECK(cudaFreeHost(h_keys));\n  CUDA_CHECK(cudaFreeHost(h_scores));\n  CUDA_CHECK(cudaFreeHost(h_vectors));\n  CUDA_CHECK(cudaFreeHost(h_missed_keys));\n  CUDA_CHECK(cudaFreeHost(h_missed_indices));\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_vectors));\n  CUDA_CHECK(cudaFree(d_missed_keys));\n  CUDA_CHECK(cudaFree(d_missed_indices));\n  CUDA_CHECK(cudaFree(d_missed_size));\n\n  CudaCheckError();\n}\n\nTEST(FindTest, test_find_when_empty) {\n  // pure HMEM\n  test_find(0, 128, 0.0, true, 12);\n  test_find(0, 256, 0.0, false);\n  // hybrid\n  test_find(32, 128, 
0.0, true, 58);\n  test_find(32, 256, 0.0, false);\n  // pure HBM\n  test_find(1024, 128, 0.0, true);\n  test_find(1024, 256, 0.0, false, 12);\n}\n\nTEST(FindTest, test_find_when_full) {\n  // pure HMEM\n  test_find(0, 128, 1.0, true);\n  test_find(0, 256, 1.0, false);\n  // hybrid\n  test_find(32, 128, 1.0, true);\n  test_find(32, 256, 1.0, false, 60);\n  // pure HBM\n  test_find(1024, 128, 1.0, true);\n  test_find(1024, 256, 1.0, false);\n}\n\nTEST(FindTest, test_find_load_factor) {\n  // pure HMEM\n  test_find(0, 128, 0.2, true, 45);\n  test_find(0, 256, 0.2, false, 12);\n  // hybrid\n  test_find(32, 128, 0.2, true, 27);\n  test_find(32, 256, 0.2, false, 53);\n  // pure HBM\n  test_find(1024, 128, 0.2, true, 9);\n  test_find(1024, 256, 0.2, false, 38);\n\n  // pure HMEM\n  test_find(0, 128, 0.5, true, 21);\n  test_find(0, 256, 0.5, false, 46);\n  // hybrid\n  test_find(32, 128, 0.5, true, 31);\n  test_find(32, 256, 0.5, false, 59);\n  // pure HBM\n  test_find(1024, 128, 0.5, true, 4);\n  test_find(1024, 256, 0.5, false, 22);\n\n  // pure HMEM\n  test_find(0, 128, 0.75, true, 11);\n  test_find(0, 256, 0.75, false, 34);\n  // hybrid\n  test_find(32, 128, 0.75, true, 18);\n  test_find(32, 256, 0.75, false, 47);\n  // pure HBM\n  test_find(1024, 128, 0.75, true, 7);\n  test_find(1024, 256, 0.75, false, 29);\n}\n"
  },
  {
    "path": "tests/group_lock_test.cc.cu",
    "content": "/*\n * Copyright (c) 2023, NVIDIA CORPORATION.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n#include <gtest/gtest.h>\n#include <chrono>\n#include <system_error>\n#include <thread>\n#include <vector>\n#include \"merlin/group_lock.cuh\"\n\nusing namespace nv::merlin;\nusing namespace std::chrono_literals;\n\n// Test the basic functionality of the group_shared_mutex\nTEST(GroupSharedMutexTest, BasicFunctionality) {\n  group_shared_mutex mutex;\n  ASSERT_EQ(mutex.read_count(), 0);\n  ASSERT_EQ(mutex.update_count(), 0);\n\n  {\n    // Multiple reads can acquire the lock simultaneously\n    read_shared_lock read1(mutex);\n    ASSERT_EQ(mutex.read_count(), 1);\n    read_shared_lock read2(mutex);\n    ASSERT_EQ(mutex.read_count(), 2);\n  }\n  ASSERT_EQ(mutex.read_count(), 0);\n  ASSERT_EQ(mutex.update_count(), 0);\n\n  {\n    // A update is blocked by the reads\n    update_shared_lock update(mutex, std::defer_lock);\n    EXPECT_FALSE(update.owns_lock());\n    ASSERT_EQ(mutex.read_count(), 0);\n    ASSERT_EQ(mutex.update_count(), 0);\n    update.lock();\n    ASSERT_EQ(mutex.read_count(), 0);\n    ASSERT_EQ(mutex.update_count(), 1);\n    EXPECT_TRUE(update.owns_lock());\n  }\n  ASSERT_EQ(mutex.read_count(), 0);\n  ASSERT_EQ(mutex.update_count(), 0);\n\n  // A unique lock is also blocked by the reads\n  {\n    update_read_lock unique(mutex, std::defer_lock);\n    ASSERT_EQ(mutex.read_count(), 0);\n    ASSERT_EQ(mutex.update_count(), 
0);\n    EXPECT_FALSE(unique.owns_lock());\n    unique.lock();\n    EXPECT_TRUE(unique.owns_lock());\n    ASSERT_EQ(mutex.read_count(), 1);\n    ASSERT_EQ(mutex.update_count(), 1);\n\n    EXPECT_DEATH(unique.lock(), \"trying to lock twice!\");\n  }\n  ASSERT_EQ(mutex.read_count(), 0);\n  ASSERT_EQ(mutex.update_count(), 0);\n}\n\nTEST(GroupSharedMutexTest, AdvancedFunctionalitySingleStream) {\n  group_shared_mutex mutex;\n  bool multiple_read = false;\n  bool multiple_update = false;\n\n  // Test multiple reads\n  std::vector<std::thread> reads;\n  for (int i = 0; i < 50; ++i) {\n    reads.emplace_back([&]() {\n      read_shared_lock read(mutex);\n      EXPECT_TRUE(mutex.read_count() > 0);\n      if (mutex.read_count() > 1) multiple_read = true;\n      std::this_thread::sleep_for(1000ms);\n      ASSERT_EQ(mutex.update_count(), 0);\n    });\n  }\n\n  // Test multiple updates\n  std::vector<std::thread> updates;\n  for (int i = 0; i < 50; ++i) {\n    updates.emplace_back([&]() {\n      update_shared_lock update(mutex);\n      EXPECT_TRUE(mutex.update_count() > 0);\n      if (mutex.update_count() > 1) multiple_update = true;\n      std::this_thread::sleep_for(1000ms);\n      ASSERT_EQ(mutex.read_count(), 0);\n    });\n  }\n\n  // Test multiple uniques\n  std::vector<std::thread> uniques;\n  for (int i = 0; i < 50; ++i) {\n    uniques.emplace_back([&]() {\n      update_read_lock unique(mutex);\n      ASSERT_EQ(mutex.read_count(), 1);\n      ASSERT_EQ(mutex.update_count(), 1);\n      std::this_thread::sleep_for(100ms);\n    });\n  }\n\n  for (auto& th : reads) {\n    th.join();\n  }\n\n  for (auto& th : updates) {\n    th.join();\n  }\n\n  for (auto& th : uniques) {\n    th.join();\n  }\n\n  EXPECT_TRUE(multiple_update);\n  EXPECT_TRUE(multiple_read);\n}\n\nTEST(GroupSharedMutexTest, AdvancedFunctionalityMultiStream) {\n  group_shared_mutex mutex;\n  bool multiple_read = false;\n  bool multiple_update = false;\n\n  // Test multiple reads\n  std::vector<std::thread> 
reads;\n  for (int i = 0; i < 50; ++i) {\n    reads.emplace_back([&]() {\n      cudaStream_t stream;\n      CUDA_CHECK(cudaStreamCreate(&stream));\n\n      read_shared_lock read(mutex);\n      EXPECT_TRUE(mutex.read_count() > 0);\n      if (mutex.read_count() > 1) multiple_read = true;\n      std::this_thread::sleep_for(1000ms);\n      ASSERT_EQ(mutex.update_count(), 0);\n\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      CUDA_CHECK(cudaStreamDestroy(stream));\n    });\n  }\n\n  // Test multiple updates\n  std::vector<std::thread> updates;\n  for (int i = 0; i < 50; ++i) {\n    updates.emplace_back([&]() {\n      cudaStream_t stream;\n      CUDA_CHECK(cudaStreamCreate(&stream));\n\n      update_shared_lock update(mutex);\n      EXPECT_TRUE(mutex.update_count() > 0);\n      if (mutex.update_count() > 1) multiple_update = true;\n      std::this_thread::sleep_for(1000ms);\n      ASSERT_EQ(mutex.read_count(), 0);\n\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      CUDA_CHECK(cudaStreamDestroy(stream));\n    });\n  }\n\n  // Test multiple uniques\n  std::vector<std::thread> uniques;\n  for (int i = 0; i < 50; ++i) {\n    uniques.emplace_back([&]() {\n      cudaStream_t stream;\n      CUDA_CHECK(cudaStreamCreate(&stream));\n\n      update_read_lock unique(mutex);\n      ASSERT_EQ(mutex.read_count(), 1);\n      ASSERT_EQ(mutex.update_count(), 1);\n      std::this_thread::sleep_for(100ms);\n\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      CUDA_CHECK(cudaStreamDestroy(stream));\n    });\n  }\n\n  for (auto& th : reads) {\n    th.join();\n  }\n\n  for (auto& th : updates) {\n    th.join();\n  }\n\n  for (auto& th : uniques) {\n    th.join();\n  }\n\n  EXPECT_TRUE(multiple_update);\n  EXPECT_TRUE(multiple_read);\n}\n"
  },
  {
    "path": "tests/insert_and_evict_test.cc.cu",
    "content": "/*\n * Copyright (c) 2023, NVIDIA CORPORATION.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n#include <gtest/gtest.h>\n#include <stdio.h>\n#include <array>\n#include <map>\n#include <unordered_map>\n#include \"merlin/types.cuh\"\n#include \"merlin_hashtable.cuh\"\n#include \"merlin_localfile.hpp\"\n#include \"test_util.cuh\"\n\nconstexpr size_t dim = 64;\nusing i64 = int64_t;\nusing u64 = uint64_t;\nusing f32 = float;\nusing EvictStrategy = nv::merlin::EvictStrategy;\nusing TableOptions = nv::merlin::HashTableOptions;\n\n/*\n * There are several steps to check whether if\n * the insert_and_evict API is safe to use:\n *\n *   step1: Create a table with max_capacity U\n *   step2: Insert M keys into table while M < U. And\n *     the table size became m <= M. M - m keys was\n *     evicted.\n *   step3: Insert N keys into table while m + N > U, with\n *     no same key with M keys. And p keys gets evicted.\n *     If now the table size is v. 
Then total number of\n *     keys T = v + p + M - m, must equal to VT = M + N,\n *     while the keys, values, and scores match.\n *   step4: export table and check all values.\n */\nvoid test_insert_and_evict_basic() {\n  TableOptions opt;\n\n  // table setting\n  const size_t init_capacity = 1024;\n\n  // numeric setting\n  const size_t U = 2llu << 18;\n  const size_t M = (U >> 1);\n  const size_t N = (U >> 1) + 17;  // Add a prime to test the non-aligned case.\n\n  opt.max_capacity = U;\n  opt.init_capacity = init_capacity;\n  opt.max_hbm_for_vectors = U * dim * sizeof(f32);\n  opt.num_of_buckets_per_alloc = 8;\n\n  using Table =\n      nv::merlin::HashTable<i64, f32, u64, EvictStrategy::kCustomized>;\n  opt.dim = dim;\n\n  std::map<i64, test_util::ValueArray<f32, dim>> summarized_kvs;\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  // step1\n  std::unique_ptr<Table> table = std::make_unique<Table>();\n  table->init(opt);\n\n  // step2\n  test_util::KVMSBuffer<i64, f32, u64> evict_buffer;\n  evict_buffer.Reserve(M, dim, stream);\n  evict_buffer.ToZeros(stream);\n\n  test_util::KVMSBuffer<i64, f32, u64> buffer;\n  buffer.Reserve(M, dim, stream);\n  buffer.ToRange(0, 1, stream);\n  buffer.Setscore((u64)1, stream);\n  size_t n_evicted = table->insert_and_evict(\n      M, buffer.keys_ptr(), buffer.values_ptr(), buffer.scores_ptr(),\n      evict_buffer.keys_ptr(), evict_buffer.values_ptr(),\n      evict_buffer.scores_ptr(), stream);\n  size_t table_size_m = table->size(stream);\n  buffer.SyncData(/*h2d=*/false, stream);\n  evict_buffer.SyncData(/*h2d=*/false, stream);\n  ASSERT_EQ(n_evicted + table_size_m, M);\n  for (size_t i = 0; i < n_evicted; i++) {\n    test_util::ValueArray<f32, dim>* vec =\n        reinterpret_cast<test_util::ValueArray<f32, dim>*>(\n            evict_buffer.values_ptr(false) + i * dim);\n    summarized_kvs.emplace(evict_buffer.keys_ptr(false)[i], *vec);\n  }\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n\n  //  
step3\n  evict_buffer.Reserve(N, dim, stream);\n  buffer.Reserve(N, dim, stream);\n  buffer.ToRange(M, 1, stream);\n  buffer.Setscore((u64)2, stream);\n  n_evicted = table->insert_and_evict(\n      N, buffer.keys_ptr(), buffer.values_ptr(), buffer.scores_ptr(),\n      evict_buffer.keys_ptr(), evict_buffer.values_ptr(),\n      evict_buffer.scores_ptr(), stream);\n  size_t table_size_n = table->size(stream);\n  buffer.SyncData(/*h2d=*/false, stream);\n  evict_buffer.SyncData(/*h2d=*/false, stream);\n  ASSERT_EQ(table_size_m + N, table_size_n + n_evicted);\n  for (size_t i = 0; i < n_evicted; i++) {\n    test_util::ValueArray<f32, dim>* vec =\n        reinterpret_cast<test_util::ValueArray<f32, dim>*>(\n            evict_buffer.values_ptr(false) + i * dim);\n    summarized_kvs.emplace(evict_buffer.keys_ptr(false)[i], *vec);\n  }\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n\n  // step4\n  buffer.Reserve(table_size_n, dim, stream);\n  size_t n_exported =\n      table->export_batch(table->capacity(), 0, buffer.keys_ptr(),\n                          buffer.values_ptr(), buffer.scores_ptr(), stream);\n  ASSERT_EQ(table_size_n, n_exported);\n  buffer.SyncData(/*h2d=*/false, stream);\n  for (size_t i = 0; i < n_exported; i++) {\n    test_util::ValueArray<f32, dim>* vec =\n        reinterpret_cast<test_util::ValueArray<f32, dim>*>(\n            buffer.values_ptr(false) + i * dim);\n    summarized_kvs.emplace(buffer.keys_ptr(false)[i], *vec);\n  }\n\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  buffer.Free(stream);\n  evict_buffer.Free(stream);\n\n  size_t k = 0;\n  for (auto it = summarized_kvs.begin(); it != summarized_kvs.end(); it++) {\n    i64 key = it->first;\n    test_util::ValueArray<f32, dim>& value = it->second;\n    ASSERT_EQ(key, (i64)k);\n    for (size_t j = 0; j < dim; j++) {\n      ASSERT_EQ(value[j], (f32)k);\n    }\n    ++k;\n  }\n  ASSERT_EQ(summarized_kvs.size(), M + N);\n  summarized_kvs.clear();\n}\n\ntemplate <typename K, typename V, typename S, 
typename Table>\nvoid CheckInsertAndEvict(Table* table, K* keys, V* values, S* scores,\n                         K* evicted_keys, V* evicted_values, S* evicted_scores,\n                         size_t len, cudaStream_t stream, TableOptions& opt) {\n  std::map<i64, test_util::ValueArray<f32, dim>> map_before_insert;\n  std::map<i64, test_util::ValueArray<f32, dim>> map_after_insert;\n  K* h_tmp_keys = nullptr;\n  V* h_tmp_values = nullptr;\n  S* h_tmp_scores = nullptr;\n  bool* h_tmp_founds = nullptr;\n\n  K* d_tmp_keys = nullptr;\n  V* d_tmp_values = nullptr;\n  S* d_tmp_scores = nullptr;\n  bool* d_tmp_founds = nullptr;\n\n  size_t table_size_before = table->size(stream);\n  size_t cap = table_size_before + len;\n\n  CUDA_CHECK(cudaMallocAsync(&d_tmp_keys, cap * sizeof(K), stream));\n  CUDA_CHECK(cudaMemsetAsync(d_tmp_keys, 0, cap * sizeof(K), stream));\n  CUDA_CHECK(cudaMallocAsync(&d_tmp_values, cap * dim * sizeof(V), stream));\n  CUDA_CHECK(cudaMemsetAsync(d_tmp_values, 0, cap * dim * sizeof(V), stream));\n  CUDA_CHECK(cudaMallocAsync(&d_tmp_scores, cap * sizeof(S), stream));\n  CUDA_CHECK(cudaMemsetAsync(d_tmp_scores, 0, cap * sizeof(S), stream));\n  CUDA_CHECK(cudaMallocAsync(&d_tmp_founds, cap * sizeof(bool), stream));\n  CUDA_CHECK(cudaMemsetAsync(d_tmp_founds, 0, cap * sizeof(bool), stream));\n  h_tmp_keys = (K*)malloc(cap * sizeof(K));\n  h_tmp_values = (V*)malloc(cap * dim * sizeof(V));\n  h_tmp_scores = (S*)malloc(cap * sizeof(S));\n  h_tmp_founds = (bool*)malloc(cap * sizeof(bool));\n\n  size_t table_size_verify0 = table->export_batch(\n      table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);\n  ASSERT_EQ(table_size_before, table_size_verify0);\n\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,\n                             table_size_before * sizeof(K),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,\n                             table_size_before * dim 
* sizeof(V),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,\n                             table_size_before * sizeof(S),\n                             cudaMemcpyDeviceToHost, stream));\n\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys + table_size_before, keys,\n                             len * sizeof(K), cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values + table_size_before * dim, values,\n                             len * dim * sizeof(V), cudaMemcpyDeviceToHost,\n                             stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores + table_size_before, scores,\n                             len * sizeof(S), cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n\n  for (size_t i = 0; i < cap; i++) {\n    test_util::ValueArray<V, dim>* vec =\n        reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +\n                                                         i * dim);\n    map_before_insert[h_tmp_keys[i]] = *vec;\n  }\n\n  auto start = std::chrono::steady_clock::now();\n  size_t filtered_len = table->insert_and_evict(\n      len, keys, values,\n      Table::evict_strategy == EvictStrategy::kLru ? 
nullptr : scores,\n      evicted_keys, evicted_values, evicted_scores, stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  auto end = std::chrono::steady_clock::now();\n  auto diff = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);\n\n  {\n    table->find(len, keys, values, d_tmp_founds, scores, stream);\n    CUDA_CHECK(cudaMemcpyAsync(h_tmp_founds, d_tmp_founds, len * sizeof(bool),\n                               cudaMemcpyDeviceToHost, stream));\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    size_t found_counter = 0;\n    for (int i = 0; i < len; i++) {\n      if (h_tmp_founds[i]) found_counter++;\n    }\n    std::cout << \"filtered_len:\" << filtered_len\n              << \", miss counter:\" << len - found_counter << std::endl;\n\n    CUDA_CHECK(cudaMemset(d_tmp_founds, 0, len * sizeof(bool)));\n    table->contains(len, keys, d_tmp_founds, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    int contains_counter = 0;\n    CUDA_CHECK(cudaMemcpyAsync(h_tmp_founds, d_tmp_founds, len * sizeof(bool),\n                               cudaMemcpyDeviceToHost, stream));\n    for (int i = 0; i < len; i++) {\n      if (h_tmp_founds[i]) contains_counter++;\n    }\n    ASSERT_EQ(contains_counter, found_counter);\n  }\n\n  float dur = diff.count();\n\n  size_t table_size_after = table->size(stream);\n  size_t table_size_verify1 = table->export_batch(\n      table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);\n\n  ASSERT_EQ(table_size_verify1, table_size_after);\n\n  size_t new_cap = table_size_after + filtered_len;\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,\n                             table_size_after * sizeof(K),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,\n                             table_size_after * dim * sizeof(V),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, 
d_tmp_scores,\n                             table_size_after * sizeof(S),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys + table_size_after, evicted_keys,\n                             filtered_len * sizeof(K), cudaMemcpyDeviceToHost,\n                             stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values + table_size_after * dim,\n                             evicted_values, filtered_len * dim * sizeof(V),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores + table_size_after, evicted_scores,\n                             filtered_len * sizeof(S), cudaMemcpyDeviceToHost,\n                             stream));\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  int64_t new_cap_i64 = (int64_t)new_cap;\n  for (int64_t i = new_cap_i64 - 1; i >= 0; i--) {\n    test_util::ValueArray<V, dim>* vec =\n        reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +\n                                                         i * dim);\n    map_after_insert[h_tmp_keys[i]] = *vec;\n  }\n\n  size_t key_miss_cnt = 0;\n  size_t value_diff_cnt = 0;\n  for (auto& it : map_before_insert) {\n    if (map_after_insert.find(it.first) == map_after_insert.end()) {\n      ++key_miss_cnt;\n      continue;\n    }\n    test_util::ValueArray<V, dim>& vec0 = it.second;\n    test_util::ValueArray<V, dim>& vec1 = map_after_insert.at(it.first);\n    for (size_t j = 0; j < dim; j++) {\n      if (vec0[j] != vec1[j]) {\n        ++value_diff_cnt;\n        break;\n      }\n    }\n  }\n  std::cout << \"Check insert_and_evict behavior got \"\n            << \"key_miss_cnt: \" << key_miss_cnt\n            << \", value_diff_cnt: \" << value_diff_cnt\n            << \", while table_size_before: \" << table_size_before\n            << \", while table_size_after: \" << table_size_after\n            << \", while len: \" << len << \", dur: \" << dur << std::endl;\n\n  
ASSERT_EQ(key_miss_cnt, 0);\n  ASSERT_EQ(value_diff_cnt, 0);\n\n  CUDA_CHECK(cudaFreeAsync(d_tmp_keys, stream));\n  CUDA_CHECK(cudaFreeAsync(d_tmp_values, stream));\n  CUDA_CHECK(cudaFreeAsync(d_tmp_scores, stream));\n  CUDA_CHECK(cudaFreeAsync(d_tmp_founds, stream));\n  free(h_tmp_keys);\n  free(h_tmp_values);\n  free(h_tmp_scores);\n  free(h_tmp_founds);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n}\n\nvoid test_insert_and_evict_advanced_on_lru() {\n  const size_t U = 524288;\n  const size_t init_capacity = U;\n  const size_t B = 524288 + 13;\n\n  TableOptions opt;\n\n  opt.max_capacity = U;\n  opt.init_capacity = init_capacity;\n  opt.max_hbm_for_vectors = U * dim * sizeof(f32);\n  opt.max_bucket_size = 128;\n  opt.num_of_buckets_per_alloc = 32;\n  using Table = nv::merlin::HashTable<i64, f32, u64, EvictStrategy::kLru>;\n  opt.dim = dim;\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  std::unique_ptr<Table> table = std::make_unique<Table>();\n  table->init(opt);\n\n  test_util::KVMSBuffer<i64, f32, u64> evict_buffer;\n  evict_buffer.Reserve(B, dim, stream);\n  evict_buffer.ToZeros(stream);\n\n  test_util::KVMSBuffer<i64, f32, u64> data_buffer;\n  data_buffer.Reserve(B, dim, stream);\n\n  size_t offset = 0;\n  u64 score = 0;\n  for (int i = 0; i < 16; i++) {\n    test_util::create_random_keys<i64, u64, f32, dim>(\n        data_buffer.keys_ptr(false), data_buffer.scores_ptr(false),\n        data_buffer.values_ptr(false), (int)B, B * 16);\n    data_buffer.SyncData(true, stream);\n\n    CheckInsertAndEvict<i64, f32, u64, Table>(\n        table.get(), data_buffer.keys_ptr(), data_buffer.values_ptr(),\n        data_buffer.scores_ptr(), evict_buffer.keys_ptr(),\n        evict_buffer.values_ptr(), evict_buffer.scores_ptr(), B, stream, opt);\n\n    offset += B;\n    score += 1;\n  }\n}\n\ntemplate <typename K, typename V, typename S, typename Table>\nvoid CheckInsertAndEvictOnLfu(Table* table,\n                              
test_util::KVMSBuffer<K, V, S>* data_buffer,\n                              test_util::KVMSBuffer<K, V, S>* evict_buffer,\n                              size_t len, cudaStream_t stream,\n                              TableOptions& opt, unsigned int global_epoch) {\n  std::map<K, test_util::ValueArray<V, dim>> values_map_before_insert;\n  std::map<K, test_util::ValueArray<V, dim>> values_map_after_insert;\n\n  std::unordered_map<K, S> scores_map_before_insert;\n  std::map<K, S> scores_map_after_insert;\n\n  std::map<K, S> scores_map_current_batch;\n  std::map<K, S> scores_map_current_evict;\n\n  K* keys = data_buffer->keys_ptr();\n  V* values = data_buffer->values_ptr();\n  S* scores = data_buffer->scores_ptr();\n\n  K* evicted_keys = evict_buffer->keys_ptr();\n  V* evicted_values = evict_buffer->values_ptr();\n  S* evicted_scores = evict_buffer->scores_ptr();\n\n  for (size_t i = 0; i < len; i++) {\n    scores_map_current_batch[data_buffer->keys_ptr(false)[i]] =\n        data_buffer->scores_ptr(false)[i];\n  }\n\n  K* h_tmp_keys = nullptr;\n  V* h_tmp_values = nullptr;\n  S* h_tmp_scores = nullptr;\n  bool* h_tmp_founds = nullptr;\n\n  K* d_tmp_keys = nullptr;\n  V* d_tmp_values = nullptr;\n  S* d_tmp_scores = nullptr;\n  bool* d_tmp_founds = nullptr;\n\n  size_t table_size_before = table->size(stream);\n  size_t cap = table_size_before + len;\n\n  CUDA_CHECK(cudaMallocAsync(&d_tmp_keys, cap * sizeof(K), stream));\n  CUDA_CHECK(cudaMemsetAsync(d_tmp_keys, 0, cap * sizeof(K), stream));\n  CUDA_CHECK(cudaMallocAsync(&d_tmp_values, cap * dim * sizeof(V), stream));\n  CUDA_CHECK(cudaMemsetAsync(d_tmp_values, 0, cap * dim * sizeof(V), stream));\n  CUDA_CHECK(cudaMallocAsync(&d_tmp_scores, cap * sizeof(S), stream));\n  CUDA_CHECK(cudaMemsetAsync(d_tmp_scores, 0, cap * sizeof(S), stream));\n  CUDA_CHECK(cudaMallocAsync(&d_tmp_founds, cap * sizeof(bool), stream));\n  CUDA_CHECK(cudaMemsetAsync(d_tmp_founds, 0, cap * sizeof(bool), stream));\n  h_tmp_keys = (K*)malloc(cap * 
sizeof(K));\n  h_tmp_values = (V*)malloc(cap * dim * sizeof(V));\n  h_tmp_scores = (S*)malloc(cap * sizeof(S));\n  h_tmp_founds = (bool*)malloc(cap * sizeof(bool));\n\n  size_t table_size_verify0 = table->export_batch(\n      table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);\n  ASSERT_EQ(table_size_before, table_size_verify0);\n\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,\n                             table_size_before * sizeof(K),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,\n                             table_size_before * dim * sizeof(V),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,\n                             table_size_before * sizeof(S),\n                             cudaMemcpyDeviceToHost, stream));\n\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys + table_size_before, keys,\n                             len * sizeof(K), cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values + table_size_before * dim, values,\n                             len * dim * sizeof(V), cudaMemcpyDeviceToHost,\n                             stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores + table_size_before, scores,\n                             len * sizeof(S), cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n\n  for (size_t i = 0; i < cap; i++) {\n    test_util::ValueArray<V, dim>* vec =\n        reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +\n                                                         i * dim);\n    values_map_before_insert[h_tmp_keys[i]] = *vec;\n  }\n\n  for (size_t i = 0; i < table_size_before; i++) {\n    scores_map_before_insert[h_tmp_keys[i]] = h_tmp_scores[i];\n  }\n\n  auto start = std::chrono::steady_clock::now();\n  table->set_global_epoch(global_epoch);\n  size_t filtered_len = table->insert_and_evict(\n   
   len, keys, values,\n      (Table::evict_strategy == EvictStrategy::kLru ||\n       Table::evict_strategy == EvictStrategy::kEpochLru)\n          ? nullptr\n          : scores,\n      evicted_keys, evicted_values, evicted_scores, stream);\n  evict_buffer->SyncData(false, stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  auto end = std::chrono::steady_clock::now();\n  auto diff = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);\n\n  {\n    table->find(len, keys, values, d_tmp_founds, scores, stream);\n    CUDA_CHECK(cudaMemcpyAsync(h_tmp_founds, d_tmp_founds, len * sizeof(bool),\n                               cudaMemcpyDeviceToHost, stream));\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    size_t found_counter = 0;\n    for (int i = 0; i < len; i++) {\n      if (h_tmp_founds[i]) found_counter++;\n    }\n\n    CUDA_CHECK(cudaMemset(d_tmp_founds, 0, len * sizeof(bool)));\n    table->contains(len, keys, d_tmp_founds, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    int contains_counter = 0;\n    CUDA_CHECK(cudaMemcpyAsync(h_tmp_founds, d_tmp_founds, len * sizeof(bool),\n                               cudaMemcpyDeviceToHost, stream));\n    for (int i = 0; i < len; i++) {\n      if (h_tmp_founds[i]) contains_counter++;\n    }\n    ASSERT_EQ(contains_counter, found_counter);\n  }\n\n  for (size_t i = 0; i < filtered_len; i++) {\n    scores_map_current_evict[evict_buffer->keys_ptr(false)[i]] =\n        evict_buffer->scores_ptr(false)[i];\n  }\n\n  float dur = diff.count();\n\n  size_t table_size_after = table->size(stream);\n  size_t table_size_verify1 = table->export_batch(\n      table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);\n\n  ASSERT_EQ(table_size_verify1, table_size_after);\n\n  size_t new_cap = table_size_after + filtered_len;\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,\n                             table_size_after * sizeof(K),\n                             cudaMemcpyDeviceToHost, 
stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,\n                             table_size_after * dim * sizeof(V),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,\n                             table_size_after * sizeof(S),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys + table_size_after, evicted_keys,\n                             filtered_len * sizeof(K), cudaMemcpyDeviceToHost,\n                             stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values + table_size_after * dim,\n                             evicted_values, filtered_len * dim * sizeof(V),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores + table_size_after, evicted_scores,\n                             filtered_len * sizeof(S), cudaMemcpyDeviceToHost,\n                             stream));\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  int64_t new_cap_i64 = (int64_t)new_cap;\n\n  size_t key_miss_cnt = 0;\n  size_t value_diff_cnt = 0;\n  size_t score_error_cnt = 0;\n\n  for (int64_t i = new_cap_i64 - 1; i >= 0; i--) {\n    test_util::ValueArray<V, dim>* vec =\n        reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +\n                                                         i * dim);\n    values_map_after_insert[h_tmp_keys[i]] = *vec;\n    scores_map_after_insert[h_tmp_keys[i]] = h_tmp_scores[i];\n  }\n\n  for (auto it : scores_map_current_batch) {\n    const K key = it.first;\n    const K score = it.second;\n    S current_score = scores_map_after_insert[key];\n    S score_before_insert = 0;\n    if (scores_map_before_insert.find(key) != scores_map_before_insert.end() &&\n        scores_map_current_evict.find(key) == scores_map_current_evict.end()) {\n      score_before_insert = scores_map_before_insert[key];\n    } else {\n      continue;\n    }\n    bool valid = 
(current_score == score + score_before_insert);\n    if (!valid) {\n      score_error_cnt++;\n    }\n  }\n\n  ASSERT_EQ(values_map_before_insert.size(), values_map_after_insert.size());\n\n  for (auto& it : values_map_before_insert) {\n    if (values_map_after_insert.find(it.first) ==\n        values_map_after_insert.end()) {\n      ++key_miss_cnt;\n      continue;\n    }\n    test_util::ValueArray<V, dim>& vec0 = it.second;\n    test_util::ValueArray<V, dim>& vec1 = values_map_after_insert.at(it.first);\n    for (size_t j = 0; j < dim; j++) {\n      if (vec0[j] != vec1[j]) {\n        ++value_diff_cnt;\n      }\n    }\n  }\n  std::cout << \"Check insert_and_evict behavior got \"\n            << \"key_miss_cnt: \" << key_miss_cnt\n            << \", value_diff_cnt: \" << value_diff_cnt\n            << \", score_error_cnt: \" << score_error_cnt\n            << \", while table_size_before: \" << table_size_before\n            << \", while table_size_after: \" << table_size_after\n            << \", while len: \" << len << \", dur: \" << dur << std::endl;\n\n  ASSERT_EQ(key_miss_cnt, 0);\n  ASSERT_EQ(score_error_cnt, 0);\n  ASSERT_EQ(value_diff_cnt, 0);\n\n  CUDA_CHECK(cudaFreeAsync(d_tmp_keys, stream));\n  CUDA_CHECK(cudaFreeAsync(d_tmp_values, stream));\n  CUDA_CHECK(cudaFreeAsync(d_tmp_scores, stream));\n  CUDA_CHECK(cudaFreeAsync(d_tmp_founds, stream));\n  free(h_tmp_keys);\n  free(h_tmp_values);\n  free(h_tmp_scores);\n  free(h_tmp_founds);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n}\n\nvoid test_insert_and_evict_advanced_on_lfu() {\n  const size_t U = 1024 * 1024;\n  const size_t init_capacity = U;\n  const size_t B = 256 * 1024;\n\n  TableOptions opt;\n\n  opt.max_capacity = U;\n  opt.init_capacity = init_capacity;\n  opt.max_hbm_for_vectors = U * dim * sizeof(f32);\n  opt.max_bucket_size = 128;\n  opt.num_of_buckets_per_alloc = 32;\n  using Table = nv::merlin::HashTable<i64, f32, u64, EvictStrategy::kLfu>;\n  opt.dim = dim;\n\n  cudaStream_t stream;\n  
CUDA_CHECK(cudaStreamCreate(&stream));\n\n  std::unique_ptr<Table> table = std::make_unique<Table>();\n  table->init(opt);\n\n  test_util::KVMSBuffer<i64, f32, u64> evict_buffer;\n  evict_buffer.Reserve(B, dim, stream);\n  evict_buffer.ToZeros(stream);\n\n  test_util::KVMSBuffer<i64, f32, u64> data_buffer;\n  data_buffer.Reserve(B, dim, stream);\n\n  size_t offset = 0;\n  for (unsigned int global_epoch = 1; global_epoch <= 32; global_epoch++) {\n    test_util::create_random_keys_advanced<i64, u64, f32>(\n        dim, data_buffer.keys_ptr(false), data_buffer.scores_ptr(false),\n        data_buffer.values_ptr(false), (int)B, B * 16, 100);\n    data_buffer.SyncData(true, stream);\n\n    CheckInsertAndEvictOnLfu<i64, f32, u64, Table>(\n        table.get(), &data_buffer, &evict_buffer, B, stream, opt, global_epoch);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    offset += B;\n  }\n}\n\ntemplate <typename K, typename V, typename S, typename Table>\nvoid CheckInsertAndEvictOnEpochLru(Table* table,\n                                   test_util::KVMSBuffer<K, V, S>* data_buffer,\n                                   test_util::KVMSBuffer<K, V, S>* evict_buffer,\n                                   size_t len, cudaStream_t stream,\n                                   TableOptions& opt,\n                                   unsigned int global_epoch) {\n  std::map<K, test_util::ValueArray<V, dim>> values_map_before_insert;\n  std::map<K, test_util::ValueArray<V, dim>> values_map_after_insert;\n\n  std::map<K, S> scores_map_before_insert;\n  std::map<K, S> scores_map_after_insert;\n\n  std::map<K, S> scores_map_current_batch;\n\n  K* keys = data_buffer->keys_ptr();\n  V* values = data_buffer->values_ptr();\n  S* scores = data_buffer->scores_ptr();\n\n  K* evicted_keys = evict_buffer->keys_ptr();\n  V* evicted_values = evict_buffer->values_ptr();\n  S* evicted_scores = evict_buffer->scores_ptr();\n\n  for (size_t i = 0; i < len; i++) {\n    
scores_map_current_batch[data_buffer->keys_ptr(false)[i]] =\n        data_buffer->scores_ptr(false)[i];\n  }\n\n  K* h_tmp_keys = nullptr;\n  V* h_tmp_values = nullptr;\n  S* h_tmp_scores = nullptr;\n  bool* h_tmp_founds = nullptr;\n\n  K* d_tmp_keys = nullptr;\n  V* d_tmp_values = nullptr;\n  S* d_tmp_scores = nullptr;\n  bool* d_tmp_founds = nullptr;\n\n  size_t table_size_before = table->size(stream);\n  size_t cap = table_size_before + len;\n\n  CUDA_CHECK(cudaMallocAsync(&d_tmp_keys, cap * sizeof(K), stream));\n  CUDA_CHECK(cudaMemsetAsync(d_tmp_keys, 0, cap * sizeof(K), stream));\n  CUDA_CHECK(cudaMallocAsync(&d_tmp_values, cap * dim * sizeof(V), stream));\n  CUDA_CHECK(cudaMemsetAsync(d_tmp_values, 0, cap * dim * sizeof(V), stream));\n  CUDA_CHECK(cudaMallocAsync(&d_tmp_scores, cap * sizeof(S), stream));\n  CUDA_CHECK(cudaMemsetAsync(d_tmp_scores, 0, cap * sizeof(S), stream));\n  CUDA_CHECK(cudaMallocAsync(&d_tmp_founds, cap * sizeof(bool), stream));\n  CUDA_CHECK(cudaMemsetAsync(d_tmp_founds, 0, cap * sizeof(bool), stream));\n  h_tmp_keys = (K*)malloc(cap * sizeof(K));\n  h_tmp_values = (V*)malloc(cap * dim * sizeof(V));\n  h_tmp_scores = (S*)malloc(cap * sizeof(S));\n  h_tmp_founds = (bool*)malloc(cap * sizeof(bool));\n\n  size_t table_size_verify0 = table->export_batch(\n      table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);\n  ASSERT_EQ(table_size_before, table_size_verify0);\n\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,\n                             table_size_before * sizeof(K),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,\n                             table_size_before * dim * sizeof(V),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,\n                             table_size_before * sizeof(S),\n                             cudaMemcpyDeviceToHost, stream));\n\n  
CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys + table_size_before, keys,\n                             len * sizeof(K), cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values + table_size_before * dim, values,\n                             len * dim * sizeof(V), cudaMemcpyDeviceToHost,\n                             stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores + table_size_before, scores,\n                             len * sizeof(S), cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n\n  for (size_t i = 0; i < cap; i++) {\n    test_util::ValueArray<V, dim>* vec =\n        reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +\n                                                         i * dim);\n    values_map_before_insert[h_tmp_keys[i]] = *vec;\n  }\n\n  for (size_t i = 0; i < table_size_before; i++) {\n    scores_map_before_insert[h_tmp_keys[i]] = h_tmp_scores[i];\n  }\n\n  S nano_before_insert = test_util::host_nano<S>();\n\n  auto start = std::chrono::steady_clock::now();\n  table->set_global_epoch(global_epoch);\n  size_t filtered_len = table->insert_and_evict(\n      len, keys, values,\n      (Table::evict_strategy == EvictStrategy::kLru ||\n       Table::evict_strategy == EvictStrategy::kEpochLru)\n          ? 
nullptr\n          : scores,\n      evicted_keys, evicted_values, evicted_scores, stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  auto end = std::chrono::steady_clock::now();\n  auto diff = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);\n\n  S nano_after_insert = test_util::host_nano<S>();\n\n  {\n    table->find(len, keys, values, d_tmp_founds, scores, stream);\n    CUDA_CHECK(cudaMemcpyAsync(h_tmp_founds, d_tmp_founds, len * sizeof(bool),\n                               cudaMemcpyDeviceToHost, stream));\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    size_t found_counter = 0;\n    for (int i = 0; i < len; i++) {\n      if (h_tmp_founds[i]) found_counter++;\n    }\n    std::cout << \"filtered_len:\" << filtered_len\n              << \", miss counter:\" << len - found_counter << std::endl;\n    ASSERT_EQ(len, found_counter);\n\n    CUDA_CHECK(cudaMemset(d_tmp_founds, 0, len * sizeof(bool)));\n    table->contains(len, keys, d_tmp_founds, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    int contains_counter = 0;\n    CUDA_CHECK(cudaMemcpyAsync(h_tmp_founds, d_tmp_founds, len * sizeof(bool),\n                               cudaMemcpyDeviceToHost, stream));\n    for (int i = 0; i < len; i++) {\n      if (h_tmp_founds[i]) contains_counter++;\n    }\n    ASSERT_EQ(contains_counter, found_counter);\n  }\n\n  float dur = diff.count();\n\n  size_t table_size_after = table->size(stream);\n  size_t table_size_verify1 = table->export_batch(\n      table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);\n\n  ASSERT_EQ(table_size_verify1, table_size_after);\n\n  size_t new_cap = table_size_after + filtered_len;\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,\n                             table_size_after * sizeof(K),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,\n                             table_size_after * dim * sizeof(V),\n         
                    cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,\n                             table_size_after * sizeof(S),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys + table_size_after, evicted_keys,\n                             filtered_len * sizeof(K), cudaMemcpyDeviceToHost,\n                             stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values + table_size_after * dim,\n                             evicted_values, filtered_len * dim * sizeof(V),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores + table_size_after, evicted_scores,\n                             filtered_len * sizeof(S), cudaMemcpyDeviceToHost,\n                             stream));\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  int64_t new_cap_i64 = (int64_t)new_cap;\n\n  size_t key_miss_cnt = 0;\n  size_t value_diff_cnt = 0;\n  size_t score_error_cnt1 = 0;\n  size_t score_error_cnt2 = 0;\n\n  for (int64_t i = new_cap_i64 - 1; i >= 0; i--) {\n    test_util::ValueArray<V, dim>* vec =\n        reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +\n                                                         i * dim);\n    values_map_after_insert[h_tmp_keys[i]] = *vec;\n    scores_map_after_insert[h_tmp_keys[i]] = h_tmp_scores[i];\n    if (i >= (new_cap_i64 - filtered_len)) {\n      bool valid = ((h_tmp_scores[i] >> 32) < (global_epoch - 2));\n      if (!valid) {\n        score_error_cnt1++;\n      }\n    }\n  }\n\n  for (auto& it : scores_map_current_batch) {\n    S score = scores_map_after_insert[it.first];\n    bool valid =\n        ((score >> 32) == global_epoch) &&\n        ((score & 0xFFFFFFFF) >= (0xFFFFFFFF & (nano_before_insert >> 20))) &&\n        ((score & 0xFFFFFFFF) <= (0xFFFFFFFF & (nano_after_insert >> 20)));\n    if (!valid) {\n      score_error_cnt2++;\n    }\n  }\n  for (auto& it : 
values_map_before_insert) {\n    if (values_map_after_insert.find(it.first) ==\n        values_map_after_insert.end()) {\n      ++key_miss_cnt;\n      continue;\n    }\n    test_util::ValueArray<V, dim>& vec0 = it.second;\n    test_util::ValueArray<V, dim>& vec1 = values_map_after_insert.at(it.first);\n    for (size_t j = 0; j < dim; j++) {\n      if (vec0[j] != vec1[j]) {\n        ++value_diff_cnt;\n        break;\n      }\n    }\n  }\n  std::cout << \"Check insert_and_evict behavior got \"\n            << \"key_miss_cnt: \" << key_miss_cnt\n            << \", value_diff_cnt: \" << value_diff_cnt\n            << \", score_error_cnt1: \" << score_error_cnt1\n            << \", score_error_cnt2: \" << score_error_cnt2\n            << \", while table_size_before: \" << table_size_before\n            << \", while table_size_after: \" << table_size_after\n            << \", while len: \" << len << \", dur: \" << dur << std::endl;\n\n  ASSERT_EQ(key_miss_cnt, 0);\n  ASSERT_EQ(value_diff_cnt, 0);\n  ASSERT_EQ(score_error_cnt1, 0);\n  ASSERT_EQ(score_error_cnt2, 0);\n\n  CUDA_CHECK(cudaFreeAsync(d_tmp_keys, stream));\n  CUDA_CHECK(cudaFreeAsync(d_tmp_values, stream));\n  CUDA_CHECK(cudaFreeAsync(d_tmp_scores, stream));\n  CUDA_CHECK(cudaFreeAsync(d_tmp_founds, stream));\n  free(h_tmp_keys);\n  free(h_tmp_values);\n  free(h_tmp_scores);\n  free(h_tmp_founds);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n}\n\nvoid test_insert_and_evict_advanced_on_epochlru() {\n  const size_t U = 1024 * 1024;\n  const size_t init_capacity = U;\n  const size_t B = 128 * 1024;\n\n  TableOptions opt;\n\n  opt.max_capacity = U;\n  opt.init_capacity = init_capacity;\n  opt.max_hbm_for_vectors = U * dim * sizeof(f32);\n  opt.max_bucket_size = 128;\n  opt.num_of_buckets_per_alloc = 32;\n  using Table = nv::merlin::HashTable<i64, f32, u64, EvictStrategy::kEpochLru>;\n  opt.dim = dim;\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  std::unique_ptr<Table> table = 
std::make_unique<Table>();\n  table->init(opt);\n\n  test_util::KVMSBuffer<i64, f32, u64> evict_buffer;\n  evict_buffer.Reserve(B, dim, stream);\n  evict_buffer.ToZeros(stream);\n\n  test_util::KVMSBuffer<i64, f32, u64> data_buffer;\n  data_buffer.Reserve(B, dim, stream);\n\n  size_t offset = 0;\n  u64 score = 0;\n  for (unsigned int global_epoch = 1; global_epoch <= 64; global_epoch++) {\n    test_util::create_random_keys_advanced<i64, u64, f32>(\n        dim, data_buffer.keys_ptr(false), data_buffer.scores_ptr(false),\n        data_buffer.values_ptr(false), (int)B, B * 16);\n    data_buffer.SyncData(true, stream);\n\n    CheckInsertAndEvictOnEpochLru<i64, f32, u64, Table>(\n        table.get(), &data_buffer, &evict_buffer, B, stream, opt, global_epoch);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    offset += B;\n    score += 1;\n  }\n}\n\ntemplate <typename K, typename V, typename S, typename Table>\nvoid CheckInsertAndEvictOnEpochLfu(\n    Table* table, test_util::KVMSBuffer<K, V, S>* data_buffer,\n    test_util::KVMSBuffer<K, V, S>* evict_buffer,\n    test_util::KVMSBuffer<K, V, S>* pre_data_buffer, size_t len,\n    cudaStream_t stream, TableOptions& opt, unsigned int global_epoch) {\n  std::map<K, test_util::ValueArray<V, dim>> values_map_before_insert;\n  std::map<K, test_util::ValueArray<V, dim>> values_map_after_insert;\n\n  std::unordered_map<K, S> scores_map_before_insert;\n  std::map<K, S> scores_map_after_insert;\n\n  std::map<K, S> scores_map_current_batch;\n  std::map<K, S> scores_map_current_evict;\n\n  K* keys = data_buffer->keys_ptr();\n  V* values = data_buffer->values_ptr();\n  S* scores = data_buffer->scores_ptr();\n\n  K* evicted_keys = evict_buffer->keys_ptr();\n  V* evicted_values = evict_buffer->values_ptr();\n  S* evicted_scores = evict_buffer->scores_ptr();\n\n  for (size_t i = 0; i < len; i++) {\n    scores_map_current_batch[data_buffer->keys_ptr(false)[i]] =\n        data_buffer->scores_ptr(false)[i];\n  }\n\n  K* h_tmp_keys = 
nullptr;\n  V* h_tmp_values = nullptr;\n  S* h_tmp_scores = nullptr;\n  bool* h_tmp_founds = nullptr;\n\n  K* d_tmp_keys = nullptr;\n  V* d_tmp_values = nullptr;\n  S* d_tmp_scores = nullptr;\n  bool* d_tmp_founds = nullptr;\n\n  size_t table_size_before = table->size(stream);\n  size_t cap = table_size_before + len;\n\n  CUDA_CHECK(cudaMallocAsync(&d_tmp_keys, cap * sizeof(K), stream));\n  CUDA_CHECK(cudaMemsetAsync(d_tmp_keys, 0, cap * sizeof(K), stream));\n  CUDA_CHECK(cudaMallocAsync(&d_tmp_values, cap * dim * sizeof(V), stream));\n  CUDA_CHECK(cudaMemsetAsync(d_tmp_values, 0, cap * dim * sizeof(V), stream));\n  CUDA_CHECK(cudaMallocAsync(&d_tmp_scores, cap * sizeof(S), stream));\n  CUDA_CHECK(cudaMemsetAsync(d_tmp_scores, 0, cap * sizeof(S), stream));\n  CUDA_CHECK(cudaMallocAsync(&d_tmp_founds, cap * sizeof(bool), stream));\n  CUDA_CHECK(cudaMemsetAsync(d_tmp_founds, 0, cap * sizeof(bool), stream));\n  h_tmp_keys = (K*)malloc(cap * sizeof(K));\n  h_tmp_values = (V*)malloc(cap * dim * sizeof(V));\n  h_tmp_scores = (S*)malloc(cap * sizeof(S));\n  h_tmp_founds = (bool*)malloc(cap * sizeof(bool));\n\n  size_t table_size_verify0 = table->export_batch(\n      table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);\n  ASSERT_EQ(table_size_before, table_size_verify0);\n\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,\n                             table_size_before * sizeof(K),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,\n                             table_size_before * dim * sizeof(V),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,\n                             table_size_before * sizeof(S),\n                             cudaMemcpyDeviceToHost, stream));\n\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys + table_size_before, keys,\n                             len * sizeof(K), cudaMemcpyDeviceToHost, 
stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values + table_size_before * dim, values,\n                             len * dim * sizeof(V), cudaMemcpyDeviceToHost,\n                             stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores + table_size_before, scores,\n                             len * sizeof(S), cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n\n  for (size_t i = 0; i < cap; i++) {\n    test_util::ValueArray<V, dim>* vec =\n        reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +\n                                                         i * dim);\n    values_map_before_insert[h_tmp_keys[i]] = *vec;\n  }\n\n  for (size_t i = 0; i < table_size_before; i++) {\n    scores_map_before_insert[h_tmp_keys[i]] = h_tmp_scores[i];\n  }\n\n  auto start = std::chrono::steady_clock::now();\n  table->set_global_epoch(global_epoch);\n  size_t filtered_len = table->insert_and_evict(\n      len, keys, values,\n      (Table::evict_strategy == EvictStrategy::kLru ||\n       Table::evict_strategy == EvictStrategy::kEpochLru)\n          ? 
nullptr\n          : scores,\n      evicted_keys, evicted_values, evicted_scores, stream);\n  evict_buffer->SyncData(false, stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  auto end = std::chrono::steady_clock::now();\n  auto diff = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);\n\n  {\n    table->find(len, pre_data_buffer->keys_ptr(), values, d_tmp_founds,\n                pre_data_buffer->scores_ptr(), stream);\n    CUDA_CHECK(cudaMemcpyAsync(h_tmp_founds, d_tmp_founds, len * sizeof(bool),\n                               cudaMemcpyDeviceToHost, stream));\n    pre_data_buffer->SyncData(false);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    size_t found_counter = 0;\n    size_t old_epoch_counter = 0;\n    size_t new_epoch_counter = 0;\n    for (int i = 0; i < len; i++) {\n      if (h_tmp_founds[i]) found_counter++;\n      S score = pre_data_buffer->scores_ptr(false)[i];\n      S cur_epoch = score >> 32;\n      if (global_epoch == cur_epoch) new_epoch_counter++;\n      if (global_epoch - 1 == cur_epoch) old_epoch_counter++;\n    }\n    ASSERT_EQ(len, new_epoch_counter + old_epoch_counter);\n    std::cout << \"old_epoch_counter:\" << old_epoch_counter\n              << \", new_epoch_counter:\" << new_epoch_counter << std::endl\n              << \", pre_data filtered_len:\" << filtered_len\n              << \", pre_data miss counter:\" << len - found_counter << std::endl;\n    ASSERT_EQ(len, found_counter);\n\n    CUDA_CHECK(cudaMemset(d_tmp_founds, 0, len * sizeof(bool)));\n    table->contains(len, pre_data_buffer->keys_ptr(), d_tmp_founds, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    int contains_counter = 0;\n    CUDA_CHECK(cudaMemcpyAsync(h_tmp_founds, d_tmp_founds, len * sizeof(bool),\n                               cudaMemcpyDeviceToHost, stream));\n    for (int i = 0; i < len; i++) {\n      if (h_tmp_founds[i]) contains_counter++;\n    }\n    ASSERT_EQ(contains_counter, found_counter);\n  }\n\n  {\n    
table->find(len, keys, values, d_tmp_founds, scores, stream);\n    CUDA_CHECK(cudaMemcpyAsync(h_tmp_founds, d_tmp_founds, len * sizeof(bool),\n                               cudaMemcpyDeviceToHost, stream));\n    data_buffer->SyncData(false);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    size_t found_counter = 0;\n    size_t new_epoch_counter = 0;\n    for (int i = 0; i < len; i++) {\n      S score = data_buffer->scores_ptr(false)[i];\n      S cur_epoch = score >> 32;\n      if (h_tmp_founds[i]) found_counter++;\n      if (global_epoch == cur_epoch) new_epoch_counter++;\n    }\n    ASSERT_EQ(len, new_epoch_counter);\n    std::cout << \"filtered_len:\" << filtered_len\n              << \", miss counter:\" << len - found_counter << std::endl;\n    ASSERT_EQ(len, found_counter);\n\n    CUDA_CHECK(cudaMemset(d_tmp_founds, 0, len * sizeof(bool)));\n    table->contains(len, keys, d_tmp_founds, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    int contains_counter = 0;\n    CUDA_CHECK(cudaMemcpyAsync(h_tmp_founds, d_tmp_founds, len * sizeof(bool),\n                               cudaMemcpyDeviceToHost, stream));\n    for (int i = 0; i < len; i++) {\n      if (h_tmp_founds[i]) contains_counter++;\n    }\n    ASSERT_EQ(contains_counter, found_counter);\n  }\n\n  {\n    std::unordered_set<K> unique_keys;\n    for (int i = 0; i < len; i++) {\n      unique_keys.insert(data_buffer->keys_ptr(false)[i]);\n      unique_keys.insert(pre_data_buffer->keys_ptr(false)[i]);\n    }\n    float repeat_rate = (len * 2.0 - unique_keys.size()) / (len * 1.0);\n    std::cout << \"repeat_rate:\" << repeat_rate << std::endl;\n  }\n\n  for (size_t i = 0; i < filtered_len; i++) {\n    scores_map_current_evict[evict_buffer->keys_ptr(false)[i]] =\n        evict_buffer->scores_ptr(false)[i];\n  }\n\n  float dur = diff.count();\n\n  size_t table_size_after = table->size(stream);\n  size_t table_size_verify1 = table->export_batch(\n      table->capacity(), 0, d_tmp_keys, d_tmp_values, 
d_tmp_scores, stream);\n\n  ASSERT_EQ(table_size_verify1, table_size_after);\n\n  size_t new_cap = table_size_after + filtered_len;\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,\n                             table_size_after * sizeof(K),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,\n                             table_size_after * dim * sizeof(V),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,\n                             table_size_after * sizeof(S),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys + table_size_after, evicted_keys,\n                             filtered_len * sizeof(K), cudaMemcpyDeviceToHost,\n                             stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values + table_size_after * dim,\n                             evicted_values, filtered_len * dim * sizeof(V),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores + table_size_after, evicted_scores,\n                             filtered_len * sizeof(S), cudaMemcpyDeviceToHost,\n                             stream));\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  int64_t new_cap_i64 = (int64_t)new_cap;\n\n  size_t key_miss_cnt = 0;\n  size_t value_diff_cnt = 0;\n  size_t score_error_cnt1 = 0;\n  size_t score_error_cnt2 = 0;\n\n  for (int64_t i = new_cap_i64 - 1; i >= 0; i--) {\n    test_util::ValueArray<V, dim>* vec =\n        reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +\n                                                         i * dim);\n    values_map_after_insert[h_tmp_keys[i]] = *vec;\n    scores_map_after_insert[h_tmp_keys[i]] = h_tmp_scores[i];\n    if (i >= (new_cap_i64 - filtered_len)) {\n      bool valid = ((h_tmp_scores[i] >> 32) < (global_epoch - 2));\n      if (!valid) {\n        
score_error_cnt1++;\n      }\n    }\n  }\n\n  for (auto it : scores_map_current_batch) {\n    const K key = it.first;\n    const K score = it.second;\n    S current_score = scores_map_after_insert[key];\n    S score_before_insert = 0;\n    if (scores_map_before_insert.find(key) != scores_map_before_insert.end() &&\n        scores_map_current_evict.find(key) == scores_map_current_evict.end()) {\n      score_before_insert = scores_map_before_insert[key];\n    }\n    bool valid = ((current_score >> 32) == global_epoch) &&\n                 ((current_score & 0xFFFFFFFF) ==\n                  ((0xFFFFFFFF & score_before_insert) + (0xFFFFFFFF & score)));\n\n    if (!valid) {\n      score_error_cnt2++;\n    }\n  }\n  for (auto& it : values_map_before_insert) {\n    if (values_map_after_insert.find(it.first) ==\n        values_map_after_insert.end()) {\n      ++key_miss_cnt;\n      continue;\n    }\n    test_util::ValueArray<V, dim>& vec0 = it.second;\n    test_util::ValueArray<V, dim>& vec1 = values_map_after_insert.at(it.first);\n    for (size_t j = 0; j < dim; j++) {\n      if (vec0[j] != vec1[j]) {\n        ++value_diff_cnt;\n        break;\n      }\n    }\n  }\n  std::cout << \"Check insert_and_evict behavior got \"\n            << \"key_miss_cnt: \" << key_miss_cnt\n            << \", value_diff_cnt: \" << value_diff_cnt\n            << \", score_error_cnt1: \" << score_error_cnt1\n            << \", score_error_cnt2: \" << score_error_cnt2\n            << \", while table_size_before: \" << table_size_before\n            << \", while table_size_after: \" << table_size_after\n            << \", while len: \" << len << \", dur: \" << dur << std::endl;\n\n  ASSERT_EQ(key_miss_cnt, 0);\n  ASSERT_EQ(value_diff_cnt, 0);\n  ASSERT_EQ(score_error_cnt1, 0);\n  ASSERT_EQ(score_error_cnt2, 0);\n\n  CUDA_CHECK(cudaFreeAsync(d_tmp_keys, stream));\n  CUDA_CHECK(cudaFreeAsync(d_tmp_values, stream));\n  CUDA_CHECK(cudaFreeAsync(d_tmp_scores, stream));\n  
CUDA_CHECK(cudaFreeAsync(d_tmp_founds, stream));\n  free(h_tmp_keys);\n  free(h_tmp_values);\n  free(h_tmp_scores);\n  free(h_tmp_founds);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n}\n\nvoid test_insert_and_evict_advanced_on_epochlfu() {\n  const size_t U = 1024 * 1024;\n  const size_t init_capacity = U;\n  const size_t B = 128 * 1024;\n\n  TableOptions opt;\n\n  opt.max_capacity = U;\n  opt.init_capacity = init_capacity;\n  opt.max_hbm_for_vectors = U * dim * sizeof(f32);\n  opt.max_bucket_size = 128;\n  opt.num_of_buckets_per_alloc = 32;\n  using Table = nv::merlin::HashTable<i64, f32, u64, EvictStrategy::kEpochLfu>;\n  opt.dim = dim;\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  std::unique_ptr<Table> table = std::make_unique<Table>();\n  table->init(opt);\n\n  test_util::KVMSBuffer<i64, f32, u64> evict_buffer;\n  evict_buffer.Reserve(B, dim, stream);\n  evict_buffer.ToZeros(stream);\n\n  test_util::KVMSBuffer<i64, f32, u64> data_buffer;\n  test_util::KVMSBuffer<i64, f32, u64> pre_data_buffer;\n  data_buffer.Reserve(B, dim, stream);\n  pre_data_buffer.Reserve(B, dim, stream);\n\n  size_t offset = 0;\n  int freq_range = 100;\n  float repeat_rate = 0.9;\n  for (unsigned int global_epoch = 1; global_epoch <= 64; global_epoch++) {\n    if (global_epoch <= 1) {\n      test_util::create_random_keys_advanced<i64, u64, f32>(\n          dim, data_buffer.keys_ptr(false), data_buffer.scores_ptr(false),\n          data_buffer.values_ptr(false), (int)B, B * 16, freq_range);\n    } else {\n      test_util::create_random_keys_advanced<i64, u64, f32>(\n          dim, data_buffer.keys_ptr(false), pre_data_buffer.keys_ptr(false),\n          data_buffer.scores_ptr(false), data_buffer.values_ptr(false), (int)B,\n          B * 16, freq_range, repeat_rate);\n    }\n    data_buffer.SyncData(true, stream);\n    if (global_epoch <= 1) {\n      pre_data_buffer.CopyFrom(data_buffer, stream);\n    }\n\n    CheckInsertAndEvictOnEpochLfu<i64, f32, u64, 
Table>(\n        table.get(), &data_buffer, &evict_buffer, &pre_data_buffer, B, stream,\n        opt, global_epoch);\n\n    pre_data_buffer.CopyFrom(data_buffer, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    offset += B;\n  }\n}\n\nvoid test_insert_and_evict_advanced_on_customized() {\n  const size_t U = 1024 * 1024;\n  const size_t init_capacity = U;\n  const size_t B = 100000;\n\n  TableOptions opt;\n\n  opt.max_capacity = U;\n  opt.init_capacity = init_capacity;\n  opt.max_hbm_for_vectors = U * dim * sizeof(f32);\n  opt.max_bucket_size = 128;\n  opt.num_of_buckets_per_alloc = 2;\n  using Table =\n      nv::merlin::HashTable<i64, f32, u64, EvictStrategy::kCustomized>;\n  opt.dim = dim;\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  std::unique_ptr<Table> table = std::make_unique<Table>();\n  table->init(opt);\n\n  test_util::KVMSBuffer<i64, f32, u64> evict_buffer;\n  evict_buffer.Reserve(B, dim, stream);\n  evict_buffer.ToZeros(stream);\n\n  test_util::KVMSBuffer<i64, f32, u64> data_buffer;\n  data_buffer.Reserve(B, dim, stream);\n\n  size_t offset = 0;\n  u64 score = 0;\n  for (int i = 0; i < 32; i++) {\n    test_util::create_random_keys<i64, u64, f32, dim>(\n        data_buffer.keys_ptr(false), data_buffer.scores_ptr(false),\n        data_buffer.values_ptr(false), (int)B, (int)B * 16);\n    data_buffer.SyncData(true, stream);\n\n    CheckInsertAndEvict<i64, f32, u64, Table>(\n        table.get(), data_buffer.keys_ptr(), data_buffer.values_ptr(),\n        data_buffer.scores_ptr(), evict_buffer.keys_ptr(),\n        evict_buffer.values_ptr(), evict_buffer.scores_ptr(), B, stream, opt);\n\n    offset += B;\n    score += 1;\n  }\n}\n\nvoid test_insert_and_evict_with_export_batch() {\n  size_t max_capacity = 4096;\n  size_t init_capacity = 2048;\n  size_t offset = 0;\n  size_t uplimit = 1048576;\n  size_t len = 4096 + 13;\n\n  TableOptions opt;\n  opt.max_capacity = max_capacity;\n  opt.init_capacity = init_capacity;\n  
opt.max_hbm_for_vectors = uplimit * dim * sizeof(f32);\n  opt.num_of_buckets_per_alloc = 16;\n  using Table = nv::merlin::HashTable<i64, f32, u64, EvictStrategy::kLru>;\n  opt.dim = dim;\n\n  using Vec_t = test_util::ValueArray<f32, dim>;\n  std::map<i64, Vec_t> ref_map;\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  std::unique_ptr<Table> table = std::make_unique<Table>();\n  table->init(opt);\n\n  test_util::KVMSBuffer<i64, f32, u64> buffer;\n  buffer.Reserve(len, dim, stream);\n  test_util::KVMSBuffer<i64, f32, u64> evict_buffer;\n  evict_buffer.Reserve(len, dim, stream);\n\n  size_t total_len = 0;\n  buffer.ToRange(offset, /*skip=1*/ 1, stream);\n  size_t n_evicted = table->insert_and_evict(\n      len, buffer.keys_ptr(), buffer.values_ptr(), nullptr,\n      evict_buffer.keys_ptr(), evict_buffer.values_ptr(), nullptr, stream);\n  printf(\"Insert %zu keys and evict %zu\\n\", len, n_evicted);\n  offset += len;\n  total_len += len;\n  evict_buffer.SyncData(/*h2d=*/false, stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  for (size_t i = 0; i < n_evicted; i++) {\n    Vec_t* vec =\n        reinterpret_cast<Vec_t*>(evict_buffer.values_ptr(false) + i * dim);\n    ref_map[evict_buffer.keys_ptr(false)[i]] = *vec;\n  }\n\n  offset = 0;\n  size_t search_len = (table->capacity() >> 2);\n  for (; offset < table->capacity(); offset += search_len) {\n    if (offset + search_len > table->capacity()) {\n      search_len = table->capacity() - offset;\n    }\n    size_t n_exported =\n        table->export_batch(search_len, offset, buffer.keys_ptr(),\n                            buffer.values_ptr(), /*scores=*/nullptr, stream);\n    buffer.SyncData(/*h2d=*/false);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    for (size_t i = 0; i < n_exported; i++) {\n      Vec_t* vec = reinterpret_cast<Vec_t*>(buffer.values_ptr(false) + i * dim);\n      for (size_t j = 0; j < dim; j++) {\n        ASSERT_EQ(buffer.keys_ptr(false)[i], vec->operator[](j));\n     
 }\n      ref_map[buffer.keys_ptr(false)[i]] = *vec;\n    }\n  }\n\n  for (auto& it : ref_map) {\n    for (size_t j = 0; j < dim; j++) {\n      ASSERT_EQ(static_cast<f32>(it.first), it.second.data[j]);\n    }\n  }\n}\n\ntemplate <typename K, typename V, typename S, typename Table>\nvoid BatchCheckInsertAndEvict(Table* table, K* keys, V* values, S* scores,\n                              K* evicted_keys, V* evicted_values,\n                              S* evicted_scores, size_t len,\n                              std::atomic<int>* step, size_t total_step,\n                              cudaStream_t stream, bool if_check = true) {\n  std::map<i64, test_util::ValueArray<f32, dim>> map_before_insert;\n  std::map<i64, test_util::ValueArray<f32, dim>> map_after_insert;\n\n  K* h_tmp_keys = nullptr;\n  V* h_tmp_values = nullptr;\n  S* h_tmp_scores = nullptr;\n\n  K* d_tmp_keys = nullptr;\n  V* d_tmp_values = nullptr;\n  S* d_tmp_scores = nullptr;\n\n  while (step->load() < total_step) {\n    size_t table_size_before = table->size(stream);\n    size_t cap = table_size_before + len;\n    size_t key_miss_cnt = 0;\n    size_t value_diff_cnt = 0;\n    size_t table_size_after = 0;\n    size_t table_size_verify1 = 0;\n\n    int s = step->load();\n\n    if (if_check) {\n      CUDA_CHECK(cudaMallocAsync(&d_tmp_keys, cap * sizeof(K), stream));\n      CUDA_CHECK(cudaMallocAsync(&d_tmp_values, cap * dim * sizeof(V), stream));\n      CUDA_CHECK(cudaMallocAsync(&d_tmp_scores, cap * sizeof(S), stream));\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      h_tmp_keys = (K*)malloc(cap * sizeof(K));\n      h_tmp_values = (V*)malloc(cap * dim * sizeof(V));\n      h_tmp_scores = (S*)malloc(cap * sizeof(S));\n\n      CUDA_CHECK(cudaMemsetAsync(d_tmp_keys, 0, cap * sizeof(K), stream));\n      CUDA_CHECK(\n          cudaMemsetAsync(d_tmp_values, 0, cap * dim * sizeof(V), stream));\n      CUDA_CHECK(cudaMemsetAsync(d_tmp_scores, 0, cap * sizeof(S), stream));\n\n      size_t 
table_size_verify0 = table->export_batch(\n          table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);\n      ASSERT_EQ(table_size_before, table_size_verify0);\n\n      CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,\n                                 table_size_before * sizeof(K),\n                                 cudaMemcpyDeviceToHost, stream));\n      CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,\n                                 table_size_before * dim * sizeof(V),\n                                 cudaMemcpyDeviceToHost, stream));\n      CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,\n                                 table_size_before * sizeof(S),\n                                 cudaMemcpyDeviceToHost, stream));\n\n      CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys + table_size_before, keys + len * s,\n                                 len * sizeof(K), cudaMemcpyDeviceToHost,\n                                 stream));\n      CUDA_CHECK(cudaMemcpyAsync(h_tmp_values + table_size_before * dim,\n                                 values + len * s * dim, len * dim * sizeof(V),\n                                 cudaMemcpyDeviceToHost, stream));\n      CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores + table_size_before,\n                                 scores + len * s, len * sizeof(S),\n                                 cudaMemcpyDeviceToHost, stream));\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      for (size_t i = 0; i < cap; i++) {\n        test_util::ValueArray<V, dim>* vec =\n            reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +\n                                                             i * dim);\n        map_before_insert[h_tmp_keys[i]] = *vec;\n      }\n    }\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    auto start = std::chrono::steady_clock::now();\n    size_t filtered_len = table->insert_and_evict(\n        len, keys + len * s, values + len * s * dim, nullptr, evicted_keys,\n        
evicted_values, evicted_scores, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    auto end = std::chrono::steady_clock::now();\n    auto diff =\n        std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);\n\n    float dur = diff.count();\n\n    if (if_check) {\n      table_size_after = table->size(stream);\n      table_size_verify1 = table->export_batch(\n          table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);\n\n      ASSERT_EQ(table_size_verify1, table_size_after);\n\n      size_t new_cap = table_size_after + filtered_len;\n      CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,\n                                 table_size_after * sizeof(K),\n                                 cudaMemcpyDeviceToHost, stream));\n      CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,\n                                 table_size_after * dim * sizeof(V),\n                                 cudaMemcpyDeviceToHost, stream));\n      CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,\n                                 table_size_after * sizeof(S),\n                                 cudaMemcpyDeviceToHost, stream));\n      CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys + table_size_after, evicted_keys,\n                                 filtered_len * sizeof(K),\n                                 cudaMemcpyDeviceToHost, stream));\n      CUDA_CHECK(cudaMemcpyAsync(h_tmp_values + table_size_after * dim,\n                                 evicted_values, filtered_len * dim * sizeof(V),\n                                 cudaMemcpyDeviceToHost, stream));\n      CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores + table_size_after,\n                                 evicted_scores, filtered_len * sizeof(S),\n                                 cudaMemcpyDeviceToHost, stream));\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      int64_t new_cap_i64 = (int64_t)new_cap;\n      for (int64_t i = new_cap_i64 - 1; i >= 0; i--) {\n        test_util::ValueArray<V, 
dim>* vec =\n            reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +\n                                                             i * dim);\n        map_after_insert[h_tmp_keys[i]] = *vec;\n      }\n\n      for (auto& it : map_before_insert) {\n        if (map_after_insert.find(it.first) == map_after_insert.end()) {\n          ++key_miss_cnt;\n          continue;\n        }\n        test_util::ValueArray<V, dim>& vec0 = it.second;\n        test_util::ValueArray<V, dim>& vec1 = map_after_insert.at(it.first);\n        for (size_t j = 0; j < dim; j++) {\n          if (vec0[j] != vec1[j]) {\n            ++value_diff_cnt;\n            break;\n          }\n        }\n      }\n      ASSERT_EQ(key_miss_cnt, 0);\n      ASSERT_EQ(value_diff_cnt, 0);\n\n      CUDA_CHECK(cudaFreeAsync(d_tmp_keys, stream));\n      CUDA_CHECK(cudaFreeAsync(d_tmp_values, stream));\n      CUDA_CHECK(cudaFreeAsync(d_tmp_scores, stream));\n      free(h_tmp_keys);\n      free(h_tmp_values);\n      free(h_tmp_scores);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n    }\n\n    std::cout << \"Check insert behavior got step: \" << step->load()\n              << \",\\tduration: \" << dur\n              << \",\\twhile value_diff_cnt: \" << value_diff_cnt\n              << \", while table_size_before: \" << table_size_before\n              << \", while table_size_after: \" << table_size_after\n              << \", while len: \" << len << std::endl;\n\n    step->fetch_add(1);\n  }\n}\n\ntemplate <typename K, typename V, typename S, typename Table>\nvoid BatchCheckFind(Table* table, K* keys, V* values, S* scores, size_t len,\n                    std::atomic<int>* step, size_t total_step,\n                    size_t find_interval, cudaStream_t stream,\n                    bool if_check = true) {\n  K* h_tmp_keys = nullptr;\n  V* h_tmp_values = nullptr;\n  S* h_tmp_scores = nullptr;\n  bool* h_tmp_founds = nullptr;\n\n  K* d_tmp_keys = nullptr;\n  V* d_tmp_values = nullptr;\n  S* 
d_tmp_scores = nullptr;\n  bool* d_tmp_founds = nullptr;\n\n  int find_step = 0;\n  size_t cap = len * find_interval;\n\n  CUDA_CHECK(cudaMallocAsync(&d_tmp_keys, cap * sizeof(K), stream));\n  CUDA_CHECK(cudaMallocAsync(&d_tmp_values, cap * dim * sizeof(V), stream));\n  CUDA_CHECK(cudaMallocAsync(&d_tmp_scores, cap * sizeof(S), stream));\n  CUDA_CHECK(cudaMallocAsync(&d_tmp_founds, cap * sizeof(bool), stream));\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n\n  h_tmp_keys = (K*)malloc(cap * sizeof(K));\n  h_tmp_values = (V*)malloc(cap * dim * sizeof(V));\n  h_tmp_scores = (S*)malloc(cap * sizeof(S));\n  h_tmp_founds = (bool*)malloc(cap * sizeof(bool));\n\n  while (step->load() < total_step) {\n    while (find_step >= (step->load() / find_interval)) continue;\n\n    size_t found_num = 0;\n    size_t value_diff_cnt = 0;\n\n    CUDA_CHECK(cudaMemsetAsync(d_tmp_keys, 0, cap * sizeof(K), stream));\n    CUDA_CHECK(cudaMemsetAsync(d_tmp_values, 0, cap * dim * sizeof(V), stream));\n    CUDA_CHECK(cudaMemsetAsync(d_tmp_scores, 0, cap * sizeof(S), stream));\n    CUDA_CHECK(cudaMemsetAsync(d_tmp_founds, 0, cap * sizeof(bool), stream));\n\n    CUDA_CHECK(cudaMemcpyAsync(d_tmp_keys, keys + cap * find_step,\n                               cap * sizeof(K), cudaMemcpyDeviceToDevice,\n                               stream));\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    auto start = std::chrono::steady_clock::now();\n    table->find(cap, d_tmp_keys, d_tmp_values, d_tmp_founds, d_tmp_scores,\n                stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    auto end = std::chrono::steady_clock::now();\n    auto diff =\n        std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);\n\n    float dur = diff.count();\n\n    if (if_check) {\n      CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys, cap * sizeof(K),\n                                 cudaMemcpyDeviceToHost, stream));\n      CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,\n           
                      cap * dim * sizeof(V), cudaMemcpyDeviceToHost,\n                                 stream));\n      CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores, cap * sizeof(S),\n                                 cudaMemcpyDeviceToHost, stream));\n      CUDA_CHECK(cudaMemcpyAsync(h_tmp_founds, d_tmp_founds, cap * sizeof(bool),\n                                 cudaMemcpyDeviceToHost, stream));\n\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      for (int i = 0; i < cap; i++) {\n        if (h_tmp_founds[i]) {\n          for (int j = 0; j < dim; j++) {\n            if (h_tmp_values[i * dim + j] !=\n                static_cast<float>(h_tmp_keys[i] * 0.00001)) {\n              value_diff_cnt++;\n            };\n          }\n          found_num++;\n        }\n      }\n      ASSERT_EQ(value_diff_cnt, 0);\n\n      CUDA_CHECK(cudaMemset(d_tmp_founds, 0, cap * sizeof(bool)));\n      table->contains(cap, keys, d_tmp_founds, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      int contains_num = 0;\n      CUDA_CHECK(cudaMemcpyAsync(h_tmp_founds, d_tmp_founds, cap * sizeof(bool),\n                                 cudaMemcpyDeviceToHost, stream));\n      for (int i = 0; i < cap; i++) {\n        if (h_tmp_founds[i]) contains_num++;\n      }\n      ASSERT_EQ(contains_num, found_num);\n    }\n    std::cout << std::endl\n              << \"\\nCheck find behavior got step: \" << find_step\n              << \",\\tduration: \" << dur\n              << \",\\twhile value_diff_cnt: \" << value_diff_cnt\n              << \", while cap: \" << cap << std::endl\n              << std::endl;\n    ASSERT_EQ(value_diff_cnt, 0);\n    find_step++;\n  }\n  CUDA_CHECK(cudaFreeAsync(d_tmp_keys, stream));\n  CUDA_CHECK(cudaFreeAsync(d_tmp_values, stream));\n  CUDA_CHECK(cudaFreeAsync(d_tmp_scores, stream));\n  CUDA_CHECK(cudaFreeAsync(d_tmp_founds, stream));\n  free(h_tmp_keys);\n  free(h_tmp_values);\n  free(h_tmp_scores);\n  free(h_tmp_founds);\n\n  
CUDA_CHECK(cudaStreamSynchronize(stream));\n}\n\nvoid test_insert_and_evict_bugfix_no_zero_eviction() {\n  size_t max_capacity = 2048;\n  size_t init_capacity = 2048;\n  size_t remove_len = 1024;\n  size_t insert_len = 2048;\n  \n  TableOptions opt;\n  opt.max_capacity = max_capacity;\n  opt.init_capacity = init_capacity;\n  opt.max_hbm_for_vectors = 1024 * 1024 * sizeof(f32); \n  opt.num_of_buckets_per_alloc = 16;\n  opt.dim = dim;\n\n  using Table = nv::merlin::HashTable<i64, f32, u64, EvictStrategy::kLru>;\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  std::unique_ptr<Table> table = std::make_unique<Table>();\n  table->init(opt);\n\n  test_util::KVMSBuffer<i64, f32, u64> buffer_init;\n  buffer_init.Reserve(max_capacity, dim, stream);\n  buffer_init.ToRange(1, 1, stream); \n  \n  test_util::KVMSBuffer<i64, f32, u64> evict_buffer_init;\n  evict_buffer_init.Reserve(max_capacity, dim, stream);\n\n  size_t n_evicted = table->insert_and_evict(\n      max_capacity, buffer_init.keys_ptr(), buffer_init.values_ptr(), nullptr,\n      evict_buffer_init.keys_ptr(), evict_buffer_init.values_ptr(), nullptr, stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n\n  table->erase(remove_len, buffer_init.keys_ptr(), stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n\n  test_util::KVMSBuffer<i64, f32, u64> buffer_new;\n  buffer_new.Reserve(insert_len, dim, stream);\n  buffer_new.ToRange(3000, 1, stream); \n\n  test_util::KVMSBuffer<i64, f32, u64> evict_buffer_new;\n  evict_buffer_new.Reserve(insert_len, dim, stream);\n  \n  CUDA_CHECK(cudaMemsetAsync(evict_buffer_new.keys_ptr(), 0, insert_len * sizeof(i64), stream));\n\n  n_evicted = table->insert_and_evict(\n      insert_len, buffer_new.keys_ptr(), buffer_new.values_ptr(), nullptr,\n      evict_buffer_new.keys_ptr(), evict_buffer_new.values_ptr(), nullptr, stream);\n      \n  evict_buffer_new.SyncData(/*h2d=*/false, stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n\n  printf(\"Second insert 
evicted %zu keys\\n\", n_evicted);\n\n  for (size_t i = 0; i < n_evicted; i++) {\n    i64 evicted_key = evict_buffer_new.keys_ptr(false)[i];\n    ASSERT_NE(evicted_key, 0) << \"Found 0 (uninitialized empty key) at evict index \" << i;\n  }\n}\n\nvoid test_insert_and_evict_run_with_batch_find() {\n  const size_t U = 16 * 1024 * 1024;\n  const size_t init_capacity = U;\n  const size_t B = 256 * 1024;\n  constexpr size_t batch_num = 256;\n  constexpr size_t find_interval = 8;\n\n  const bool if_check = false;\n\n  std::thread insert_and_evict_thread;\n  std::thread find_thread;\n  std::atomic<int> step{0};\n\n  TableOptions opt;\n\n  opt.max_capacity = U;\n  opt.init_capacity = init_capacity;\n  opt.max_hbm_for_vectors = U * dim * sizeof(f32);\n  opt.num_of_buckets_per_alloc = 128;\n  using Table = nv::merlin::HashTable<i64, f32, u64, EvictStrategy::kLru>;\n  opt.dim = dim;\n\n  cudaStream_t insert_stream;\n  cudaStream_t find_stream;\n  CUDA_CHECK(cudaStreamCreate(&insert_stream));\n  CUDA_CHECK(cudaStreamCreate(&find_stream));\n\n  std::unique_ptr<Table> table = std::make_unique<Table>();\n  table->init(opt);\n\n  test_util::KVMSBuffer<i64, f32, u64> global_buffer;\n  global_buffer.Reserve(B * batch_num, dim, insert_stream);\n\n  test_util::KVMSBuffer<i64, f32, u64> evict_buffer;\n  evict_buffer.Reserve(B, dim, insert_stream);\n  evict_buffer.ToZeros(insert_stream);\n\n  for (int i = 0; i < batch_num; i++) {\n    test_util::create_random_keys<i64, u64, f32, dim>(\n        global_buffer.keys_ptr(false) + B * i,\n        global_buffer.scores_ptr(false) + B * i,\n        global_buffer.values_ptr(false) + B * i * dim, (int)B);\n  }\n  global_buffer.SyncData(true, insert_stream);\n  CUDA_CHECK(cudaStreamSynchronize(insert_stream));\n\n  auto insert_and_evict_func = [&table, &global_buffer, &evict_buffer, &B,\n                                &step, &batch_num, &insert_stream]() {\n    BatchCheckInsertAndEvict<i64, f32, u64, Table>(\n        table.get(), 
global_buffer.keys_ptr(), global_buffer.values_ptr(),\n        global_buffer.scores_ptr(), evict_buffer.keys_ptr(),\n        evict_buffer.values_ptr(), evict_buffer.scores_ptr(), B, &step,\n        batch_num, insert_stream, if_check);\n  };\n\n  auto find_func = [&table, &global_buffer, &B, &step, &batch_num,\n                    &find_interval, &find_stream]() {\n    BatchCheckFind<i64, f32, u64, Table>(\n        table.get(), global_buffer.keys_ptr(), global_buffer.values_ptr(),\n        global_buffer.scores_ptr(), B, &step, batch_num, find_interval,\n        find_stream, if_check);\n  };\n\n  find_thread = std::thread(find_func);\n  insert_and_evict_thread = std::thread(insert_and_evict_func);\n  find_thread.join();\n  insert_and_evict_thread.join();\n  CUDA_CHECK(cudaStreamDestroy(insert_stream));\n  CUDA_CHECK(cudaStreamDestroy(find_stream));\n}\n\nTEST(InsertAndEvictTest, test_insert_and_evict_basic) {\n  test_insert_and_evict_basic();\n}\n\nTEST(InsertAndEvictTest, test_insert_and_evict_advanced_on_lru) {\n  test_insert_and_evict_advanced_on_lru();\n}\n\nTEST(InsertAndEvictTest, test_insert_and_evict_advanced_on_lfu) {\n  test_insert_and_evict_advanced_on_lfu();\n}\n\nTEST(InsertAndEvictTest, test_insert_and_evict_advanced_on_epochlru) {\n  test_insert_and_evict_advanced_on_epochlru();\n}\n\nTEST(InsertAndEvictTest, test_insert_and_evict_advanced_on_epochlfu) {\n  test_insert_and_evict_advanced_on_epochlfu();\n}\n\nTEST(InsertAndEvictTest, test_insert_and_evict_advanced_on_customized) {\n  test_insert_and_evict_advanced_on_customized();\n}\n\nTEST(InsertAndEvictTest, test_insert_and_evict_with_export_batch) {\n  test_insert_and_evict_with_export_batch();\n}\n\nTEST(InsertAndEvictTest, test_insert_and_evict_run_with_batch_find) {\n  test_insert_and_evict_run_with_batch_find();\n}\n\nTEST(InsertAndEvictTest, test_insert_and_evict_bugfix_no_zero_eviction) {\n  test_insert_and_evict_bugfix_no_zero_eviction();\n}\n"
  },
  {
    "path": "tests/lock_unlock_test.cc.cu",
    "content": "/*\n * Copyright (c) 2025, NVIDIA CORPORATION.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n#include <gtest/gtest.h>\n#include <stdio.h>\n#include <array>\n#include <map>\n#include <memory>\n#include <unordered_map>\n#include \"merlin/types.cuh\"\n#include \"merlin_hashtable.cuh\"\n#include \"merlin_localfile.hpp\"\n#include \"test_util.cuh\"\n\nconstexpr size_t dim = 64;\nusing i64 = int64_t;\nusing u64 = uint64_t;\nusing f32 = float;\nusing EvictStrategy = nv::merlin::EvictStrategy;\nusing TableOptions = nv::merlin::HashTableOptions;\n\n// Fills a kCustomized table batch by batch and, for every batch, checks that\n// lock_keys() agrees with contains(), that contains() finds none of the keys\n// while they are locked, and that unlock_keys() makes them visible again.\nvoid test_lock_and_unlock() {\n  TableOptions opt;\n\n  // table setting\n  const size_t U = 4 * 1024 * 1024UL;  // table capacity, in keys\n  const size_t M = 65536UL;            // keys per batch\n  opt.max_capacity = U;\n  opt.init_capacity = U;\n  opt.max_hbm_for_vectors = U * dim * sizeof(f32);\n  opt.num_of_buckets_per_alloc = 8;\n\n  using Table =\n      nv::merlin::HashTable<i64, f32, u64, EvictStrategy::kCustomized>;\n  opt.dim = dim;\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n  bool *d_found, *d_lock_results;\n  i64** lock_keys_ptr;\n  CUDA_CHECK(cudaMalloc(&d_found, M * sizeof(bool)));\n  CUDA_CHECK(cudaMalloc(&d_lock_results, M * sizeof(bool)));\n  CUDA_CHECK(cudaMalloc(&lock_keys_ptr, M * sizeof(i64*)));\n\n  // step1: create the table\n  std::unique_ptr<Table> table = std::make_unique<Table>();\n  table->init(opt);\n\n  // step2: insert U keys in batches of M, locking and unlocking each batch\n  test_util::KVMSBuffer<i64, f32, u64> buffer;\n  buffer.Reserve(M, dim, stream);\n\n  i64 start = 0;\n  for (size_t i = 0; i < U / M; i++) {\n    buffer.ToRange(start, 1, stream);\n    start += M;\n    buffer.Setscore((u64)i, stream);\n    table->insert_or_assign(M, buffer.keys_ptr(), buffer.values_ptr(),\n                            buffer.scores_ptr(), stream);\n\n    // Freshly inserted keys: contains() must find all of them and the\n    // results written by lock_keys() must match.\n    CUDA_CHECK(cudaMemsetAsync(d_found, 0, M * sizeof(bool), stream));\n    CUDA_CHECK(cudaMemsetAsync(d_lock_results, 0, M * sizeof(bool), stream));\n    table->contains(M, buffer.keys_ptr(), d_found, stream);\n    table->lock_keys(M, buffer.keys_ptr(), lock_keys_ptr, d_lock_results,\n                     stream, buffer.scores_ptr());\n    bool result = test_util::allEqualGpu(d_found, d_lock_results, M, stream);\n    ASSERT_EQ(result, true);\n    result = test_util::allTrueGpu(d_found, M, stream);\n    ASSERT_EQ(result, true);\n\n    // While locked: contains() must match the zeroed d_lock_results, i.e.\n    // report none of the keys as present.\n    CUDA_CHECK(cudaMemsetAsync(d_found, 0, M * sizeof(bool), stream));\n    CUDA_CHECK(cudaMemsetAsync(d_lock_results, 0, M * sizeof(bool), stream));\n    table->contains(M, buffer.keys_ptr(), d_found, stream);\n    result = test_util::allEqualGpu(d_found, d_lock_results, M, stream);\n    ASSERT_EQ(result, true);\n\n    // After unlock_keys(): its results must match contains() and every key\n    // must be found again.\n    CUDA_CHECK(cudaMemsetAsync(d_found, 0, M * sizeof(bool), stream));\n    table->unlock_keys(M, lock_keys_ptr, buffer.keys_ptr(), d_lock_results,\n                       stream);\n    table->contains(M, buffer.keys_ptr(), d_found, stream);\n    result = test_util::allEqualGpu(d_found, d_lock_results, M, stream);\n    ASSERT_EQ(result, true);\n    result = test_util::allTrueGpu(d_found, M, stream);\n    ASSERT_EQ(result, true);\n  }\n\n  // Wait for outstanding work on the stream, then release the device\n  // buffers and the stream itself.\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  CUDA_CHECK(cudaFree(d_found));\n  CUDA_CHECK(cudaFree(d_lock_results));\n  CUDA_CHECK(cudaFree(lock_keys_ptr));\n  CUDA_CHECK(cudaStreamDestroy(stream));\n}\n\nTEST(LockAndUnlockTest, test_lock_and_unlock) { test_lock_and_unlock(); }\n"
  },
  {
    "path": "tests/memory_pool_test.cc.cu",
    "content": "/*\n * Copyright (c) 2022, NVIDIA CORPORATION.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n#include <cuda_runtime_api.h>\n#include <gtest/gtest.h>\n#include <iostream>\n#include \"merlin/allocator.cuh\"\n#include \"merlin/memory_pool.cuh\"\n\nusing namespace nv::merlin;\n\n/**\n * Wrapper around another allocator that prints debug messages.\n */\ntemplate <class Allocator>\nstruct DebugAllocator final\n    : AllocatorBase<typename Allocator::type, DebugAllocator<Allocator>> {\n  using type = typename Allocator::type;\n\n  static constexpr const char* name{\"DebugAllocator\"};\n\n  inline static type* alloc(size_t n, BaseAllocator* allocator,\n                            cudaStream_t stream = 0) {\n    type* ptr{Allocator::alloc(n, allocator, stream)};\n    std::cout << Allocator::name << \"[type_name = \" << typeid(type).name()\n              << \"]: \" << static_cast<void*>(ptr) << \" allocated = \" << n\n              << \" x \" << sizeof(type) << \" bytes, stream = \" << stream << '\\n';\n    return ptr;\n  }\n\n  inline static void free(type* ptr, BaseAllocator* allocator,\n                          cudaStream_t stream = 0) {\n    Allocator::free(ptr, allocator, stream);\n    std::cout << Allocator::name << \"[type_name = \" << typeid(type).name()\n              << \"]: \" << static_cast<void*>(ptr)\n              << \" freed, stream = \" << stream << '\\n';\n  }\n};\n\nvoid print_divider() {\n  for (size_t i{0}; i < 
80; ++i) std::cout << '-';\n  std::cout << '\\n';\n}\n\nvoid print_pool_options(const MemoryPoolOptions& opt) {\n  print_divider();\n  std::cout << \"Memory Pool Configuration\\n\";\n  print_divider();\n  std::cout << \"opt.max_stock   : \" << opt.max_stock << \" buffers\\n\";\n  std::cout << \"opt.max_pending : \" << opt.max_pending << \" buffers\\n\";\n  print_divider();\n  std::cout.flush();\n}\n\nMemoryPoolOptions opt{\n    3,  //< max_stock\n    5,  //< max_pending\n};\n\nstruct SomeType {\n  int a;\n  float b;\n\n  friend std::ostream& operator<<(std::ostream&, const SomeType&);\n};\n\nstd::ostream& operator<<(std::ostream& os, const SomeType& obj) {\n  cudaPointerAttributes attr;\n  CUDA_CHECK(cudaPointerGetAttributes(&attr, &obj));\n\n  SomeType tmp;\n  if (attr.type == cudaMemoryTypeDevice) {\n    CUDA_CHECK(\n        cudaMemcpy(&tmp, &obj, sizeof(SomeType), cudaMemcpyDeviceToHost));\n  } else {\n    tmp = obj;\n  }\n\n  os << \"a = \" << tmp.a << \", b = \" << tmp.b;\n  return os;\n}\n\nvoid test_standard_allocator() {\n  using Allocator = DebugAllocator<StandardAllocator<SomeType>>;\n  std::shared_ptr<DefaultAllocator> default_allocator(new DefaultAllocator());\n\n  {\n    auto ptr{Allocator::make_unique(1, default_allocator.get())};\n    ASSERT_NE(ptr.get(), nullptr);\n\n    std::cout << \"Sync UPtr after alloc: \" << *ptr << std::endl;\n    ptr->a = 47;\n    ptr->b = 11;\n    std::cout << \"Sync UPtr after set: \" << *ptr << std::endl;\n\n    ptr.reset();\n    ASSERT_EQ(ptr.get(), nullptr);\n  }\n\n  {\n    auto ptr{Allocator::make_unique(1, default_allocator.get(), nullptr)};\n    ASSERT_NE(ptr.get(), nullptr);\n\n    std::cout << \"Async UPtr after alloc: \" << *ptr << std::endl;\n    ptr->a = 47;\n    ptr->b = 11;\n    std::cout << \"Async UPtr after set: \" << *ptr << std::endl;\n\n    ptr.reset();\n    ASSERT_EQ(ptr.get(), nullptr);\n  }\n\n  {\n    auto ptr{Allocator::make_shared(1, default_allocator.get())};\n    ASSERT_NE(ptr.get(), 
nullptr);\n\n    std::cout << \"SPtr after alloc: \" << *ptr << std::endl;\n    ptr->a = 47;\n    ptr->b = 11;\n    std::cout << \"SPtr after set: \" << *ptr << std::endl;\n\n    ptr.reset();\n    ASSERT_EQ(ptr.get(), nullptr);\n  }\n}\n\nvoid test_host_allocator() {\n  using Allocator = DebugAllocator<HostAllocator<SomeType>>;\n  std::shared_ptr<DefaultAllocator> default_allocator(new DefaultAllocator());\n\n  {\n    auto ptr{Allocator::make_unique(1, default_allocator.get())};\n    ASSERT_NE(ptr.get(), nullptr);\n\n    std::cout << \"Sync UPtr after alloc: \" << *ptr << std::endl;\n    ptr->a = 47;\n    ptr->b = 11;\n    std::cout << \"Sync UPtr after set: \" << *ptr << std::endl;\n\n    ptr.reset();\n    ASSERT_EQ(ptr.get(), nullptr);\n  }\n\n  {\n    auto ptr{Allocator::make_unique(1, default_allocator.get(), nullptr)};\n    ASSERT_NE(ptr.get(), nullptr);\n\n    std::cout << \"Async UPtr after alloc: \" << *ptr << std::endl;\n    ptr->a = 47;\n    ptr->b = 11;\n    std::cout << \"Async UPtr after set: \" << *ptr << std::endl;\n\n    ptr.reset();\n    ASSERT_EQ(ptr.get(), nullptr);\n  }\n\n  {\n    auto ptr{Allocator::make_shared(1, default_allocator.get())};\n    ASSERT_NE(ptr.get(), nullptr);\n\n    std::cout << \"SPtr after alloc: \" << *ptr << std::endl;\n    ptr->a = 47;\n    ptr->b = 11;\n    std::cout << \"SPtr after set: \" << *ptr << std::endl;\n\n    ptr.reset();\n    ASSERT_EQ(ptr.get(), nullptr);\n  }\n}\n\nvoid test_device_allocator() {\n  using Allocator = DebugAllocator<DeviceAllocator<SomeType>>;\n  std::shared_ptr<DefaultAllocator> default_allocator(new DefaultAllocator());\n\n  int num_devices;\n  CUDA_CHECK(cudaGetDeviceCount(&num_devices));\n  MERLIN_CHECK(num_devices > 0,\n               \"Need at least one CUDA capable device for running this test.\");\n\n  CUDA_CHECK(cudaSetDevice(num_devices - 1));\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  {\n    auto ptr{Allocator::make_unique(1, 
default_allocator.get())};\n    ASSERT_NE(ptr.get(), nullptr);\n\n    std::cout << \"Sync UPtr after alloc: \" << *ptr << std::endl;\n    const SomeType tmp{47, 11};\n\n    std::cout << \"Sync UPtr after alloc get ptr: \" << ptr.get() << std::endl;\n    CUDA_CHECK(cudaMemset(ptr.get(), 0, sizeof(SomeType)));\n    CUDA_CHECK(\n        cudaMemcpy(ptr.get(), &tmp, sizeof(SomeType), cudaMemcpyHostToDevice));\n    std::cout << \"Sync UPtr after set: \" << *ptr << std::endl;\n\n    ptr.reset();\n    ASSERT_EQ(ptr.get(), nullptr);\n  }\n\n  {\n    auto ptr{Allocator::make_unique(1, default_allocator.get(), stream)};\n    ASSERT_NE(ptr.get(), nullptr);\n\n    std::cout << \"Async UPtr after alloc: \" << *ptr << std::endl;\n    const SomeType tmp{47, 11};\n    CUDA_CHECK(\n        cudaMemcpy(ptr.get(), &tmp, sizeof(SomeType), cudaMemcpyHostToDevice));\n    std::cout << \"Async UPtr after set: \" << *ptr << std::endl;\n\n    ptr.reset();\n    ASSERT_EQ(ptr.get(), nullptr);\n  }\n\n  {\n    auto ptr{Allocator::make_shared(1, default_allocator.get(), stream)};\n    ASSERT_NE(ptr.get(), nullptr);\n\n    std::cout << \"SPtr after alloc: \" << *ptr << std::endl;\n    const SomeType tmp{47, 11};\n    CUDA_CHECK(\n        cudaMemcpy(ptr.get(), &tmp, sizeof(SomeType), cudaMemcpyHostToDevice));\n    std::cout << \"SPtr after set: \" << *ptr << std::endl;\n\n    ptr.reset();\n    ASSERT_EQ(ptr.get(), nullptr);\n  }\n\n  CUDA_CHECK(cudaStreamDestroy(stream));\n}\n\nvoid test_borrow_return_with_context(const bool use_custom_stream) {\n  int num_devices;\n  CUDA_CHECK(cudaGetDeviceCount(&num_devices));\n  MERLIN_CHECK(num_devices > 0,\n               \"Need at least one CUDA capable device for running this test.\");\n  CUDA_CHECK(cudaSetDevice(0));\n\n  cudaStream_t stream{0};\n  if (use_custom_stream) {\n    CUDA_CHECK(cudaStreamCreate(&stream));\n  }\n\n  std::shared_ptr<DefaultAllocator> default_allocator(new DefaultAllocator());\n  {\n    
MemoryPool<DebugAllocator<DeviceAllocator<SomeType>>> pool(\n        opt, default_allocator.get());\n    const size_t buffer_size{256L * 1024};\n\n    // Initial status.\n    std::cout << \".:: Initial state ::.\\n\" << pool << std::endl;\n    ASSERT_EQ(pool.current_stock(), 0);\n    ASSERT_EQ(pool.num_pending(), 0);\n\n    // Borrow and return one buffer (unique ptr).\n    {\n      auto buffer{pool.get_unique(buffer_size, stream)};\n      std::cout << \".:: Borrow 1 (unique) ::.\\n\" << pool << std::endl;\n      ASSERT_EQ(pool.current_stock(), 0);\n      ASSERT_EQ(pool.num_pending(), 0);\n    }\n    std::cout << \".:: Return 1 (unique) ::.\\n\" << pool << std::endl;\n    ASSERT_EQ(pool.current_stock(), 0);\n    ASSERT_EQ(pool.num_pending(), 1);\n\n    // Await unfinished GPU work (ensure stable situation).\n    pool.await_pending(stream);\n    std::cout << \".:: Await pending ::.\\n\" << pool << std::endl;\n    ASSERT_EQ(pool.current_stock(), 1);\n    ASSERT_EQ(pool.num_pending(), 0);\n\n    // Borrow and return one buffer (shared ptr).\n    {\n      auto buffer{pool.get_shared(buffer_size, stream)};\n      std::cout << \".:: Borrow 1 (shared) ::.\\n\" << pool << std::endl;\n      ASSERT_EQ(pool.current_stock(), 0);\n      ASSERT_EQ(pool.num_pending(), 0);\n    }\n    std::cout << \".:: Return 1 (shared) ::.\\n\" << pool << std::endl;\n    ASSERT_EQ(pool.current_stock(), 0);\n    ASSERT_EQ(pool.num_pending(), 1);\n\n    // Await unfinished GPU work (ensure stable situation).\n    pool.await_pending(stream);\n    std::cout << \".:: Await pending ::.\\n\" << pool << std::endl;\n    ASSERT_EQ(pool.current_stock(), 1);\n    ASSERT_EQ(pool.num_pending(), 0);\n\n    // Borrow static workspace with less than `max_stock` buffers.\n    {\n      auto ws{pool.get_workspace<2>(buffer_size, stream)};\n      std::cout << \".:: Borrow 2 (static) ::.\\n\" << pool << std::endl;\n      ASSERT_EQ(pool.current_stock(), 0);\n      ASSERT_EQ(pool.num_pending(), 0);\n    }\n    
std::cout << \".:: Return 2 (static) ::.\\n\" << pool << std::endl;\n    ASSERT_EQ(pool.current_stock(), 0);\n    ASSERT_EQ(pool.num_pending(), 2);\n\n    // Await unfinished GPU work (ensure stable situation).\n    pool.await_pending(stream);\n    std::cout << \".:: Await pending ::.\\n\" << pool << std::endl;\n    ASSERT_EQ(pool.current_stock(), 2);\n    ASSERT_EQ(pool.num_pending(), 0);\n\n    // Borrow workspace that exceeds base pool size. Possible results:\n    // 1. If this thread is slower than the driver.\n    //    Upon return we will see a partial deallocation before inserting the\n    //    last buffer into the pending queue.\n    // 2. If the driver is slower than this thread queuing/querying events.\n    //    Either 0-3 buffers in stock partial deallocation\n    //    1-5 buffers pending. Hence there is no good way to check.\n    {\n      auto ws{pool.get_workspace<6>(buffer_size, stream)};\n      std::cout << \".:: Borrow 6 (static) ::.\\n\" << pool << std::endl;\n      ASSERT_EQ(pool.current_stock(), 0);\n      ASSERT_EQ(pool.num_pending(), 0);\n    }\n    std::cout << \".:: Return 6 (static) ::.\\n\" << pool << std::endl;\n    ASSERT_GE(pool.num_pending(), 1);\n\n    // Await unfinished GPU work (ensure stable situation).\n    pool.await_pending(stream);\n    std::cout << \".:: Await pending ::.\\n\" << pool << std::endl;\n    ASSERT_EQ(pool.current_stock(), 3);\n    ASSERT_EQ(pool.num_pending(), 0);\n\n    // Pin 1 and deplete stock.\n    {\n      auto ws{pool.get_workspace<1>(buffer_size, stream)};\n      pool.deplete_stock();\n      std::cout << \".:: Deplete stock ::.\\n\" << pool << std::endl;\n      ASSERT_EQ(pool.current_stock(), 0);\n      ASSERT_EQ(pool.num_pending(), 0);\n    }\n    std::cout << \".:: Deplete stock ::.\\n\" << pool << std::endl;\n    ASSERT_EQ(pool.current_stock(), 0);\n    ASSERT_EQ(pool.num_pending(), 1);\n\n    // Await unfinished GPU work (ensure stable situation).\n    pool.await_pending(stream);\n    std::cout 
<< \".:: Await pending ::.\\n\" << pool << std::endl;\n    ASSERT_EQ(pool.current_stock(), 1);\n    ASSERT_EQ(pool.num_pending(), 0);\n\n    // Increase stock to 3 buffers.\n    { auto ws{pool.get_workspace<3>(buffer_size, stream)}; }\n    pool.await_pending(stream);\n    ASSERT_EQ(pool.current_stock(), 3);\n    ASSERT_EQ(pool.num_pending(), 0);\n\n    // Pin 1 of the 3 buffers and release it to make it pending.\n    { auto ws{pool.get_workspace<1>(buffer_size, stream)}; }\n    ASSERT_EQ(pool.current_stock(), 2);\n    ASSERT_EQ(pool.num_pending(), 1);\n    std::cout << \".:: Ensure 2 stock + 1 pending situation ::.\\n\"\n              << pool << std::endl;\n\n    // Borrow a buffer that is smaller than the current buffer size.\n    {\n      auto ws{pool.get_unique(buffer_size / 2, stream)};\n      std::cout << \".:: Borrow 1 (smaller) ::.\\n\" << pool << std::endl;\n      ASSERT_EQ(pool.current_stock(), 1);\n      ASSERT_EQ(pool.num_pending(), 1);\n    }\n    std::cout << \".:: Return 1 (smaller) ::.\\n\" << pool << std::endl;\n    ASSERT_EQ(pool.current_stock(), 1);\n    ASSERT_EQ(pool.num_pending(), 2);\n\n    // Borrow a buffer that is bigger than the current buffer size. 
This will\n    // evict the stock buffers which are smaller, but will not concern the\n    // buffers that are still pending.\n    {\n      auto ws{pool.get_unique(buffer_size + 37, stream)};\n      std::cout << \".:: Borrow 1 (bigger) ::.\\n\" << pool << std::endl;\n      ASSERT_EQ(pool.current_stock(), 0);\n      ASSERT_EQ(pool.num_pending(), 2);\n    }\n    std::cout << \".:: Return 1 (bigger) ::.\\n\" << pool << std::endl;\n    ASSERT_EQ(pool.current_stock(), 0);\n    ASSERT_EQ(pool.num_pending(), 3);\n\n    // Because there are now pending buffers that are too small, they will be\n    // cleared once the associated work has been completed.\n    pool.await_pending(stream);\n    std::cout << \".:: Await pending ::.\\n\" << pool << std::endl;\n    ASSERT_EQ(pool.current_stock(), 1);\n    ASSERT_EQ(pool.num_pending(), 0);\n  }\n\n  if (stream) {\n    CUDA_CHECK(cudaStreamDestroy(stream));\n  }\n}\n\nvoid test_borrow_return_lost_context() {\n  int num_devices;\n  CUDA_CHECK(cudaGetDeviceCount(&num_devices));\n  MERLIN_CHECK(num_devices > 0,\n               \"Need at least one CUDA capable device for running this test.\");\n  CUDA_CHECK(cudaSetDevice(0));\n\n  std::shared_ptr<DefaultAllocator> default_allocator(new DefaultAllocator());\n  {\n    MemoryPool<DebugAllocator<DeviceAllocator<SomeType>>> pool{\n        opt, default_allocator.get()};\n    const size_t buffer_size{256L * 1024};\n\n    // Initial status.\n    std::cout << \".:: Initial state ::.\\n\" << pool << std::endl;\n    ASSERT_EQ(pool.current_stock(), 0);\n    ASSERT_EQ(pool.num_pending(), 0);\n\n    // Borrow and return one buffer (unique ptr).\n    {\n      cudaStream_t stream;\n      CUDA_CHECK(cudaStreamCreate(&stream));\n\n      auto buffer{pool.get_unique(buffer_size, stream)};\n      std::cout << \".:: Borrow 1 (unique) ::.\\n\" << pool << std::endl;\n      ASSERT_EQ(pool.current_stock(), 0);\n      ASSERT_EQ(pool.num_pending(), 0);\n\n      CUDA_CHECK(cudaStreamDestroy(stream));\n    }\n    
std::cout << \".:: Return 1 (unique) ::.\\n\" << pool << std::endl;\n    ASSERT_EQ(pool.current_stock(), 1);\n    ASSERT_EQ(pool.num_pending(), 0);\n\n    // Borrow and return one buffer (shared ptr).\n    {\n      cudaStream_t stream;\n      CUDA_CHECK(cudaStreamCreate(&stream));\n\n      auto buffer{pool.get_shared(buffer_size)};\n      std::cout << \".:: Borrow 1 (shared) ::.\\n\" << pool << std::endl;\n      ASSERT_EQ(pool.current_stock(), 0);\n      ASSERT_EQ(pool.num_pending(), 0);\n\n      CUDA_CHECK(cudaStreamDestroy(stream));\n    }\n    std::cout << \".:: Return 1 (shared) ::.\\n\" << pool << std::endl;\n    ASSERT_EQ(pool.current_stock(), 1);\n    ASSERT_EQ(pool.num_pending(), 0);\n\n    // Borrow static workspace with less than `max_stock` buffers.\n    {\n      cudaStream_t stream;\n      CUDA_CHECK(cudaStreamCreate(&stream));\n\n      auto ws{pool.get_workspace<2>(buffer_size)};\n      std::cout << \".:: Borrow 2 (static) ::.\\n\" << pool << std::endl;\n      ASSERT_EQ(pool.current_stock(), 0);\n      ASSERT_EQ(pool.num_pending(), 0);\n\n      CUDA_CHECK(cudaStreamDestroy(stream));\n    }\n    std::cout << \".:: Return 2 (static) ::.\\n\" << pool << std::endl;\n    ASSERT_EQ(pool.current_stock(), 2);\n    ASSERT_EQ(pool.num_pending(), 0);\n\n    // Borrow dynamic workspace with less than `max_stock` buffers.\n    {\n      cudaStream_t stream;\n      CUDA_CHECK(cudaStreamCreate(&stream));\n\n      auto ws{pool.get_workspace(2, buffer_size)};\n      std::cout << \".:: Borrow 2 (dynamic) ::.\\n\" << pool << std::endl;\n      ASSERT_EQ(pool.current_stock(), 0);\n      ASSERT_EQ(pool.num_pending(), 0);\n\n      CUDA_CHECK(cudaStreamDestroy(stream));\n    }\n\n    std::cout << \".:: Return 2 (dynamic) ::.\\n\" << pool << std::endl;\n    ASSERT_EQ(pool.current_stock(), 2);\n    ASSERT_EQ(pool.num_pending(), 0);\n\n    // Await unfinished GPU work (shouldn't change anything).\n    pool.await_pending();\n    std::cout << \".:: Await pending (shouldn't change 
anything) ::.\\n\"\n              << pool << std::endl;\n    ASSERT_EQ(pool.current_stock(), 2);\n    ASSERT_EQ(pool.num_pending(), 0);\n\n    // Borrow workspace that exceeds base pool size.\n    {\n      cudaStream_t stream;\n      CUDA_CHECK(cudaStreamCreate(&stream));\n\n      auto ws{pool.get_workspace<6>(buffer_size)};\n      std::cout << \".:: Borrow 6 (static) ::.\\n\" << pool << std::endl;\n      ASSERT_EQ(pool.current_stock(), 0);\n      ASSERT_EQ(pool.num_pending(), 0);\n\n      CUDA_CHECK(cudaStreamDestroy(stream));\n    }\n    std::cout << \".:: Return 6 (static) ::.\\n\" << pool << std::endl;\n    ASSERT_EQ(pool.current_stock(), opt.max_stock);\n    ASSERT_EQ(pool.num_pending(), 0);\n\n    // Borrow a buffer that is smaller than the current buffer size.\n    {\n      cudaStream_t stream;\n      CUDA_CHECK(cudaStreamCreate(&stream));\n\n      auto ws{pool.get_unique(buffer_size / 2)};\n      std::cout << \".:: Borrow 1 (smaller) ::.\\n\" << pool << std::endl;\n      ASSERT_EQ(pool.current_stock(), opt.max_stock - 1);\n      ASSERT_EQ(pool.num_pending(), 0);\n\n      CUDA_CHECK(cudaStreamDestroy(stream));\n    }\n    std::cout << \".:: Return 1 (smaller) ::.\\n\" << pool << std::endl;\n    ASSERT_EQ(pool.current_stock(), opt.max_stock);\n    ASSERT_EQ(pool.num_pending(), 0);\n\n    // Borrow a buffer that is bigger than the current buffer size.\n    {\n      cudaStream_t stream;\n      CUDA_CHECK(cudaStreamCreate(&stream));\n\n      auto ws{pool.get_unique(buffer_size + 37)};\n      std::cout << \".:: Borrow 1 (bigger) ::.\\n\" << pool << std::endl;\n      ASSERT_EQ(pool.current_stock(), 0);\n      ASSERT_EQ(pool.num_pending(), 0);\n\n      CUDA_CHECK(cudaStreamDestroy(stream));\n    }\n    std::cout << \".:: Return 1 (smaller) ::.\\n\" << pool << std::endl;\n    ASSERT_EQ(pool.current_stock(), 1);\n    ASSERT_EQ(pool.num_pending(), 0);\n  }\n}\n\nTEST(MemoryPoolTest, standard_allocator) { test_standard_allocator(); }\nTEST(MemoryPoolTest, 
host_allocator) { test_host_allocator(); }\nTEST(MemoryPoolTest, device_allocator) { test_device_allocator(); }\nTEST(MemoryPoolTest, borrow_return_default_context) {\n  test_borrow_return_with_context(false);\n}\nTEST(MemoryPoolTest, borrow_return_custom_context) {\n  test_borrow_return_with_context(true);\n}\n\nTEST(MemoryPoolTest, test_borrow_return_lost_context) {\n  std::cout << \"Unfortunately, there is currently no reliable way to test \"\n               \"safely whether a\\n\"\n            << \"stream is alive. Keeping the test around for manual tests.\\n\";\n  if (false) {\n    test_borrow_return_lost_context();\n  }\n}\n"
  },
  {
    "path": "tests/merlin_hashtable_test.cc.cu",
    "content": "/*\n * Copyright (c) 2022, NVIDIA CORPORATION.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n#include <gtest/gtest.h>\n#include <math.h>\n#include <stdio.h>\n#include <stdlib.h>\n#include <algorithm>\n#include <array>\n#include <iostream>\n#include <random>\n#include <thread>\n#include <unordered_set>\n#include <vector>\n#include \"merlin_hashtable.cuh\"\n#include \"test_util.cuh\"\n\nconstexpr size_t DIM = 16;\nusing K = uint64_t;\nusing V = float;\nusing S = uint64_t;\nusing TableOptions = nv::merlin::HashTableOptions;\nusing BaseAllocator = nv::merlin::BaseAllocator;\nusing MemoryType = nv::merlin::MemoryType;\nusing EvictStrategy = nv::merlin::EvictStrategy;\n\ntemplate <class K, class S>\nstruct EraseIfPredFunctor {\n  __forceinline__ __device__ bool operator()(const K& key, S& score,\n                                             const K& pattern,\n                                             const S& threshold) {\n    return (((key & 0x7f) > pattern) && (score > threshold));\n  }\n};\n\ntemplate <class K, class V, class S>\nstruct EraseIfPredFunctorV2 {\n  K pattern;\n  S threshold;\n  EraseIfPredFunctorV2(K pattern, S threshold)\n      : pattern(pattern), threshold(threshold) {}\n  template <int GroupSize>\n  __forceinline__ __device__ bool operator()(\n      const K& key, const V* value, const S& score,\n      cg::thread_block_tile<GroupSize>& g) {\n    /* evaluate key, score and value. 
*/\n    return (((key & 0x7f) > pattern) && (score > threshold));\n  }\n};\n\ntemplate <class K, class V, class S>\nstruct EraseIfPredFunctorV3 {\n  K pattern;\n  S threshold;\n  int dim;\n  EraseIfPredFunctorV3(K pattern, S threshold)\n      : pattern(pattern), threshold(threshold) {}\n  template <int GroupSize>\n  __forceinline__ __device__ bool operator()(\n      const K& key, const V* value, const S& score,\n      cg::thread_block_tile<GroupSize>& g) {\n    /* evaluate key, score and value. */\n    bool pred = score < threshold;\n\n    for (int i = 0; i < g.size(); i++) {\n      auto cur_value = g.shfl(value, i);\n      auto cur_key = g.shfl(key, i);\n      bool cur_pred = g.shfl(pred, i);\n      if (cur_pred == false) continue;\n      unsigned int vote = 0;\n      /* evaluate one value cooperatively in one loop. */\n      for (int j = g.thread_rank(); j < dim; j += g.size()) {\n        if (cur_value[j] != static_cast<V>(cur_key * 0.00001)) cur_pred = false;\n        vote = g.ballot(cur_pred == false);\n        if (vote != 0) break;\n      }\n      if (g.thread_rank() == i && vote != 0) pred = false;\n    }\n    return pred;\n  }\n};\n\nenum class EraseIfVersion { V1, V2, V3 };\n\ntemplate <class K, class S>\nstruct ExportIfPredFunctor {\n  __forceinline__ __device__ bool operator()(const K& key, S& score,\n                                             const K& pattern,\n                                             const S& threshold) {\n    return score > threshold;\n  }\n};\n\nclass CustomizedAllocator : public virtual BaseAllocator {\n public:\n  CustomizedAllocator() {};\n  ~CustomizedAllocator() override {};\n\n  void alloc(const MemoryType type, void** ptr, size_t size,\n             unsigned int pinned_flags = cudaHostAllocDefault) override {\n    switch (type) {\n      case MemoryType::Device:\n        CUDA_CHECK(cudaMalloc(ptr, size));\n        break;\n      case MemoryType::Managed:\n        CUDA_CHECK(cudaMallocManaged(ptr, size, 
cudaMemAttachGlobal));\n        break;\n      case MemoryType::Pinned:\n        CUDA_CHECK(cudaMallocHost(ptr, size, pinned_flags));\n        break;\n      case MemoryType::Host:\n        *ptr = std::malloc(size);\n        break;\n    }\n    return;\n  }\n\n  void alloc_async(const MemoryType type, void** ptr, size_t size,\n                   cudaStream_t stream) override {\n    if (type == MemoryType::Device) {\n      CUDA_CHECK(cudaMallocAsync(ptr, size, stream));\n    } else {\n      MERLIN_CHECK(false,\n                   \"[CustomizedAllocator] alloc_async is only support for \"\n                   \"MemoryType::Device!\");\n    }\n    return;\n  }\n\n  void free(const MemoryType type, void* ptr) override {\n    if (ptr == nullptr) {\n      return;\n    }\n    switch (type) {\n      case MemoryType::Pinned:\n        CUDA_CHECK(cudaFreeHost(ptr));\n        break;\n      case MemoryType::Device:\n      case MemoryType::Managed:\n        CUDA_CHECK(cudaFree(ptr));\n        break;\n      case MemoryType::Host:\n        std::free(ptr);\n        break;\n    }\n    return;\n  }\n\n  void free_async(const MemoryType type, void* ptr,\n                  cudaStream_t stream) override {\n    if (ptr == nullptr) {\n      return;\n    }\n\n    if (type == MemoryType::Device) {\n      CUDA_CHECK(cudaFreeAsync(ptr, stream));\n    } else {\n      MERLIN_CHECK(false,\n                   \"[CustomizedAllocator] free_async is only support for \"\n                   \"MemoryType::Device!\");\n    }\n  }\n};\n\nvoid test_basic(size_t max_hbm_for_vectors) {\n  constexpr uint64_t BUCKET_MAX_SIZE = 128;\n  constexpr uint64_t INIT_CAPACITY = 64 * 1024 * 1024UL - (128 + 1);\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t KEY_NUM = 1 * 1024 * 1024UL;\n  constexpr uint64_t TEST_TIMES = 1;\n\n  K* h_keys;\n  S* h_scores;\n  V* h_vectors;\n  bool* h_found;\n\n  TableOptions options;\n\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = 
MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_bucket_size = BUCKET_MAX_SIZE;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  options.reserved_key_start_bit = 2;\n  options.num_of_buckets_per_alloc = 32;\n\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));\n\n  CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n\n  test_util::create_random_keys<K, S, V, DIM>(h_keys, h_scores, h_vectors,\n                                              KEY_NUM);\n\n  K* d_keys;\n  S* d_scores = nullptr;\n  V* d_vectors;\n  V* d_new_vectors;\n  bool* d_found;\n  size_t dump_counter = 0;\n\n  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_new_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n\n  CUDA_CHECK(\n      cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),\n                        cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors, KEY_NUM * sizeof(V) * options.dim,\n                        cudaMemcpyHostToDevice));\n\n  CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  uint64_t total_size = 0;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n\n    ASSERT_EQ(table->bucket_count(),\n              524287);  // 1 + (INIT_CAPACITY / 
options.max_bucket_size)\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    table->insert_or_assign(KEY_NUM, d_keys, d_vectors, d_scores, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, KEY_NUM);\n\n    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    table->find(KEY_NUM, d_keys, d_vectors, d_found, nullptr, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    int found_num = 0;\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n\n    for (int i = 0; i < KEY_NUM; i++) {\n      if (h_found[i]) found_num++;\n      ASSERT_EQ(h_scores[i], h_keys[i]);\n      for (int j = 0; j < options.dim; j++) {\n        ASSERT_EQ(h_vectors[i * options.dim + j],\n                  static_cast<float>(h_keys[i] * 0.00001));\n      }\n    }\n\n    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n    table->contains(KEY_NUM, d_keys, d_found, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    int contains_num = 0;\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n    for (int i = 0; i < KEY_NUM; i++) {\n      if (h_found[i]) contains_num++;\n    }\n    ASSERT_EQ(contains_num, found_num);\n\n    CUDA_CHECK(cudaMemset(d_new_vectors, 2, KEY_NUM * sizeof(V) * options.dim));\n    table->insert_or_assign(KEY_NUM, d_keys,\n                            reinterpret_cast<float*>(d_new_vectors), d_scores,\n                   
         stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, KEY_NUM);\n\n    CUDA_CHECK(cudaMemset(d_new_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    table->find(KEY_NUM, d_keys, reinterpret_cast<float*>(d_new_vectors),\n                d_found, nullptr, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_new_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n    found_num = 0;\n    uint32_t i_value = 0x2020202;\n    for (int i = 0; i < KEY_NUM; i++) {\n      if (h_found[i]) found_num++;\n      for (int j = 0; j < options.dim; j++) {\n        ASSERT_EQ(h_vectors[i * options.dim + j],\n                  *(reinterpret_cast<float*>(&i_value)));\n      }\n    }\n    ASSERT_EQ(found_num, KEY_NUM);\n\n    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n    table->contains(KEY_NUM, d_keys, d_found, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    contains_num = 0;\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n    for (int i = 0; i < KEY_NUM; i++) {\n      if (h_found[i]) contains_num++;\n    }\n    ASSERT_EQ(contains_num, found_num);\n\n    table->accum_or_assign(KEY_NUM, d_keys, d_vectors, d_found, d_scores,\n                           stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, KEY_NUM);\n\n    table->erase(KEY_NUM >> 1, d_keys, stream);\n    size_t total_size_after_erase = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    
ASSERT_EQ(total_size_after_erase, total_size >> 1);\n\n    table->clear(stream);\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    table->insert_or_assign(KEY_NUM, d_keys, d_vectors, d_scores, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    CUDA_CHECK(cudaMemset(d_scores, 0, KEY_NUM * sizeof(S)));\n    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n\n    table->find(KEY_NUM, d_keys, d_vectors, d_found, d_scores, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    found_num = 0;\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n\n    for (int i = 0; i < KEY_NUM; i++) {\n      if (h_found[i]) found_num++;\n      ASSERT_EQ(h_scores[i], h_keys[i]);\n      for (int j = 0; j < options.dim; j++) {\n        ASSERT_EQ(h_vectors[i * options.dim + j],\n                  static_cast<float>(h_keys[i] * 0.00001));\n      }\n    }\n    ASSERT_EQ(found_num, KEY_NUM);\n\n    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n    table->contains(KEY_NUM, d_keys, d_found, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    contains_num = 0;\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n    for (int i = 0; i < KEY_NUM; i++) {\n      if (h_found[i]) contains_num++;\n    }\n    ASSERT_EQ(contains_num, found_num);\n\n    CUDA_CHECK(cudaMemset(d_keys, 0, KEY_NUM * sizeof(K)));\n    CUDA_CHECK(cudaMemset(d_scores, 0, KEY_NUM * sizeof(S)));\n    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * 
options.dim));\n    dump_counter = table->export_batch(table->capacity(), 0, d_keys, d_vectors,\n                                       d_scores, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    ASSERT_EQ(dump_counter, KEY_NUM);\n    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyDeviceToHost));\n    for (int i = 0; i < KEY_NUM; i++) {\n      ASSERT_EQ(h_scores[i], h_keys[i]);\n      for (int j = 0; j < options.dim; j++) {\n        ASSERT_EQ(h_vectors[i * options.dim + j],\n                  static_cast<float>(h_keys[i] * 0.00001));\n      }\n    }\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFreeHost(h_keys));\n  CUDA_CHECK(cudaFreeHost(h_scores));\n  CUDA_CHECK(cudaFreeHost(h_vectors));\n  CUDA_CHECK(cudaFreeHost(h_found));\n\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_vectors));\n  CUDA_CHECK(cudaFree(d_new_vectors));\n  CUDA_CHECK(cudaFree(d_found));\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_basic_without_rehash(size_t max_hbm_for_vectors) {\n  constexpr uint64_t BUCKET_MAX_SIZE = 128;\n  constexpr uint64_t NUM_OF_BUCKETS_PER_ALLOC = 2048;\n  constexpr uint64_t INIT_CAPACITY =\n      64 * 1024 * 1024UL - (NUM_OF_BUCKETS_PER_ALLOC * BUCKET_MAX_SIZE) + 1;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t KEY_NUM = 1 * 1024 * 1024UL;\n  constexpr uint64_t TEST_TIMES = 1;\n\n  K* h_keys;\n  S* h_scores;\n  V* h_vectors;\n  bool* h_found;\n\n  TableOptions options;\n\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  
options.max_bucket_size = BUCKET_MAX_SIZE;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  options.reserved_key_start_bit = 2;\n  options.num_of_buckets_per_alloc = NUM_OF_BUCKETS_PER_ALLOC;\n\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));\n\n  CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n\n  test_util::create_random_keys<K, S, V, DIM>(h_keys, h_scores, h_vectors,\n                                              KEY_NUM);\n\n  K* d_keys;\n  S* d_scores = nullptr;\n  V* d_vectors;\n  V* d_new_vectors;\n  bool* d_found;\n  size_t dump_counter = 0;\n\n  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_new_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n\n  CUDA_CHECK(\n      cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),\n                        cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors, KEY_NUM * sizeof(V) * options.dim,\n                        cudaMemcpyHostToDevice));\n\n  CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  uint64_t total_size = 0;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n\n    ASSERT_EQ(table->bucket_count(),\n              522241);  // 1 + (INIT_CAPACITY / options.bucket_max_size)\n    
total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    table->insert_or_assign(KEY_NUM, d_keys, d_vectors, d_scores, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, KEY_NUM);\n\n    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    table->find(KEY_NUM, d_keys, d_vectors, d_found, nullptr, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    int found_num = 0;\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n\n    for (int i = 0; i < KEY_NUM; i++) {\n      if (h_found[i]) found_num++;\n      ASSERT_EQ(h_scores[i], h_keys[i]);\n      for (int j = 0; j < options.dim; j++) {\n        ASSERT_EQ(h_vectors[i * options.dim + j],\n                  static_cast<float>(h_keys[i] * 0.00001));\n      }\n    }\n\n    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n    table->contains(KEY_NUM, d_keys, d_found, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    int contains_num = 0;\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n    for (int i = 0; i < KEY_NUM; i++) {\n      if (h_found[i]) contains_num++;\n    }\n    ASSERT_EQ(contains_num, found_num);\n\n    CUDA_CHECK(cudaMemset(d_new_vectors, 2, KEY_NUM * sizeof(V) * options.dim));\n    table->insert_or_assign(KEY_NUM, d_keys,\n                            reinterpret_cast<float*>(d_new_vectors), d_scores,\n                            stream);\n    
CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, KEY_NUM);\n\n    CUDA_CHECK(cudaMemset(d_new_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    table->find(KEY_NUM, d_keys, reinterpret_cast<float*>(d_new_vectors),\n                d_found, nullptr, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_new_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n    found_num = 0;\n    uint32_t i_value = 0x2020202;\n    for (int i = 0; i < KEY_NUM; i++) {\n      if (h_found[i]) found_num++;\n      for (int j = 0; j < options.dim; j++) {\n        ASSERT_EQ(h_vectors[i * options.dim + j],\n                  *(reinterpret_cast<float*>(&i_value)));\n      }\n    }\n    ASSERT_EQ(found_num, KEY_NUM);\n\n    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n    table->contains(KEY_NUM, d_keys, d_found, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    contains_num = 0;\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n    for (int i = 0; i < KEY_NUM; i++) {\n      if (h_found[i]) contains_num++;\n    }\n    ASSERT_EQ(contains_num, found_num);\n\n    table->accum_or_assign(KEY_NUM, d_keys, d_vectors, d_found, d_scores,\n                           stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, KEY_NUM);\n\n    table->erase(KEY_NUM >> 1, d_keys, stream);\n    size_t total_size_after_erase = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size_after_erase, total_size >> 
1);\n\n    table->clear(stream);\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    table->insert_or_assign(KEY_NUM, d_keys, d_vectors, d_scores, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    CUDA_CHECK(cudaMemset(d_scores, 0, KEY_NUM * sizeof(S)));\n    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n\n    table->find(KEY_NUM, d_keys, d_vectors, d_found, d_scores, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    found_num = 0;\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n\n    for (int i = 0; i < KEY_NUM; i++) {\n      if (h_found[i]) found_num++;\n      ASSERT_EQ(h_scores[i], h_keys[i]);\n      for (int j = 0; j < options.dim; j++) {\n        ASSERT_EQ(h_vectors[i * options.dim + j],\n                  static_cast<float>(h_keys[i] * 0.00001));\n      }\n    }\n    ASSERT_EQ(found_num, KEY_NUM);\n\n    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n    table->contains(KEY_NUM, d_keys, d_found, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    contains_num = 0;\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n    for (int i = 0; i < KEY_NUM; i++) {\n      if (h_found[i]) contains_num++;\n    }\n    ASSERT_EQ(contains_num, found_num);\n\n    CUDA_CHECK(cudaMemset(d_keys, 0, KEY_NUM * sizeof(K)));\n    CUDA_CHECK(cudaMemset(d_scores, 0, KEY_NUM * sizeof(S)));\n    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    dump_counter = 
table->export_batch(table->capacity(), 0, d_keys, d_vectors,\n                                       d_scores, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    ASSERT_EQ(dump_counter, KEY_NUM);\n    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyDeviceToHost));\n    for (int i = 0; i < KEY_NUM; i++) {\n      ASSERT_EQ(h_scores[i], h_keys[i]);\n      for (int j = 0; j < options.dim; j++) {\n        ASSERT_EQ(h_vectors[i * options.dim + j],\n                  static_cast<float>(h_keys[i] * 0.00001));\n      }\n    }\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFreeHost(h_keys));\n  CUDA_CHECK(cudaFreeHost(h_scores));\n  CUDA_CHECK(cudaFreeHost(h_vectors));\n  CUDA_CHECK(cudaFreeHost(h_found));\n\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_vectors));\n  CUDA_CHECK(cudaFree(d_new_vectors));\n  CUDA_CHECK(cudaFree(d_found));\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\ntemplate <typename V>\nvoid test_find_using_pipeline(int dim, bool load_scores) {\n  using TableOptions = nv::merlin::HashTableOptions;\n  constexpr uint64_t BUCKET_MAX_SIZE = 128;\n  constexpr uint64_t INIT_CAPACITY = 128 * 1024UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t KEY_NUM = 128UL;\n  constexpr uint64_t TEST_TIMES = 1;\n\n  K* h_keys;\n  S* h_scores;\n  V* h_vectors;\n  bool* h_found;\n\n  TableOptions options;\n\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = dim;\n  options.max_bucket_size = BUCKET_MAX_SIZE;\n  options.max_hbm_for_vectors = nv::merlin::GB(16);\n\n  using Table 
= nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));\n\n  CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n\n  test_util::create_random_keys<K, S, V>(options.dim, h_keys, h_scores,\n                                         h_vectors, KEY_NUM);\n\n  K* d_keys;\n  S* d_scores = nullptr;\n  V* d_vectors;\n  bool* d_found;\n\n  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n\n  CUDA_CHECK(\n      cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),\n                        cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors, KEY_NUM * sizeof(V) * options.dim,\n                        cudaMemcpyHostToDevice));\n\n  CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  uint64_t total_size = 0;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    table->insert_or_assign(KEY_NUM, d_keys, d_vectors, d_scores, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, KEY_NUM);\n\n    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    
CUDA_CHECK(cudaMemset(d_scores, 0, KEY_NUM * sizeof(S)));\n    if (load_scores) {\n      table->find(KEY_NUM, d_keys, d_vectors, d_found, d_scores, stream);\n    } else {\n      table->find(KEY_NUM, d_keys, d_vectors, d_found, nullptr, stream);\n    }\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    int found_num = 0;\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n\n    for (int i = 0; i < KEY_NUM; i++) {\n      if (h_found[i]) found_num++;\n      if (load_scores) ASSERT_EQ(h_scores[i], h_keys[i]);\n      for (int j = 0; j < options.dim; j++) {\n        ASSERT_EQ(h_vectors[i * options.dim + j],\n                  static_cast<V>(h_keys[i] * 0.00001));\n      }\n    }\n    ASSERT_EQ(found_num, KEY_NUM);\n\n    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n    table->contains(KEY_NUM, d_keys, d_found, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    int contains_num = 0;\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n    for (int i = 0; i < KEY_NUM; i++) {\n      if (h_found[i]) contains_num++;\n    }\n    ASSERT_EQ(contains_num, found_num);\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFreeHost(h_keys));\n  CUDA_CHECK(cudaFreeHost(h_scores));\n  CUDA_CHECK(cudaFreeHost(h_vectors));\n  CUDA_CHECK(cudaFreeHost(h_found));\n\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_vectors));\n  CUDA_CHECK(cudaFree(d_found));\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_basic_when_full(size_t max_hbm_for_vectors) {\n  
constexpr uint64_t INIT_CAPACITY = 1 * 1024 * 1024UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t KEY_NUM = 1 * 1024 * 1024UL;\n  constexpr uint64_t TEST_TIMES = 1;\n\n  K* h_keys;\n  S* h_scores;\n  V* h_vectors;\n  bool* h_found;\n\n  TableOptions options;\n\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.reserved_key_start_bit = 3;\n  options.num_of_buckets_per_alloc = 32;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  std::unique_ptr<CustomizedAllocator> customized_allocator =\n      std::make_unique<CustomizedAllocator>();\n\n  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));\n\n  CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n\n  test_util::create_random_keys<K, S, V, DIM>(h_keys, h_scores, h_vectors,\n                                              KEY_NUM);\n\n  K* d_keys;\n  S* d_scores = nullptr;\n  V* d_vectors;\n  V* d_def_val;\n  bool* d_found;\n\n  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_def_val, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n\n  CUDA_CHECK(\n      cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),\n                        cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors, KEY_NUM * sizeof(V) * options.dim,\n                        cudaMemcpyHostToDevice));\n\n  
CUDA_CHECK(cudaMemset(d_def_val, 2, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  uint64_t total_size = 0;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options, customized_allocator.get());\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    table->insert_or_assign(KEY_NUM, d_keys, d_vectors, d_scores, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    uint64_t total_size_after_insert = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    {\n      CUDA_CHECK(cudaMemset(d_def_val, 0, KEY_NUM * sizeof(V) * options.dim));\n      table->find(KEY_NUM, d_keys, d_def_val, d_found, nullptr, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      int found_num = 0;\n\n      CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));\n      CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n      CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                            cudaMemcpyDeviceToHost));\n      CUDA_CHECK(cudaMemcpy(h_vectors, d_def_val,\n                            KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDeviceToHost));\n      for (int i = 0; i < KEY_NUM; i++) {\n        if (h_found[i]) {\n          found_num++;\n          for (int j = 0; j < options.dim; j++) {\n            ASSERT_EQ(h_vectors[i * options.dim + j],\n                      static_cast<float>(h_keys[i] * 0.00001));\n          }\n        }\n      }\n      ASSERT_EQ(total_size_after_insert, found_num);\n    }\n\n    table->erase(KEY_NUM, d_keys, stream);\n    size_t total_size_after_erase = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size_after_erase, 0);\n\n    
table->insert_or_assign(KEY_NUM, d_keys, d_vectors, d_scores, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    uint64_t total_size_after_reinsert = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size_after_insert, total_size_after_reinsert);\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,\n                        cudaMemcpyDeviceToHost));\n\n  CUDA_CHECK(cudaFreeHost(h_keys));\n  CUDA_CHECK(cudaFreeHost(h_scores));\n  CUDA_CHECK(cudaFreeHost(h_vectors));\n  CUDA_CHECK(cudaFreeHost(h_found));\n\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_vectors));\n  CUDA_CHECK(cudaFree(d_def_val));\n  CUDA_CHECK(cudaFree(d_found));\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\ntemplate <EraseIfVersion EV>\nvoid test_erase_if_pred(size_t max_hbm_for_vectors) {\n  constexpr uint64_t INIT_CAPACITY = 256UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t KEY_NUM = 128UL;\n  constexpr uint64_t TEST_TIMES = 1;\n  constexpr uint64_t BUCKET_MAX_SIZE = 128;\n\n  K* h_keys;\n  S* h_scores;\n  V* h_vectors;\n  bool* h_found;\n\n  TableOptions options;\n\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.reserved_key_start_bit = 4;\n  options.num_of_buckets_per_alloc = 2;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  std::unique_ptr<Table> table = std::make_unique<Table>();\n  table->init(options);\n\n  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));\n\n  K* d_keys;\n  S* 
d_scores = nullptr;\n  V* d_vectors;\n  bool* d_found;\n\n  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  uint64_t total_size = 0;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n        h_keys, h_scores, h_vectors, KEY_NUM, INIT_CAPACITY);\n    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    table->insert_or_assign(KEY_NUM, d_keys, d_vectors, d_scores, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n    K pattern = 100;\n    S threshold = 0;\n    size_t erase_num = 0;\n    if (EV == EraseIfVersion::V1) {\n      erase_num = table->template erase_if<EraseIfPredFunctor>(\n          pattern, threshold, stream);\n    } else if (EV == EraseIfVersion::V2) {\n      EraseIfPredFunctorV2<K, V, S> pred(pattern, threshold);\n      erase_num = table->template erase_if_v2<EraseIfPredFunctorV2<K, V, S>>(\n          pred, stream);\n    } else if (EV == EraseIfVersion::V3) {\n      EraseIfPredFunctorV3<K, V, S> pred(pattern, threshold);\n      pred.dim = options.dim;\n      erase_num = 
table->template erase_if_v2<EraseIfPredFunctorV3<K, V, S>>(\n          pred, stream);\n    }\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ((erase_num + total_size), BUCKET_MAX_SIZE);\n\n    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    table->find(KEY_NUM, d_keys, d_vectors, d_found, d_scores, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    int found_num = 0;\n\n    CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));\n    CUDA_CHECK(cudaMemset(h_scores, 0, KEY_NUM * sizeof(S)));\n    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n    for (int i = 0; i < KEY_NUM; i++) {\n      if (h_found[i]) {\n        found_num++;\n        ASSERT_EQ(h_scores[i], h_keys[i]);\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors[i * options.dim + j],\n                    static_cast<float>(h_keys[i] * 0.00001));\n        }\n      }\n    }\n    ASSERT_EQ(found_num, (BUCKET_MAX_SIZE - erase_num));\n\n    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n    table->contains(KEY_NUM, d_keys, d_found, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    int contains_num = 0;\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n    for (int i = 0; i < KEY_NUM; i++) {\n      if (h_found[i]) contains_num++;\n    }\n    ASSERT_EQ(contains_num, found_num);\n\n    table->clear(stream);\n    total_size = table->size(stream);\n    
CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,\n                        cudaMemcpyDeviceToHost));\n\n  CUDA_CHECK(cudaFreeHost(h_keys));\n  CUDA_CHECK(cudaFreeHost(h_scores));\n  CUDA_CHECK(cudaFreeHost(h_vectors));\n  CUDA_CHECK(cudaFreeHost(h_found));\n\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_vectors));\n  CUDA_CHECK(cudaFree(d_found));\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_rehash(size_t max_hbm_for_vectors) {\n  constexpr uint64_t BUCKET_MAX_SIZE = 128ul;\n  constexpr uint64_t INIT_CAPACITY = BUCKET_MAX_SIZE;\n  constexpr uint64_t MAX_CAPACITY = 4 * INIT_CAPACITY;\n  constexpr uint64_t KEY_NUM = BUCKET_MAX_SIZE * 2;\n  constexpr uint64_t TEST_TIMES = 100;\n  K* h_keys;\n  S* h_scores;\n  V* h_vectors;\n  bool* h_found;\n\n  TableOptions options;\n\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.reserved_key_start_bit = 5;\n  options.max_bucket_size = BUCKET_MAX_SIZE;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));\n\n  K* d_keys;\n  S* d_scores = nullptr;\n  V* d_vectors;\n  bool* d_found;\n  size_t dump_counter = 0;\n\n  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n\n  cudaStream_t 
stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  uint64_t total_size = 0;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n    test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n        h_keys, h_scores, h_vectors, KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE);\n    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    table->insert_or_assign(KEY_NUM, d_keys, d_vectors, d_scores, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaDeviceSynchronize());\n    ASSERT_EQ(total_size, KEY_NUM);\n\n    dump_counter = table->export_batch(table->capacity(), 0, d_keys, d_vectors,\n                                       d_scores, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(dump_counter, KEY_NUM);\n\n    table->reserve(MAX_CAPACITY, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(table->capacity(), MAX_CAPACITY);\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, KEY_NUM);\n\n    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    table->find(BUCKET_MAX_SIZE, d_keys, d_vectors, d_found, d_scores, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    int found_num = 0;\n\n    CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));\n    
CUDA_CHECK(cudaMemset(h_scores, 0, KEY_NUM * sizeof(S)));\n    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n    for (int i = 0; i < BUCKET_MAX_SIZE; i++) {\n      if (h_found[i]) {\n        found_num++;\n        ASSERT_EQ(h_scores[i], h_keys[i]);\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors[i * options.dim + j],\n                    static_cast<float>(h_keys[i] * 0.00001));\n        }\n      }\n    }\n    ASSERT_EQ(found_num, BUCKET_MAX_SIZE);\n\n    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n    table->contains(KEY_NUM, d_keys, d_found, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    int contains_num = 0;\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n    for (int i = 0; i < BUCKET_MAX_SIZE; i++) {\n      if (h_found[i]) contains_num++;\n    }\n    ASSERT_EQ(contains_num, found_num);\n\n    table->clear(stream);\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,\n                        cudaMemcpyDeviceToHost));\n\n  CUDA_CHECK(cudaFreeHost(h_keys));\n  CUDA_CHECK(cudaFreeHost(h_scores));\n  CUDA_CHECK(cudaFreeHost(h_vectors));\n  CUDA_CHECK(cudaFreeHost(h_found));\n\n  
CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_vectors));\n  CUDA_CHECK(cudaFree(d_found));\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_rehash_on_big_batch(size_t max_hbm_for_vectors) {\n  constexpr uint64_t INIT_CAPACITY = 1024;\n  constexpr uint64_t MAX_CAPACITY = 16 * 1024;\n  constexpr uint64_t INIT_KEY_NUM = 1024;\n  constexpr uint64_t KEY_NUM = 2048;\n  K* h_keys;\n  S* h_scores;\n  V* h_vectors;\n  bool* h_found;\n\n  TableOptions options;\n\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.reserved_key_start_bit = 6;\n  options.num_of_buckets_per_alloc = 8;\n  options.max_bucket_size = 128;\n  options.max_load_factor = 0.6;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));\n\n  K* d_keys;\n  S* d_scores = nullptr;\n  V* d_vectors;\n  bool* d_found;\n  size_t dump_counter = 0;\n\n  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  uint64_t total_size = 0;\n  uint64_t expected_size = 0;\n  std::unique_ptr<Table> table = std::make_unique<Table>();\n  table->init(options);\n\n  test_util::create_random_keys<K, S, V, DIM>(h_keys, h_scores, h_vectors,\n                                              KEY_NUM);\n\n  CUDA_CHECK(\n      cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K), 
cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),\n                        cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors, KEY_NUM * sizeof(V) * options.dim,\n                        cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n\n  total_size = table->size(stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  ASSERT_EQ(total_size, 0);\n\n  table->insert_or_assign(INIT_KEY_NUM, d_keys, d_vectors, d_scores, stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  expected_size = INIT_KEY_NUM;\n\n  total_size = table->size(stream);\n  CUDA_CHECK(cudaDeviceSynchronize());\n  ASSERT_EQ(total_size, expected_size);\n  ASSERT_EQ(table->capacity(), (INIT_CAPACITY * 2));\n\n  table->insert_or_assign(KEY_NUM, d_keys, d_vectors, d_scores, stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  expected_size = KEY_NUM;\n\n  total_size = table->size(stream);\n  CUDA_CHECK(cudaDeviceSynchronize());\n  ASSERT_EQ(total_size, expected_size);\n  ASSERT_EQ(table->capacity(), KEY_NUM * 4);\n\n  dump_counter = table->export_batch(table->capacity(), 0, d_keys, d_vectors,\n                                     d_scores, stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  ASSERT_EQ(dump_counter, expected_size);\n\n  CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n  table->find(KEY_NUM, d_keys, d_vectors, d_found, d_scores, stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  int found_num = 0;\n\n  CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));\n  CUDA_CHECK(cudaMemset(h_scores, 0, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(\n      cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K), cudaMemcpyDeviceToHost));\n  CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                        cudaMemcpyDeviceToHost));\n  CUDA_CHECK(cudaMemcpy(h_scores, 
d_scores, KEY_NUM * sizeof(S),\n                        cudaMemcpyDeviceToHost));\n  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,\n                        cudaMemcpyDeviceToHost));\n  for (int i = 0; i < KEY_NUM; i++) {\n    if (h_found[i]) {\n      found_num++;\n      ASSERT_EQ(h_scores[i], h_keys[i]);\n      for (int j = 0; j < options.dim; j++) {\n        ASSERT_EQ(h_vectors[i * options.dim + j],\n                  static_cast<float>(h_keys[i] * 0.00001));\n      }\n    }\n  }\n  ASSERT_EQ(found_num, KEY_NUM);\n\n  CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n  table->contains(KEY_NUM, d_keys, d_found, stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  int contains_num = 0;\n  CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                        cudaMemcpyDeviceToHost));\n  for (int i = 0; i < KEY_NUM; i++) {\n    if (h_found[i]) contains_num++;\n  }\n  ASSERT_EQ(contains_num, found_num);\n\n  table->clear(stream);\n  total_size = table->size(stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  ASSERT_EQ(total_size, 0);\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,\n                        cudaMemcpyDeviceToHost));\n\n  CUDA_CHECK(cudaFreeHost(h_keys));\n  CUDA_CHECK(cudaFreeHost(h_scores));\n  CUDA_CHECK(cudaFreeHost(h_vectors));\n  CUDA_CHECK(cudaFreeHost(h_found));\n\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_vectors));\n  CUDA_CHECK(cudaFree(d_found));\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_rehash_on_big_batch_specific(size_t max_hbm_for_vectors) {\n  constexpr uint64_t INIT_CAPACITY = 50000;\n  constexpr uint64_t MAX_CAPACITY = 100000;\n  constexpr uint64_t EXPECTED_MAX_CAPACITY = 65536;\n  constexpr uint64_t KEY_NUM = 50000;\n  K* h_keys;\n  S* h_scores;\n  V* h_vectors;\n\n  TableOptions options;\n\n  
options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.reserved_key_start_bit = 7;\n  options.num_of_buckets_per_alloc = 16;\n  options.max_bucket_size = 128;\n  options.max_load_factor = 0.6;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;\n\n  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));\n\n  K* d_keys;\n  S* d_scores = nullptr;\n  V* d_vectors;\n\n  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  uint64_t total_size = 0;\n  std::unique_ptr<Table> table = std::make_unique<Table>();\n  table->init(options);\n\n  test_util::create_random_keys<K, S, V, DIM>(h_keys, h_scores, h_vectors,\n                                              KEY_NUM);\n\n  CUDA_CHECK(\n      cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),\n                        cudaMemcpyHostToDevice));\n  CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors, KEY_NUM * sizeof(V) * options.dim,\n                        cudaMemcpyHostToDevice));\n\n  total_size = table->size(stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  ASSERT_EQ(total_size, 0);\n\n  table->insert_or_assign(KEY_NUM, d_keys, d_vectors, nullptr, stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  ASSERT_EQ(table->capacity(), EXPECTED_MAX_CAPACITY);\n\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFreeHost(h_keys));\n  CUDA_CHECK(cudaFreeHost(h_scores));\n  CUDA_CHECK(cudaFreeHost(h_vectors));\n\n  
CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_vectors));\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_dynamic_rehash_on_multi_threads(size_t max_hbm_for_vectors) {\n  constexpr uint64_t BUCKET_MAX_SIZE = 128ul;\n  constexpr uint64_t INIT_CAPACITY = 4 * 1024 - BUCKET_MAX_SIZE - 1;\n  constexpr uint64_t MAX_CAPACITY = 16 * 1024 * INIT_CAPACITY;\n  constexpr uint64_t KEY_NUM = 256;\n  constexpr uint64_t THREAD_N = 8;\n\n  std::vector<std::thread> threads;\n\n  TableOptions options;\n\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.reserved_key_start_bit = 8;\n  options.num_of_buckets_per_alloc = 16;\n  options.max_load_factor = 0.50f;\n  options.max_bucket_size = BUCKET_MAX_SIZE;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;\n\n  std::shared_ptr<Table> table = std::make_shared<Table>();\n  table->init(options);\n  ASSERT_EQ(table->bucket_count(), 32);\n\n  auto worker_function = [&table, KEY_NUM, options](int task_n) {\n    K* h_keys;\n    V* h_vectors;\n    bool* h_found;\n\n    size_t current_capacity = table->capacity();\n\n    CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));\n    CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));\n\n    K* d_keys;\n    V* d_vectors;\n    bool* d_found;\n\n    CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));\n    CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n\n    cudaStream_t stream;\n    CUDA_CHECK(cudaStreamCreate(&stream));\n\n    while (table->capacity() * 2 < MAX_CAPACITY) {\n      test_util::create_random_keys<K, S, V, DIM>(h_keys, nullptr, h_vectors,\n                                                 
 KEY_NUM);\n      CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),\n                            cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,\n                            KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n\n      table->insert_or_assign(KEY_NUM, d_keys, d_vectors, nullptr, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n      table->find(KEY_NUM, d_keys, d_vectors, d_found, nullptr, stream);\n\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      int found_num = 0;\n\n      CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));\n      CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n      CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),\n                            cudaMemcpyDeviceToHost));\n      CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                            cudaMemcpyDeviceToHost));\n\n      CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                            KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDeviceToHost));\n      for (int i = 0; i < KEY_NUM; i++) {\n        if (h_found[i]) {\n          found_num++;\n          for (int j = 0; j < options.dim; j++) {\n            ASSERT_EQ(h_vectors[i * options.dim + j],\n                      static_cast<float>(h_keys[i] * 0.00001));\n          }\n        }\n      }\n      ASSERT_EQ(found_num, KEY_NUM);\n\n      CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n      table->contains(KEY_NUM, d_keys, d_found, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      int contains_num = 0;\n      CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                            cudaMemcpyDeviceToHost));\n      for (int i = 0; i < 
KEY_NUM; i++) {\n        if (h_found[i]) contains_num++;\n      }\n      ASSERT_EQ(contains_num, found_num);\n\n      if (task_n == 0 && current_capacity != table->capacity()) {\n        std::cout << \"[test_dynamic_rehash_on_multi_threads] The capacity \"\n                     \"changed from \"\n                  << current_capacity << \" to \" << table->capacity()\n                  << std::endl;\n        current_capacity = table->capacity();\n      }\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n    }\n    CUDA_CHECK(cudaStreamDestroy(stream));\n\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n\n    CUDA_CHECK(cudaFreeHost(h_keys));\n    CUDA_CHECK(cudaFreeHost(h_found));\n    CUDA_CHECK(cudaFreeHost(h_vectors));\n\n    CUDA_CHECK(cudaFree(d_keys));\n    CUDA_CHECK(cudaFree(d_vectors));\n    CUDA_CHECK(cudaFree(d_found));\n    CUDA_CHECK(cudaDeviceSynchronize());\n\n    CudaCheckError();\n  };\n\n  for (int i = 0; i < THREAD_N; ++i)\n    threads.emplace_back(std::thread(worker_function, i));\n\n  for (auto& th : threads) {\n    th.join();\n  }\n  ASSERT_GE(table->capacity() * 2, MAX_CAPACITY);\n}\n\nvoid test_export_batch_if(size_t max_hbm_for_vectors) {\n  constexpr uint64_t INIT_CAPACITY = 256UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t KEY_NUM = 128UL;\n  constexpr uint64_t TEST_TIMES = 1;\n  constexpr uint64_t BUCKET_MAX_SIZE = 128ul;\n\n  K* h_keys;\n  S* h_scores;\n  V* h_vectors;\n  size_t h_dump_counter = 0;\n\n  TableOptions options;\n\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.reserved_key_start_bit = 9;\n  options.num_of_buckets_per_alloc = 2;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;\n\n  std::unique_ptr<Table> table = 
std::make_unique<Table>();\n  table->init(options);\n\n  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));\n\n  K* d_keys;\n  S* d_scores = nullptr;\n  V* d_vectors;\n  bool* d_found;\n  size_t* d_dump_counter;\n  int found_num = 0;\n  bool* h_found;\n\n  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n  CUDA_CHECK(cudaMalloc(&d_dump_counter, sizeof(size_t)));\n\n  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  S threshold = test_util::host_nano<S>();\n  uint64_t total_size = 0;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    test_util::create_random_keys<K, S, V, DIM>(h_keys, h_scores, h_vectors,\n                                                KEY_NUM);\n\n    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyHostToDevice));\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    table->insert_or_assign(KEY_NUM, d_keys, d_vectors, nullptr, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, KEY_NUM);\n\n    CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));\n    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * 
sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),\n                          cudaMemcpyDeviceToHost));\n\n    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    table->find(KEY_NUM, d_keys, d_vectors, d_found, nullptr, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n    found_num = 0;\n    for (int i = 0; i < BUCKET_MAX_SIZE; i++) {\n      if (h_found[i]) {\n        found_num++;\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors[i * options.dim + j],\n                    static_cast<float>(h_keys[i] * 0.00001));\n        }\n      }\n    }\n    ASSERT_EQ(found_num, KEY_NUM);\n\n    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n    table->contains(KEY_NUM, d_keys, d_found, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    int contains_num = 0;\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n    for (int i = 0; i < BUCKET_MAX_SIZE; i++) {\n      if (h_found[i]) contains_num++;\n    }\n    ASSERT_EQ(contains_num, found_num);\n\n    K pattern = 100;\n\n    table->template export_batch_if<ExportIfPredFunctor>(\n        pattern, threshold, table->capacity(), 0, d_dump_counter, d_keys,\n        d_vectors, d_scores, stream);\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    CUDA_CHECK(cudaMemcpy(&h_dump_counter, d_dump_counter, sizeof(size_t),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyDeviceToHost));\n\n    size_t expected_export_count = 0;\n    for (int i = 0; i < 
KEY_NUM; i++) {\n      if (h_scores[i] > threshold) expected_export_count++;\n    }\n    ASSERT_EQ(expected_export_count, h_dump_counter);\n\n    threshold = test_util::host_nano<S>();\n    table->template export_batch_if<ExportIfPredFunctor>(\n        pattern, threshold, table->capacity(), 0, d_dump_counter, d_keys,\n        d_vectors, d_scores, stream);\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    CUDA_CHECK(cudaMemcpy(&h_dump_counter, d_dump_counter, sizeof(size_t),\n                          cudaMemcpyDeviceToHost));\n\n    ASSERT_EQ(0, h_dump_counter);\n\n    CUDA_CHECK(cudaMemset(h_keys, 0, KEY_NUM * sizeof(K)));\n    CUDA_CHECK(cudaMemset(h_scores, 0, KEY_NUM * sizeof(S)));\n    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n\n    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n\n    for (int i = 0; i < h_dump_counter; i++) {\n      ASSERT_GT(h_scores[i], threshold);\n      for (int j = 0; j < options.dim; j++) {\n        ASSERT_EQ(h_vectors[i * options.dim + j],\n                  static_cast<float>(h_keys[i] * 0.00001));\n      }\n    }\n\n    table->clear(stream);\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n  }\n  CUDA_CHECK(cudaDeviceSynchronize());\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,\n                        cudaMemcpyDeviceToHost));\n\n  CUDA_CHECK(cudaFreeHost(h_keys));\n  CUDA_CHECK(cudaFreeHost(h_scores));\n  CUDA_CHECK(cudaFreeHost(h_vectors));\n  CUDA_CHECK(cudaFreeHost(h_found));\n\n  
CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_vectors));\n  CUDA_CHECK(cudaFree(d_found));\n  CUDA_CHECK(cudaFree(d_dump_counter));\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_basic_for_cpu_io() {\n  constexpr uint64_t INIT_CAPACITY = 64 * 1024 * 1024UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t KEY_NUM = 1 * 1024 * 1024UL;\n  constexpr uint64_t TEST_TIMES = 1;\n\n  K* h_keys;\n  S* h_scores;\n  V* h_vectors;\n  bool* h_found;\n\n  TableOptions options;\n\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.reserved_key_start_bit = 10;\n  options.max_hbm_for_vectors = nv::merlin::GB(0);\n  options.io_by_cpu = true;\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));\n\n  CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n\n  test_util::create_random_keys<K, S, V, DIM>(h_keys, h_scores, nullptr,\n                                              KEY_NUM);\n\n  K* d_keys;\n  S* d_scores = nullptr;\n  V* d_vectors;\n  V* d_def_val;\n  V** d_vectors_ptr;\n  bool* d_found;\n  size_t dump_counter = 0;\n\n  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_def_val, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));\n  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n\n  CUDA_CHECK(\n      cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n  
CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),\n                        cudaMemcpyHostToDevice));\n\n  CUDA_CHECK(cudaMemset(d_vectors, 1, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMemset(d_def_val, 2, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMemset(d_vectors_ptr, 0, KEY_NUM * sizeof(V*)));\n  CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  uint64_t total_size = 0;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    table->insert_or_assign(KEY_NUM, d_keys, d_vectors, d_scores, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, KEY_NUM);\n\n    CUDA_CHECK(cudaMemset(d_vectors, 2, KEY_NUM * sizeof(V) * options.dim));\n    table->insert_or_assign(KEY_NUM, d_keys, d_vectors, d_scores, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, KEY_NUM);\n\n    table->find(KEY_NUM, d_keys, d_vectors, d_found, nullptr, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    int found_num = 0;\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n\n    for (int i = 0; i < KEY_NUM; i++) {\n      if (h_found[i]) found_num++;\n    }\n    ASSERT_EQ(found_num, KEY_NUM);\n\n    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n    table->contains(KEY_NUM, d_keys, d_found, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    int contains_num = 0;\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n              
            cudaMemcpyDeviceToHost));\n    for (int i = 0; i < KEY_NUM; i++) {\n      if (h_found[i]) contains_num++;\n    }\n    ASSERT_EQ(contains_num, found_num);\n\n    table->accum_or_assign(KEY_NUM, d_keys, d_vectors, d_found, d_scores,\n                           stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, KEY_NUM);\n\n    table->erase(KEY_NUM >> 1, d_keys, stream);\n    size_t total_size_after_erase = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size_after_erase, total_size >> 1);\n\n    table->clear(stream);\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    table->insert_or_assign(KEY_NUM, d_keys, d_vectors, d_scores, stream);\n\n    dump_counter = table->export_batch(table->capacity(), 0, d_keys, d_vectors,\n                                       d_scores, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(dump_counter, KEY_NUM);\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,\n                        cudaMemcpyDeviceToHost));\n\n  CUDA_CHECK(cudaFreeHost(h_keys));\n  CUDA_CHECK(cudaFreeHost(h_scores));\n  CUDA_CHECK(cudaFreeHost(h_vectors));\n  CUDA_CHECK(cudaFreeHost(h_found));\n\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_vectors));\n  CUDA_CHECK(cudaFree(d_def_val));\n  CUDA_CHECK(cudaFree(d_vectors_ptr));\n  CUDA_CHECK(cudaFree(d_found));\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_evict_strategy_lru_basic(size_t max_hbm_for_vectors) {\n  constexpr uint64_t BUCKET_NUM = 8UL;\n  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;\n  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;\n  constexpr uint64_t 
MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;\n  constexpr uint64_t TEST_KEY_NUM = 4;\n  constexpr uint64_t TEMP_KEY_NUM =\n      (BASE_KEY_NUM > TEST_KEY_NUM) ? BASE_KEY_NUM : TEST_KEY_NUM;\n  constexpr uint64_t TEST_TIMES = 128;\n\n  TableOptions options;\n\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.reserved_key_start_bit = 11;\n  options.num_of_buckets_per_alloc = 4;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;\n\n  std::vector<K> h_keys_base(BASE_KEY_NUM);\n  std::vector<S> h_scores_base(BASE_KEY_NUM);\n  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_test(TEST_KEY_NUM);\n  std::vector<S> h_scores_test(TEST_KEY_NUM);\n  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_temp(TEMP_KEY_NUM);\n  std::vector<S> h_scores_temp(TEMP_KEY_NUM);\n  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);\n\n  K* d_keys_temp;\n  S* d_scores_temp = nullptr;\n  V* d_vectors_temp;\n\n  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));\n  CUDA_CHECK(\n      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));\n\n  test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),\n      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF);\n\n  test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),\n      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,\n      0xFFFFFFFFFFFFFFFD);\n\n  h_keys_test[2] = h_keys_base[72];\n  h_keys_test[3] = h_keys_base[73];\n\n  for (int i = 0; i < options.dim; i++) {\n    h_vectors_test[2 * options.dim + i] = h_vectors_base[72 * 
options.dim + i];\n    h_vectors_test[3 * options.dim + i] = h_vectors_base[73 * options.dim + i];\n  }\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  size_t total_size = 0;\n  size_t dump_counter = 0;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      S start_ts = test_util::host_nano<S>(stream);\n      table->insert_or_assign(BASE_KEY_NUM, d_keys_temp, d_vectors_temp,\n                              nullptr, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      S end_ts = test_util::host_nano<S>(stream);\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            BASE_KEY_NUM * 
sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      std::vector<S> h_scores_temp_sorted(h_scores_temp);\n      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());\n\n      ASSERT_GE(h_scores_temp_sorted[0], start_ts);\n      ASSERT_LE(h_scores_temp_sorted[BASE_KEY_NUM - 1], end_ts);\n      for (int i = 0; i < dump_counter; i++) {\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),\n                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),\n                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),\n                            TEST_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      S start_ts = test_util::host_nano<S>(stream);\n      table->insert_or_assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp,\n                              nullptr, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      S end_ts = test_util::host_nano<S>(stream);\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      
CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            TEMP_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      std::array<S, TEMP_KEY_NUM> h_scores_temp_sorted;\n      int ctr = 0;\n      for (int i = 0; i < TEMP_KEY_NUM; i++) {\n        if (h_keys_test.end() !=\n            std::find(h_keys_test.begin(), h_keys_test.end(), h_keys_temp[i])) {\n          ASSERT_GT(h_scores_temp[i], BUCKET_MAX_SIZE);\n          h_scores_temp_sorted[ctr++] = h_scores_temp[i];\n        } else {\n          ASSERT_LE(h_scores_temp[i], start_ts);\n        }\n      }\n      std::sort(h_scores_temp_sorted.begin(),\n                h_scores_temp_sorted.begin() + ctr);\n\n      ASSERT_GE(h_scores_temp_sorted[0], start_ts);\n      ASSERT_LE(h_scores_temp_sorted[ctr - 1], end_ts);\n      for (int i = 0; i < dump_counter; i++) {\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFree(d_keys_temp));\n  CUDA_CHECK(cudaFree(d_scores_temp));\n  CUDA_CHECK(cudaFree(d_vectors_temp));\n\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_evict_strategy_lfu_basic(size_t max_hbm_for_vectors) {\n  constexpr uint64_t BUCKET_NUM = 8UL;\n  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;\n  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;\n  constexpr uint64_t TEST_KEY_NUM = 4;\n  constexpr uint64_t TEMP_KEY_NUM =\n      (BASE_KEY_NUM > TEST_KEY_NUM) ? 
BASE_KEY_NUM : TEST_KEY_NUM;\n  constexpr uint64_t TEST_TIMES = 1024;\n\n  TableOptions options;\n\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.reserved_key_start_bit = 12;\n  options.num_of_buckets_per_alloc = 1;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLfu>;\n\n  std::vector<K> h_keys_base(BASE_KEY_NUM);\n  std::vector<S> h_scores_base(BASE_KEY_NUM);\n  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_test(TEST_KEY_NUM);\n  std::vector<S> h_scores_test(TEST_KEY_NUM);\n  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_temp(TEMP_KEY_NUM);\n  std::vector<S> h_scores_temp(TEMP_KEY_NUM);\n  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);\n\n  K* d_keys_temp;\n  S* d_scores_temp = nullptr;\n  V* d_vectors_temp;\n\n  int freq_range = 1000;\n\n  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));\n  CUDA_CHECK(\n      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));\n\n  for (int i = 0; i < TEST_TIMES; i++) {\n    test_util::create_keys_in_one_buckets_lfu<K, S, V, DIM>(\n        h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),\n        BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF,\n        freq_range);\n\n    test_util::create_keys_in_one_buckets_lfu<K, S, V, DIM>(\n        h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),\n        TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,\n        0xFFFFFFFFFFFFFFFD, freq_range);\n\n    h_keys_test[2] = h_keys_base[72];\n    h_keys_test[3] = h_keys_base[73];\n\n    h_scores_test[2] = h_keys_base[72] % freq_range;\n    h_scores_test[3] = h_keys_base[73] % freq_range;\n\n    for (int i = 0; i < options.dim; i++) {\n      
h_vectors_test[2 * options.dim + i] =\n          h_vectors_base[72 * options.dim + i];\n      h_vectors_test[3 * options.dim + i] =\n          h_vectors_base[73 * options.dim + i];\n    }\n    cudaStream_t stream;\n    CUDA_CHECK(cudaStreamCreate(&stream));\n\n    size_t total_size = 0;\n    size_t dump_counter = 0;\n    S global_epoch = 1;\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      table->set_global_epoch(global_epoch);\n      table->insert_or_assign(BASE_KEY_NUM, d_keys_temp, d_vectors_temp,\n                              d_scores_temp, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                       
     BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      for (int i = 0; i < dump_counter; i++) {\n        ASSERT_EQ(h_scores_temp[i], h_keys_temp[i] % freq_range);\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n\n    {\n      global_epoch++;\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),\n                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),\n                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),\n                            TEST_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      table->set_global_epoch(global_epoch);\n      table->insert_or_assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp,\n                              d_scores_temp, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            TEMP_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      for (int i 
= 0; i < dump_counter; i++) {\n        bool in_base =\n            h_keys_base.end() !=\n            std::find(h_keys_base.begin(), h_keys_base.end(), h_keys_temp[i]);\n        bool in_test =\n            h_keys_test.end() !=\n            std::find(h_keys_test.begin(), h_keys_test.end(), h_keys_temp[i]);\n        if (in_base && in_test) {\n          ASSERT_EQ(h_scores_temp[i], (h_keys_temp[i] % freq_range) * 2);\n        } else {\n          ASSERT_EQ(h_scores_temp[i], (h_keys_temp[i] % freq_range));\n        }\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n    CUDA_CHECK(cudaStreamDestroy(stream));\n  }\n\n  CUDA_CHECK(cudaFree(d_keys_temp));\n  CUDA_CHECK(cudaFree(d_scores_temp));\n  CUDA_CHECK(cudaFree(d_vectors_temp));\n\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_evict_strategy_epochlru_basic(size_t max_hbm_for_vectors) {\n  constexpr int RSHIFT_ON_NANO = 20;\n\n  constexpr uint64_t BUCKET_NUM = 8UL;\n  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;\n  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;\n  constexpr uint64_t TEST_KEY_NUM = 4;\n  constexpr uint64_t TEMP_KEY_NUM =\n      (BASE_KEY_NUM > TEST_KEY_NUM) ? 
BASE_KEY_NUM : TEST_KEY_NUM;\n  constexpr uint64_t TEST_TIMES = 128;\n\n  TableOptions options;\n\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.reserved_key_start_bit = 13;\n  options.num_of_buckets_per_alloc = 8;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kEpochLru>;\n\n  std::vector<K> h_keys_base(BASE_KEY_NUM);\n  std::vector<S> h_scores_base(BASE_KEY_NUM);\n  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_test(TEST_KEY_NUM);\n  std::vector<S> h_scores_test(TEST_KEY_NUM);\n  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_temp(TEMP_KEY_NUM);\n  std::vector<S> h_scores_temp(TEMP_KEY_NUM);\n  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);\n\n  K* d_keys_temp;\n  S* d_scores_temp = nullptr;\n  V* d_vectors_temp;\n\n  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));\n  CUDA_CHECK(\n      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));\n\n  test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),\n      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF);\n\n  test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),\n      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,\n      0xFFFFFFFFFFFFFFFD);\n\n  h_keys_test[2] = h_keys_base[72];\n  h_keys_test[3] = h_keys_base[73];\n\n  for (int i = 0; i < options.dim; i++) {\n    h_vectors_test[2 * options.dim + i] = h_vectors_base[72 * options.dim + i];\n    h_vectors_test[3 * options.dim + i] = h_vectors_base[73 * options.dim + i];\n  }\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  size_t total_size = 
0;\n  size_t dump_counter = 0;\n  S global_epoch = 1;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      S start_ts =\n          (test_util::host_nano<S>(stream) >> RSHIFT_ON_NANO) & 0xFFFFFFFF;\n      table->set_global_epoch(global_epoch);\n      table->insert_or_assign(BASE_KEY_NUM, d_keys_temp, d_vectors_temp,\n                              nullptr, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      S end_ts =\n          (test_util::host_nano<S>(stream) >> RSHIFT_ON_NANO) & 0xFFFFFFFF;\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n               
             cudaMemcpyDefault));\n\n      std::vector<S> h_scores_temp_sorted(h_scores_temp);\n      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());\n\n      ASSERT_GE(h_scores_temp_sorted[0], (global_epoch << 32 | start_ts));\n      ASSERT_LE(h_scores_temp_sorted[BASE_KEY_NUM - 1],\n                (global_epoch << 32 | end_ts));\n      for (int i = 0; i < dump_counter; i++) {\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n\n    {\n      global_epoch++;\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),\n                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),\n                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),\n                            TEST_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      S start_ts =\n          (test_util::host_nano<S>(stream) >> RSHIFT_ON_NANO) & 0xFFFFFFFF;\n      table->set_global_epoch(global_epoch);\n      table->insert_or_assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp,\n                              nullptr, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      S end_ts =\n          (test_util::host_nano<S>(stream) >> RSHIFT_ON_NANO) & 0xFFFFFFFF;\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            TEMP_KEY_NUM * 
sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            TEMP_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      std::vector<S> h_scores_temp_sorted;\n      h_scores_temp_sorted.reserve(TEMP_KEY_NUM);\n      for (int i = 0; i < TEMP_KEY_NUM; i++) {\n        if (h_keys_test.end() !=\n            std::find(h_keys_test.begin(), h_keys_test.end(), h_keys_temp[i])) {\n          ASSERT_GE(h_scores_temp[i], (global_epoch << 32 | start_ts));\n          h_scores_temp_sorted.push_back(h_scores_temp[i]);\n        } else {\n          ASSERT_LE(h_scores_temp[i], (global_epoch << 32 | start_ts));\n        }\n      }\n      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());\n\n      if (!h_scores_temp_sorted.empty()) {\n        ASSERT_GE(h_scores_temp_sorted[0], (global_epoch << 32 | start_ts));\n        ASSERT_LE(h_scores_temp_sorted[h_scores_temp_sorted.size() - 1],\n                  (global_epoch << 32 | end_ts));\n      }\n      for (int i = 0; i < dump_counter; i++) {\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFree(d_keys_temp));\n  CUDA_CHECK(cudaFree(d_scores_temp));\n  CUDA_CHECK(cudaFree(d_vectors_temp));\n\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors) {\n  constexpr uint64_t BUCKET_NUM = 8UL;\n  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;\n  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr 
uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;\n  constexpr uint64_t TEST_KEY_NUM = 4;\n  constexpr uint64_t TEMP_KEY_NUM =\n      (BASE_KEY_NUM > TEST_KEY_NUM) ? BASE_KEY_NUM : TEST_KEY_NUM;\n  constexpr uint64_t TEST_TIMES = 128;\n\n  TableOptions options;\n\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.reserved_key_start_bit = 14;\n  options.num_of_buckets_per_alloc = 8;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kEpochLfu>;\n\n  std::vector<K> h_keys_base(BASE_KEY_NUM);\n  std::vector<S> h_scores_base(BASE_KEY_NUM);\n  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_test(TEST_KEY_NUM);\n  std::vector<S> h_scores_test(TEST_KEY_NUM);\n  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_temp(TEMP_KEY_NUM);\n  std::vector<S> h_scores_temp(TEMP_KEY_NUM);\n  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);\n\n  K* d_keys_temp;\n  S* d_scores_temp = nullptr;\n  V* d_vectors_temp;\n\n  int freq_range = 1000;\n\n  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));\n  CUDA_CHECK(\n      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));\n\n  test_util::create_keys_in_one_buckets_lfu<K, S, V, DIM>(\n      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),\n      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF,\n      freq_range);\n\n  test_util::create_keys_in_one_buckets_lfu<K, S, V, DIM>(\n      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),\n      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,\n      0xFFFFFFFFFFFFFFFD, freq_range);\n\n  // Simulate overflow of low 32bits.\n  h_scores_base[71] = static_cast<S>(std::numeric_limits<uint32_t>::max() -\n                                     
static_cast<uint32_t>(1));\n\n  h_keys_test[1] = h_keys_base[71];\n  h_keys_test[2] = h_keys_base[72];\n  h_keys_test[3] = h_keys_base[73];\n\n  h_scores_test[1] = h_scores_base[71];\n  h_scores_test[2] = h_keys_base[72] % freq_range;\n  h_scores_test[3] = h_keys_base[73] % freq_range;\n\n  for (int i = 0; i < options.dim; i++) {\n    h_vectors_test[1 * options.dim + i] = h_vectors_base[71 * options.dim + i];\n    h_vectors_test[2 * options.dim + i] = h_vectors_base[72 * options.dim + i];\n    h_vectors_test[3 * options.dim + i] = h_vectors_base[73 * options.dim + i];\n  }\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  size_t total_size = 0;\n  size_t dump_counter = 0;\n  S global_epoch = 1;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      table->set_global_epoch(global_epoch);\n      table->insert_or_assign(BASE_KEY_NUM, d_keys_temp, d_vectors_temp,\n                              d_scores_temp, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, 
stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      for (int i = 0; i < dump_counter; i++) {\n        if (h_keys_temp[i] == h_keys_base[71]) {\n          S expected_score = test_util::make_expected_score_for_epochlfu<S>(\n              global_epoch, h_scores_base[71]);\n          ASSERT_EQ(h_scores_temp[i], expected_score);\n        } else {\n          S expected_score = test_util::make_expected_score_for_epochlfu<S>(\n              global_epoch, (h_keys_temp[i] % freq_range));\n          ASSERT_EQ(h_scores_temp[i], expected_score);\n        }\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n\n    {\n      global_epoch++;\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),\n                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),\n                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),\n                            TEST_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      table->set_global_epoch(global_epoch);\n      table->insert_or_assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp,\n                              d_scores_temp, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t 
total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            TEMP_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      ASSERT_TRUE(h_keys_temp.end() != std::find(h_keys_temp.begin(),\n                                                 h_keys_temp.end(),\n                                                 h_keys_base[71]));\n\n      for (int i = 0; i < dump_counter; i++) {\n        bool in_base =\n            h_keys_base.end() !=\n            std::find(h_keys_base.begin(), h_keys_base.end(), h_keys_temp[i]);\n        bool in_test =\n            h_keys_test.end() !=\n            std::find(h_keys_test.begin(), h_keys_test.end(), h_keys_temp[i]);\n\n        if (in_base && in_test) {\n          if (h_keys_temp[i] == h_keys_base[71]) {\n            S expected_score = test_util::make_expected_score_for_epochlfu<S>(\n                global_epoch, h_scores_base[71] * 2);\n            ASSERT_EQ(h_scores_temp[i], expected_score);\n          } else {\n            S expected_score = test_util::make_expected_score_for_epochlfu<S>(\n                global_epoch, (h_keys_temp[i] % freq_range) * 2);\n            ASSERT_EQ(h_scores_temp[i], expected_score);\n          }\n        } else {\n          if (h_keys_temp[i] == h_keys_base[71]) {\n            S expected_score = 
test_util::make_expected_score_for_epochlfu<S>(\n                global_epoch - static_cast<S>(in_base), h_scores_base[71]);\n            ASSERT_EQ(h_scores_temp[i], expected_score);\n          } else {\n            S expected_score = test_util::make_expected_score_for_epochlfu<S>(\n                global_epoch - static_cast<S>(in_base),\n                (h_keys_temp[i] % freq_range));\n\n            ASSERT_EQ(h_scores_temp[i], expected_score);\n          }\n        }\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFree(d_keys_temp));\n  CUDA_CHECK(cudaFree(d_scores_temp));\n  CUDA_CHECK(cudaFree(d_vectors_temp));\n\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_evict_strategy_customized_basic(size_t max_hbm_for_vectors) {\n  constexpr uint64_t BUCKET_NUM = 8UL;\n  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;\n  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;\n  constexpr uint64_t TEST_KEY_NUM = 128;\n  constexpr uint64_t TEMP_KEY_NUM =\n      (BASE_KEY_NUM > TEST_KEY_NUM) ? 
BASE_KEY_NUM : TEST_KEY_NUM;\n  constexpr uint64_t TEST_TIMES = 128;\n\n  TableOptions options;\n\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.reserved_key_start_bit = 15;\n  options.num_of_buckets_per_alloc = 8;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  std::vector<K> h_keys_base(BASE_KEY_NUM);\n  std::vector<S> h_scores_base(BASE_KEY_NUM);\n  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_test(TEST_KEY_NUM);\n  std::vector<S> h_scores_test(TEST_KEY_NUM);\n  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_temp(TEMP_KEY_NUM);\n  std::vector<S> h_scores_temp(TEMP_KEY_NUM);\n  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);\n\n  K* d_keys_temp;\n  S* d_scores_temp = nullptr;\n  V* d_vectors_temp;\n\n  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));\n  CUDA_CHECK(\n      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));\n\n  test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),\n      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF);\n\n  const S base_score_start = 1000;\n  for (int i = 0; i < BASE_KEY_NUM; i++) {\n    h_scores_base[i] = base_score_start + i;\n  }\n\n  test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),\n      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,\n      0xFFFFFFFFFFFFFFFD);\n  const S test_score_start = base_score_start + BASE_KEY_NUM;\n  for (int i = 0; i < TEST_KEY_NUM; i++) {\n    h_scores_test[i] = test_score_start + i;\n  }\n  for (int i = 64; i < TEST_KEY_NUM; i++) {\n    h_keys_test[i] = h_keys_base[i];\n   
 for (int j = 0; j < options.dim; j++) {\n      h_vectors_test[i * options.dim + j] = h_vectors_base[i * options.dim + j];\n    }\n  }\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  size_t total_size = 0;\n  size_t dump_counter = 0;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      table->insert_or_assign(BASE_KEY_NUM, d_keys_temp, d_vectors_temp,\n                              d_scores_temp, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            
cudaMemcpyDefault));\n\n      std::vector<S> h_scores_temp_sorted(h_scores_temp);\n      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());\n\n      auto expected_range = test_util::range<S, TEMP_KEY_NUM>(base_score_start);\n      ASSERT_TRUE(std::equal(h_scores_temp_sorted.begin(),\n                             h_scores_temp_sorted.end(),\n                             expected_range.begin()));\n      for (int i = 0; i < dump_counter; i++) {\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),\n                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),\n                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),\n                            TEST_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      table->insert_or_assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp,\n                              d_scores_temp, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      
CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            TEMP_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      std::vector<S> h_scores_temp_sorted(h_scores_temp);\n      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());\n\n      auto expected_range_test =\n          test_util::range<S, TEST_KEY_NUM>(test_score_start);\n      ASSERT_TRUE(std::equal(h_scores_temp_sorted.begin(),\n                             h_scores_temp_sorted.end(),\n                             expected_range_test.begin()));\n      for (int i = 0; i < dump_counter; i++) {\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFree(d_keys_temp));\n  CUDA_CHECK(cudaFree(d_scores_temp));\n  CUDA_CHECK(cudaFree(d_vectors_temp));\n\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_evict_strategy_customized_advanced(size_t max_hbm_for_vectors) {\n  constexpr uint64_t BUCKET_NUM = 8UL;\n  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;\n  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;\n  constexpr uint64_t TEST_KEY_NUM = 8;\n  constexpr uint64_t TEMP_KEY_NUM =\n      (BASE_KEY_NUM > TEST_KEY_NUM) ? 
BASE_KEY_NUM : TEST_KEY_NUM;\n  constexpr uint64_t TEST_TIMES = 256;\n\n  TableOptions options;\n\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.reserved_key_start_bit = 16;\n  options.num_of_buckets_per_alloc = 8;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  std::vector<K> h_keys_base(BASE_KEY_NUM);\n  std::vector<S> h_scores_base(BASE_KEY_NUM);\n  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_test(TEST_KEY_NUM);\n  std::vector<S> h_scores_test(TEST_KEY_NUM);\n  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);\n\n  std::vector<K> h_keys_temp(TEMP_KEY_NUM);\n  std::vector<S> h_scores_temp(TEMP_KEY_NUM);\n  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);\n\n  K* d_keys_temp;\n  S* d_scores_temp = nullptr;\n  V* d_vectors_temp;\n\n  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));\n  CUDA_CHECK(\n      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));\n\n  test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),\n      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF);\n\n  const S base_score_start = 1000;\n  for (int i = 0; i < BASE_KEY_NUM; i++) {\n    h_scores_base[i] = base_score_start + i;\n  }\n\n  test_util::create_keys_in_one_buckets<K, S, V, DIM>(\n      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),\n      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,\n      0xFFFFFFFFFFFFFFFD);\n\n  h_keys_test[4] = h_keys_base[72];\n  h_keys_test[5] = h_keys_base[73];\n  h_keys_test[6] = h_keys_base[74];\n  h_keys_test[7] = h_keys_base[75];\n\n  // replace four new keys to lower scores, would not be inserted.\n  h_scores_test[0] = 
20;\n  h_scores_test[1] = 78;\n  h_scores_test[2] = 97;\n  h_scores_test[3] = 98;\n\n  // replace three exist keys to new scores, just refresh the score for them.\n  h_scores_test[4] = 99;\n  h_scores_test[5] = 1010;\n  h_scores_test[6] = 1020;\n  h_scores_test[7] = 1035;\n\n  for (int i = 4; i < TEST_KEY_NUM; i++) {\n    for (int j = 0; j < options.dim; j++) {\n      h_vectors_test[i * options.dim + j] =\n          static_cast<V>(h_keys_test[i] * 0.00001);\n    }\n  }\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  size_t total_size = 0;\n  size_t dump_counter = 0;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n    ASSERT_EQ(table->bucket_count(), BUCKET_NUM);\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),\n                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      table->insert_or_assign(BASE_KEY_NUM, d_keys_temp, d_vectors_temp,\n                              d_scores_temp, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n       
                     BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            BASE_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      std::vector<S> h_scores_temp_sorted(h_scores_temp);\n      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());\n\n      auto expected_range = test_util::range<S, TEMP_KEY_NUM>(base_score_start);\n      ASSERT_TRUE(std::equal(h_scores_temp_sorted.begin(),\n                             h_scores_temp_sorted.end(),\n                             expected_range.begin()));\n      for (int i = 0; i < dump_counter; i++) {\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n\n    {\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),\n                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),\n                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),\n                            TEST_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      table->insert_or_assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp,\n                              d_scores_temp, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);\n\n      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,\n                                         d_vectors_temp, 
d_scores_temp, stream);\n      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,\n                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,\n                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,\n                            TEMP_KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      for (int i = 0; i < TEST_KEY_NUM; i++) {\n        if (i < 4) {\n          ASSERT_EQ(h_keys_temp.end(),\n                    std::find(h_keys_temp.begin(), h_keys_temp.end(),\n                              h_keys_test[i]));\n        } else {\n          ASSERT_NE(h_keys_temp.end(),\n                    std::find(h_keys_temp.begin(), h_keys_temp.end(),\n                              h_keys_test[i]));\n        }\n      }\n      for (int i = 0; i < TEMP_KEY_NUM; i++) {\n        if (h_keys_temp[i] == h_keys_test[4])\n          ASSERT_EQ(h_scores_temp[i], h_scores_test[4]);\n        if (h_keys_temp[i] == h_keys_test[5])\n          ASSERT_EQ(h_scores_temp[i], h_scores_test[5]);\n        if (h_keys_temp[i] == h_keys_test[6])\n          ASSERT_EQ(h_scores_temp[i], h_scores_test[6]);\n        if (h_keys_temp[i] == h_keys_test[7])\n          ASSERT_EQ(h_scores_temp[i], h_scores_test[7]);\n\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n    }\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFree(d_keys_temp));\n  CUDA_CHECK(cudaFree(d_scores_temp));\n  CUDA_CHECK(cudaFree(d_vectors_temp));\n\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_evict_strategy_customized_correct_rate(size_t max_hbm_for_vectors) {\n  constexpr uint64_t BATCH_SIZE = 
1024 * 1024ul;\n  constexpr uint64_t STEPS = 128;\n  constexpr uint64_t MAX_BUCKET_SIZE = 128;\n  constexpr uint64_t INIT_CAPACITY = BATCH_SIZE * STEPS;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t TEST_TIMES = 1;\n  float expected_correct_rate = 0.964;\n  const int rounds = 12;\n\n  TableOptions options;\n\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.reserved_key_start_bit = 17;\n  options.num_of_buckets_per_alloc = 128;\n  options.max_bucket_size = MAX_BUCKET_SIZE;\n  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  K* h_keys_base = test_util::HostBuffer<K>(BATCH_SIZE).ptr();\n  S* h_scores_base = test_util::HostBuffer<S>(BATCH_SIZE).ptr();\n  V* h_vectors_base = test_util::HostBuffer<V>(BATCH_SIZE * options.dim).ptr();\n\n  K* h_keys_temp = test_util::HostBuffer<K>(MAX_CAPACITY).ptr();\n  S* h_scores_temp = test_util::HostBuffer<S>(MAX_CAPACITY).ptr();\n  V* h_vectors_temp =\n      test_util::HostBuffer<V>(MAX_CAPACITY * options.dim).ptr();\n\n  K* d_keys_temp;\n  S* d_scores_temp = nullptr;\n  V* d_vectors_temp;\n\n  CUDA_CHECK(cudaMalloc(&d_keys_temp, MAX_CAPACITY * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores_temp, MAX_CAPACITY * sizeof(S)));\n  CUDA_CHECK(\n      cudaMalloc(&d_vectors_temp, MAX_CAPACITY * sizeof(V) * options.dim));\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  size_t total_size = 0;\n  size_t global_start_key = 100000;\n  for (int i = 0; i < TEST_TIMES; i++) {\n    std::unique_ptr<Table> table = std::make_unique<Table>();\n    table->init(options);\n    size_t start_key = global_start_key;\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_EQ(total_size, 0);\n\n    for (int r = 0; r < rounds; r++) {\n      size_t expected_min_key = global_start_key + INIT_CAPACITY * 
r;\n      size_t expected_max_key = global_start_key + INIT_CAPACITY * (r + 1) - 1;\n      size_t expected_table_size =\n          (r == 0) ? size_t(expected_correct_rate * INIT_CAPACITY)\n                   : INIT_CAPACITY;\n\n      for (int s = 0; s < STEPS; s++) {\n        test_util::create_continuous_keys<K, S, V, DIM>(\n            h_keys_base, h_scores_base, h_vectors_base, BATCH_SIZE, start_key);\n        start_key += BATCH_SIZE;\n\n        CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base, BATCH_SIZE * sizeof(K),\n                              cudaMemcpyHostToDevice));\n        CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base,\n                              BATCH_SIZE * sizeof(S), cudaMemcpyHostToDevice));\n        CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base,\n                              BATCH_SIZE * sizeof(V) * options.dim,\n                              cudaMemcpyHostToDevice));\n        table->insert_or_assign(BATCH_SIZE, d_keys_temp, d_vectors_temp,\n                                d_scores_temp, stream);\n        CUDA_CHECK(cudaStreamSynchronize(stream));\n      }\n\n      size_t total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_GE(total_size, expected_table_size);\n      ASSERT_EQ(MAX_CAPACITY, table->capacity());\n\n      size_t dump_counter = table->export_batch(\n          MAX_CAPACITY, 0, d_keys_temp, d_vectors_temp, d_scores_temp, stream);\n\n      CUDA_CHECK(cudaMemcpy(h_keys_temp, d_keys_temp, MAX_CAPACITY * sizeof(K),\n                            cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_scores_temp, d_scores_temp,\n                            MAX_CAPACITY * sizeof(S), cudaMemcpyDefault));\n      CUDA_CHECK(cudaMemcpy(h_vectors_temp, d_vectors_temp,\n                            MAX_CAPACITY * sizeof(V) * options.dim,\n                            cudaMemcpyDefault));\n\n      ASSERT_EQ(total_size, dump_counter);\n      size_t bigger_score_counter = 0;\n      K max_key = 0;\n\n  
    for (int i = 0; i < dump_counter; i++) {\n        ASSERT_EQ(h_keys_temp[i], h_scores_temp[i]);\n        max_key = std::max(max_key, h_keys_temp[i]);\n        if (h_scores_temp[i] >= expected_min_key) bigger_score_counter++;\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors_temp[i * options.dim + j],\n                    static_cast<float>(h_keys_temp[i] * 0.00001));\n        }\n      }\n\n      float correct_rate = (bigger_score_counter * 1.0) / MAX_CAPACITY;\n      std::cout << std::setprecision(3) << \"[Round \" << r << \"]\"\n                << \"correct_rate=\" << correct_rate << std::endl;\n      ASSERT_GE(max_key, expected_max_key);\n      ASSERT_GE(correct_rate, expected_correct_rate);\n    }\n  }\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFree(d_keys_temp));\n  CUDA_CHECK(cudaFree(d_scores_temp));\n  CUDA_CHECK(cudaFree(d_vectors_temp));\n\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nvoid test_insert_or_assign_multi_threads(size_t max_hbm_for_vectors,\n                                         const float BATCH_0_RATIO,\n                                         const float BATCH_1_RATIO,\n                                         bool capacity_silent = true) {\n  const uint64_t THREAD_N = 64UL;\n  const uint64_t BATCH_0_SIZE = static_cast<uint64_t>(THREAD_N * BATCH_0_RATIO);\n  const uint64_t BATCH_1_SIZE = static_cast<uint64_t>(THREAD_N * BATCH_1_RATIO);\n  const uint64_t BATCH_2_SIZE = THREAD_N - BATCH_0_SIZE - BATCH_1_SIZE;\n\n  const uint64_t INIT_CAPACITY = 32 * 1024 * 1024UL;\n  const uint64_t MAX_CAPACITY = 128 * 1024 * 1024UL;\n  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;\n  constexpr uint64_t KEY_NUM = 1 * 1024 * 1024UL;\n\n  std::vector<std::thread> threads;\n\n  TableOptions options;\n\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_load_factor = 0.50f;\n  options.max_bucket_size = BUCKET_MAX_SIZE;\n  
options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;\n\n  std::shared_ptr<Table> table = std::make_shared<Table>();\n  table->init(options);\n  // assume every key is different\n  auto worker1 = [&table, KEY_NUM, options, capacity_silent](int batch,\n                                                             int task_n) {\n    K* h_keys;\n    V* h_vectors;\n    bool* h_found;\n\n    size_t current_capacity = table->capacity();\n\n    CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));\n    CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));\n\n    K* d_keys;\n    V* d_vectors;\n    bool* d_found;\n\n    CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));\n    CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n\n    cudaStream_t stream;\n    CUDA_CHECK(cudaStreamCreate(&stream));\n\n    test_util::create_random_keys<K, S, V, DIM>(h_keys, nullptr, h_vectors,\n                                                KEY_NUM);\n    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n\n    table->find(KEY_NUM, d_keys, d_vectors, d_found, nullptr, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    {\n      int found_num = 0;\n      CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));\n      CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                            cudaMemcpyDeviceToHost));\n      for (int i = 0; i < KEY_NUM; i++) {\n        if (h_found[i]) {\n          found_num++;\n        }\n    
  }\n      ASSERT_EQ(found_num, 0);\n    }\n\n    table->insert_or_assign(KEY_NUM, d_keys, d_vectors, nullptr, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    table->find(KEY_NUM, d_keys, d_vectors, d_found, nullptr, stream);\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    int found_num = 0;\n\n    CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));\n    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n    thread_local bool print_unequal{false};\n    thread_local uint64_t err_times{0};\n    for (int i = 0; i < KEY_NUM; i++) {\n      if (h_found[i]) {\n        found_num++;\n        for (int j = 0; j < options.dim; j++) {\n          if (batch == 2) {\n            if (h_vectors[i * options.dim + j] !=\n                static_cast<float>(h_keys[i] * 0.00001)) {\n              if (!print_unequal) {\n                std::cout << \" [Thread \" << task_n << \"]\\t\";\n                UNEQUAL_EXPR(h_vectors[i * options.dim + j],\n                             static_cast<float>(h_keys[i] * 0.00001));\n                print_unequal = true;\n              }\n              err_times += 1;\n            }\n          } else {\n            ASSERT_EQ(h_vectors[i * options.dim + j],\n                      static_cast<float>(h_keys[i] * 0.00001));\n          }\n        }\n      }\n    }\n\n    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n    table->contains(KEY_NUM, d_keys, d_found, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n 
   int contains_num = 0;\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n    for (int i = 0; i < KEY_NUM; i++) {\n      if (h_found[i]) contains_num++;\n    }\n    ASSERT_EQ(contains_num, found_num);\n\n    bool print_thread_id{false};\n    if (batch == 0 || batch == 1) {\n      ASSERT_EQ(found_num, KEY_NUM);\n      ASSERT_EQ(err_times, 0);\n    } else {\n      if (found_num != KEY_NUM or err_times != 0) {\n        std::cout << \" [Thread \" << task_n << \"]\\t\"\n                  << \"Number of keys(insert/found/error) : \" << \"(\" << KEY_NUM\n                  << \"/\" << found_num << \"/\" << err_times << \") \\t\";\n        print_thread_id = true;\n      }\n    }\n    if (current_capacity != table->capacity() && !capacity_silent) {\n      if (!print_thread_id) std::cout << \" [Thread \" << task_n << \"]\\t\";\n\n      std::cout << \"The capacity changed from \" << current_capacity << \" to \"\n                << table->capacity() << std::endl;\n    } else if (print_thread_id) {\n      std::cout << std::endl;\n    }\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    CUDA_CHECK(cudaStreamDestroy(stream));\n\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n\n    CUDA_CHECK(cudaFreeHost(h_keys));\n    CUDA_CHECK(cudaFreeHost(h_found));\n    CUDA_CHECK(cudaFreeHost(h_vectors));\n\n    CUDA_CHECK(cudaFree(d_keys));\n    CUDA_CHECK(cudaFree(d_vectors));\n    CUDA_CHECK(cudaFree(d_found));\n    CUDA_CHECK(cudaDeviceSynchronize());\n\n    CudaCheckError();\n  };\n  auto worker2 = [&table, KEY_NUM, options, capacity_silent](int batch,\n                                                             int task_n) {\n    K* h_keys;\n    V* h_vectors;\n    bool* h_found;\n\n    size_t current_capacity = table->capacity();\n\n    CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * 
sizeof(K)));\n    CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));\n\n    K* d_keys;\n    V* d_vectors;\n    V* d_new_vectors;\n    bool* d_found;\n\n    CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));\n    CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMalloc(&d_new_vectors, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n\n    cudaStream_t stream;\n    CUDA_CHECK(cudaStreamCreate(&stream));\n\n    test_util::create_random_keys<K, S, V, DIM>(h_keys, nullptr, h_vectors,\n                                                KEY_NUM);\n    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n    CUDA_CHECK(cudaMemset(d_new_vectors, 2, KEY_NUM * sizeof(V) * options.dim));\n\n    table->insert_or_assign(KEY_NUM, d_keys, d_vectors, nullptr, stream);\n    table->insert_or_assign(KEY_NUM, d_keys, d_new_vectors, nullptr, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    table->find(KEY_NUM, d_keys, d_vectors, d_found, nullptr, stream);\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    int found_num = 0;\n\n    CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));\n    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),\n                          cudaMemcpyDeviceToHost));\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n\n    
CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n    thread_local bool print_unequal{false};\n    thread_local uint64_t err_times{0};\n    uint32_t i_value = 0x2020202;\n    for (int i = 0; i < KEY_NUM; i++) {\n      if (h_found[i]) {\n        found_num++;\n        for (int j = 0; j < options.dim; j++) {\n          if (batch == 2) {\n            if (h_vectors[i * options.dim + j] !=\n                *(reinterpret_cast<float*>(&i_value))) {\n              if (!print_unequal) {\n                std::cout << \" [Thread \" << task_n << \"]\\t\";\n                UNEQUAL_EXPR(h_vectors[i * options.dim + j],\n                             *(reinterpret_cast<float*>(&i_value)));\n                print_unequal = true;\n              }\n              err_times += 1;\n            }\n          } else {\n            ASSERT_EQ(h_vectors[i * options.dim + j],\n                      *(reinterpret_cast<float*>(&i_value)));\n          }\n        }\n      }\n    }\n\n    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n    table->contains(KEY_NUM, d_keys, d_found, stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    int contains_num = 0;\n    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                          cudaMemcpyDeviceToHost));\n    for (int i = 0; i < KEY_NUM; i++) {\n      if (h_found[i]) contains_num++;\n    }\n    ASSERT_EQ(contains_num, found_num);\n\n    bool print_thread_id{false};\n    if (batch == 0 || batch == 1) {\n      ASSERT_EQ(found_num, KEY_NUM);\n      ASSERT_EQ(err_times, 0);\n    } else {\n      if (found_num != KEY_NUM or err_times != 0) {\n        std::cout << \" [Thread \" << task_n << \"]\\t\"\n                  << \"Number of keys(insert/found/error) : \" << \"(\" << KEY_NUM\n                  << \"/\" << found_num << \"/\" << err_times << \") \\t\";\n        print_thread_id = true;\n      }\n    }\n  
  if (current_capacity != table->capacity() && !capacity_silent) {\n      if (!print_thread_id) std::cout << \" [Thread \" << task_n << \"]\\t\";\n\n      std::cout << \"The capacity changed from \" << current_capacity << \" to \"\n                << table->capacity() << std::endl;\n    } else if (print_thread_id) {\n      std::cout << std::endl;\n    }\n\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n\n    CUDA_CHECK(cudaStreamDestroy(stream));\n\n    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyDeviceToHost));\n\n    CUDA_CHECK(cudaFreeHost(h_keys));\n    CUDA_CHECK(cudaFreeHost(h_found));\n    CUDA_CHECK(cudaFreeHost(h_vectors));\n\n    CUDA_CHECK(cudaFree(d_keys));\n    CUDA_CHECK(cudaFree(d_vectors));\n    CUDA_CHECK(cudaFree(d_new_vectors));\n    CUDA_CHECK(cudaFree(d_found));\n    CUDA_CHECK(cudaDeviceSynchronize());\n\n    CudaCheckError();\n  };\n\n  /* the table is relatively idle, and assume there is no eviction */\n  int batch = 0;\n  std::cout << \"[Batch 0] \" << BATCH_0_SIZE << \" threads\\n\";\n  for (int i = 0; i < BATCH_0_SIZE; i += 2) {\n    threads.emplace_back(std::thread(worker1, batch, i));\n    threads.emplace_back(std::thread(worker2, batch, i + 1));\n  }\n  for (auto& th : threads) {\n    th.join();\n  }\n  threads.clear();\n\n  /* test the correctness of APIs serially */\n  batch = 1;\n  std::cout << \"[Batch 1] \" << BATCH_1_SIZE << \" threads\\n\";\n  for (int i = BATCH_0_SIZE; i < BATCH_0_SIZE + BATCH_1_SIZE; i += 2) {\n    auto th = std::thread(worker1, batch, i);\n    th.join();\n    th = std::thread(worker2, batch, i + 1);\n    th.join();\n  }\n\n  /* eviction may occur */\n  batch = 2;\n  std::cout << \"[Batch 2] \" << BATCH_2_SIZE << \" threads\\n\";\n  for (int i = BATCH_0_SIZE + BATCH_1_SIZE; i < THREAD_N; i += 2) {\n    threads.emplace_back(std::thread(worker1, batch, i));\n    threads.emplace_back(std::thread(worker2, batch, i + 
1));\n  }\n  for (auto& th : threads) {\n    th.join();\n  }\n  ASSERT_EQ(table->capacity(), MAX_CAPACITY);\n}\n\ntemplate <typename K, typename V, typename S, typename Table, size_t dim = 64>\nvoid CheckInsertOrAssignValues(Table* table, K* keys, V* values, S* scores,\n                               size_t len, cudaStream_t stream) {\n  std::map<K, test_util::ValueArray<V, dim>> map_before_insert;\n  std::map<K, test_util::ValueArray<V, dim>> map_after_insert;\n  K* h_tmp_keys = nullptr;\n  V* h_tmp_values = nullptr;\n  S* h_tmp_scores = nullptr;\n\n  K* d_tmp_keys = nullptr;\n  V* d_tmp_values = nullptr;\n  S* d_tmp_scores = nullptr;\n\n  size_t table_size_before = table->size(stream);\n  size_t cap = table_size_before + len;\n\n  CUDA_CHECK(cudaMallocAsync(&d_tmp_keys, cap * sizeof(K), stream));\n  CUDA_CHECK(cudaMemsetAsync(d_tmp_keys, 0, cap * sizeof(K), stream));\n  CUDA_CHECK(cudaMallocAsync(&d_tmp_values, cap * dim * sizeof(V), stream));\n  CUDA_CHECK(cudaMemsetAsync(d_tmp_values, 0, cap * dim * sizeof(V), stream));\n  CUDA_CHECK(cudaMallocAsync(&d_tmp_scores, cap * sizeof(S), stream));\n  CUDA_CHECK(cudaMemsetAsync(d_tmp_scores, 0, cap * sizeof(S), stream));\n  h_tmp_keys = (K*)malloc(cap * sizeof(K));\n  h_tmp_values = (V*)malloc(cap * dim * sizeof(V));\n  h_tmp_scores = (S*)malloc(cap * sizeof(S));\n\n  size_t table_size_verify0 = table->export_batch(\n      table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);\n  ASSERT_EQ(table_size_before, table_size_verify0);\n\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,\n                             table_size_before * sizeof(K),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,\n                             table_size_before * dim * sizeof(V),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,\n                             table_size_before * 
sizeof(S),\n                             cudaMemcpyDeviceToHost, stream));\n\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n\n  for (size_t i = 0; i < table_size_verify0; i++) {\n    test_util::ValueArray<V, dim>* vec =\n        reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +\n                                                         i * dim);\n    map_before_insert[h_tmp_keys[i]] = *vec;\n  }\n\n  auto start = std::chrono::steady_clock::now();\n  table->insert_or_assign(len, keys, values, nullptr, stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  auto end = std::chrono::steady_clock::now();\n  auto diff = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);\n\n  float dur = diff.count();\n\n  size_t table_size_after = table->size(stream);\n  size_t table_size_verify1 = table->export_batch(\n      table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);\n\n  ASSERT_EQ(table_size_verify1, table_size_after);\n\n  size_t new_cap = table_size_after;\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,\n                             table_size_after * sizeof(K),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,\n                             table_size_after * dim * sizeof(V),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,\n                             table_size_after * sizeof(S),\n                             cudaMemcpyDeviceToHost, stream));\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  int64_t new_cap_K = (int64_t)new_cap;\n  for (int64_t i = new_cap_K - 1; i >= 0; i--) {\n    test_util::ValueArray<V, dim>* vec =\n        reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +\n                                                         i * dim);\n    map_after_insert[h_tmp_keys[i]] = *vec;\n  }\n\n  size_t value_diff_cnt = 0;\n  for (auto& it : map_after_insert) 
{\n    test_util::ValueArray<V, dim>& vec = map_after_insert.at(it.first);\n    for (size_t j = 0; j < dim; j++) {\n      if (vec[j] != static_cast<float>(it.first * 0.00001)) {\n        ++value_diff_cnt;\n        break;\n      }\n    }\n  }\n  ASSERT_EQ(value_diff_cnt, 0);\n  std::cout << \"Check insert behavior got value_diff_cnt: \" << value_diff_cnt\n            << \", while table_size_before: \" << table_size_before\n            << \", while table_size_after: \" << table_size_after\n            << \", while len: \" << len << std::endl;\n\n  CUDA_CHECK(cudaFreeAsync(d_tmp_keys, stream));\n  CUDA_CHECK(cudaFreeAsync(d_tmp_values, stream));\n  CUDA_CHECK(cudaFreeAsync(d_tmp_scores, stream));\n  free(h_tmp_keys);\n  free(h_tmp_values);\n  free(h_tmp_scores);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n}\n\nvoid test_insert_or_assign_values_check(size_t max_hbm_for_vectors) {\n  const size_t U = 524288;\n  const size_t init_capacity = 1024;\n  const size_t B = 524288 + 13;\n  constexpr size_t dim = 64;\n\n  TableOptions opt;\n\n  opt.max_capacity = U;\n  opt.init_capacity = init_capacity;\n  opt.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;\n  opt.dim = 64;\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  std::unique_ptr<Table> table = std::make_unique<Table>();\n  table->init(opt);\n\n  test_util::KVMSBuffer<K, V, S> data_buffer;\n  data_buffer.Reserve(B, dim, stream);\n\n  size_t offset = 0;\n  S score = 0;\n  for (int i = 0; i < 20; i++) {\n    test_util::create_random_keys<K, S, V, dim>(\n        data_buffer.keys_ptr(false), data_buffer.scores_ptr(false),\n        data_buffer.values_ptr(false), (int)B, B * 16);\n    data_buffer.SyncData(true, stream);\n\n    CheckInsertOrAssignValues<K, V, S, Table, dim>(\n        table.get(), data_buffer.keys_ptr(), data_buffer.values_ptr(),\n        data_buffer.scores_ptr(), B, stream);\n\n    offset += B;\n    score 
+= 1;\n  }\n}\n\nvoid test_bucket_size(bool load_scores = true) {\n  constexpr uint64_t INIT_CAPACITY = 128 * 1024UL;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr uint64_t KEY_NUM = 128UL;\n  constexpr uint64_t TEST_TIMES = 1;\n  constexpr uint32_t DIM = 4;\n\n  K* h_keys;\n  S* h_scores;\n  V* h_vectors;\n  bool* h_found;\n\n  TableOptions options;\n\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.max_hbm_for_vectors = nv::merlin::GB(16);\n  options.reserved_key_start_bit = 1;\n  options.num_of_buckets_per_alloc = 2;\n  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\n  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));\n\n  K* d_keys;\n  S* d_scores = nullptr;\n  V* d_vectors;\n  bool* d_found;\n\n  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));\n  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));\n  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));\n\n  uint64_t lowerBound = 8;\n  uint64_t upperBound = 2048;\n  for (uint64_t bucket_max_size = lowerBound; bucket_max_size <= upperBound;\n       bucket_max_size *= 2) {\n    options.max_bucket_size = bucket_max_size;\n    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n\n    test_util::create_random_keys<K, S, V>(options.dim, h_keys, h_scores,\n                                           h_vectors, KEY_NUM);\n    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),\n                          cudaMemcpyHostToDevice));\n    CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),\n                          cudaMemcpyHostToDevice));\n    
CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,\n                          KEY_NUM * sizeof(V) * options.dim,\n                          cudaMemcpyHostToDevice));\n\n    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n\n    cudaStream_t stream;\n    CUDA_CHECK(cudaStreamCreate(&stream));\n\n    uint64_t total_size = 0;\n    for (int i = 0; i < TEST_TIMES; i++) {\n      std::unique_ptr<Table> table = std::make_unique<Table>();\n      table->init(options);\n\n      total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, 0);\n\n      table->insert_or_assign(KEY_NUM, d_keys, d_vectors, d_scores, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n\n      total_size = table->size(stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      ASSERT_EQ(total_size, KEY_NUM);\n\n      CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));\n      CUDA_CHECK(cudaMemset(d_scores, 0, KEY_NUM * sizeof(S)));\n      if (load_scores) {\n        table->find(KEY_NUM, d_keys, d_vectors, d_found, d_scores, stream);\n      } else {\n        table->find(KEY_NUM, d_keys, d_vectors, d_found, nullptr, stream);\n      }\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      int found_num = 0;\n      CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                            cudaMemcpyDeviceToHost));\n      CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),\n                            cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,\n                            KEY_NUM * sizeof(V) * options.dim,\n                            cudaMemcpyDeviceToHost));\n\n      for (int i = 0; i < KEY_NUM; i++) {\n        if (h_found[i]) found_num++;\n        if (load_scores) ASSERT_EQ(h_scores[i], h_keys[i]);\n        for (int j = 0; j < options.dim; j++) {\n          ASSERT_EQ(h_vectors[i * options.dim + j],\n                    static_cast<float>(h_keys[i] 
* 0.00001));\n        }\n      }\n      ASSERT_EQ(found_num, KEY_NUM);\n\n      CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));\n      table->contains(KEY_NUM, d_keys, d_found, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      int contains_num = 0;\n      CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),\n                            cudaMemcpyDeviceToHost));\n      for (int i = 0; i < KEY_NUM; i++) {\n        if (h_found[i]) contains_num++;\n      }\n      ASSERT_EQ(contains_num, found_num);\n    }\n    CUDA_CHECK(cudaStreamDestroy(stream));\n  }\n\n  CUDA_CHECK(cudaFreeHost(h_keys));\n  CUDA_CHECK(cudaFreeHost(h_scores));\n  CUDA_CHECK(cudaFreeHost(h_vectors));\n  CUDA_CHECK(cudaFreeHost(h_found));\n\n  CUDA_CHECK(cudaFree(d_keys));\n  CUDA_CHECK(cudaFree(d_scores));\n  CUDA_CHECK(cudaFree(d_vectors));\n  CUDA_CHECK(cudaFree(d_found));\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n\nTEST(MerlinHashTableTest, test_export_batch_if) {\n  test_export_batch_if(16);\n  test_export_batch_if(0);\n}\nTEST(MerlinHashTableTest, test_insert_or_assign_multi_threads) {\n  test_insert_or_assign_multi_threads(16, 0.25f, 0.125f);\n  test_insert_or_assign_multi_threads(16, 0.375f, 0.125f);\n  test_insert_or_assign_multi_threads(0, 0.25f, 0.125f);\n  test_insert_or_assign_multi_threads(0, 0.375f, 0.125f);\n}\nTEST(MerlinHashTableTest, test_basic) {\n  test_basic(16);\n  test_basic(0);\n}\nTEST(MerlinHashTableTest, test_basic_without_rehash) {\n  test_basic_without_rehash(16);\n  test_basic_without_rehash(0);\n}\nTEST(MerlinHashTableTest, test_bucket_size) { test_bucket_size(); }\nTEST(MerlinHashTableTest, test_find_using_pipeline) {\n  test_find_using_pipeline<int32_t>(224, true);\n  test_find_using_pipeline<uint32_t>(202, true);\n  test_find_using_pipeline<float>(129, true);\n\n  test_find_using_pipeline<float>(128, true);\n  test_find_using_pipeline<int32_t>(66, false);\n  test_find_using_pipeline<uint32_t>(3, 
false);\n  test_find_using_pipeline<double>(3, true);\n\n  test_find_using_pipeline<int16_t>(128, true);\n  test_find_using_pipeline<int8_t>(66, false);\n  test_find_using_pipeline<uint16_t>(3, false);\n  test_find_using_pipeline<uint8_t>(3, true);\n}\nTEST(MerlinHashTableTest, test_basic_when_full) {\n  test_basic_when_full(16);\n  test_basic_when_full(0);\n}\nTEST(MerlinHashTableTest, test_erase_if_pred) {\n  test_erase_if_pred<EraseIfVersion::V1>(16);\n  test_erase_if_pred<EraseIfVersion::V1>(0);\n  test_erase_if_pred<EraseIfVersion::V2>(16);\n  test_erase_if_pred<EraseIfVersion::V3>(16);\n}\nTEST(MerlinHashTableTest, test_rehash) {\n  test_rehash(16);\n  test_rehash(0);\n}\nTEST(MerlinHashTableTest, test_rehash_on_big_batch_specific) {\n  test_rehash_on_big_batch_specific(16);\n  test_rehash_on_big_batch_specific(0);\n}\nTEST(MerlinHashTableTest, test_rehash_on_big_batch) {\n  test_rehash_on_big_batch(16);\n  test_rehash_on_big_batch(0);\n}\nTEST(MerlinHashTableTest, test_dynamic_rehash_on_multi_threads) {\n  test_dynamic_rehash_on_multi_threads(16);\n  test_dynamic_rehash_on_multi_threads(0);\n}\nTEST(MerlinHashTableTest, test_basic_for_cpu_io) { test_basic_for_cpu_io(); }\n\nTEST(MerlinHashTableTest, test_evict_strategy_lru_basic) {\n  test_evict_strategy_lru_basic(16);\n  test_evict_strategy_lru_basic(0);\n}\n\nTEST(MerlinHashTableTest, test_evict_strategy_lfu_basic) {\n  test_evict_strategy_lfu_basic(16);\n  // TODO(rhdong): Add back when diff error issue fixed in hybrid mode.\n  // test_evict_strategy_lfu_basic(0);\n}\n\nTEST(MerlinHashTableTest, test_evict_strategy_epochlru_basic) {\n  test_evict_strategy_epochlru_basic(16);\n  test_evict_strategy_epochlru_basic(0);\n}\n\nTEST(MerlinHashTableTest, test_evict_strategy_epochlfu_basic) {\n  test_evict_strategy_epochlfu_basic(16);\n  test_evict_strategy_epochlfu_basic(0);\n}\n\nTEST(MerlinHashTableTest, test_evict_strategy_customized_basic) {\n  test_evict_strategy_customized_basic(16);\n  
 test_evict_strategy_customized_basic(0);\n}\n\nTEST(MerlinHashTableTest, test_evict_strategy_customized_advanced) {\n  test_evict_strategy_customized_advanced(16);\n  test_evict_strategy_customized_advanced(0);\n}\n\nTEST(MerlinHashTableTest, test_evict_strategy_customized_correct_rate) {\n  test_evict_strategy_customized_correct_rate(16);\n  // TODO(rhdong): after blossom CI issue is resolved, remove the skip logic.\n  const bool skip_hmem_check = (nullptr != std::getenv(\"IS_BLOSSOM_CI\"));\n  if (!skip_hmem_check) {\n    test_evict_strategy_customized_correct_rate(0);\n  } else {\n    std::cout << \"The HMEM check is skipped in blossom CI!\" << std::endl;\n  }\n}\n\nTEST(MerlinHashTableTest, test_insert_or_assign_values_check) {\n  test_insert_or_assign_values_check(16);\n  // TODO(rhdong): Add back when diff error issue fixed in hybrid mode.\n  test_insert_or_assign_values_check(0);\n}\n"
  },
  {
    "path": "tests/reserved_keys_test.cc.cu",
    "content": "/*\n * Copyright (c) 2024, NVIDIA CORPORATION.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n#include <gtest/gtest.h>\n#include <cstdint>\n#include \"merlin/types.cuh\"\n#include \"merlin/utils.cuh\"\n#include \"test_util.cuh\"\n\nusing namespace nv::merlin;\n\n__global__ void testReservedKeysKernel(uint64_t* keys, bool* results,\n                                       size_t numKeys) {\n  int idx = threadIdx.x + blockIdx.x * blockDim.x;\n  if (idx < numKeys) {\n    results[idx] = IS_RESERVED_KEY(keys[idx]);\n  }\n}\n\nvoid testCustomMemsetAsync() {\n  size_t numElements = 4;\n  uint64_t value = 0xFFFFFFFFFFFFFFF1;\n  uint64_t* devPtr;\n  uint64_t* hostData = new uint64_t[numElements];\n\n  cudaMalloc((void**)&devPtr, numElements * sizeof(uint64_t));\n  memset64Async(devPtr, value, numElements);\n  cudaMemcpy(hostData, devPtr, numElements * sizeof(uint64_t),\n             cudaMemcpyDeviceToHost);\n  for (size_t i = 0; i < numElements; i++) {\n    assert(hostData[i] == value);\n  }\n\n  std::cout << \"All values were set correctly!\" << std::endl;\n\n  cudaFree(devPtr);\n  delete[] hostData;\n}\n\nvoid testReservedKeys(uint64_t* testKeys, bool* expectedResults,\n                      size_t numKeys) {\n  uint64_t* d_keys;\n  bool* d_results;\n  bool* h_results = new bool[numKeys];\n\n  cudaMalloc(&d_keys, numKeys * sizeof(uint64_t));\n  cudaMalloc(&d_results, numKeys * sizeof(bool));\n\n  cudaMemcpy(d_keys, testKeys, numKeys 
* sizeof(uint64_t),\n             cudaMemcpyHostToDevice);\n\n  int blockSize = 256;\n  int numBlocks = (numKeys + blockSize - 1) / blockSize;\n\n  testReservedKeysKernel<<<numBlocks, blockSize>>>(d_keys, d_results, numKeys);\n  cudaDeviceSynchronize();\n\n  cudaMemcpy(h_results, d_results, numKeys * sizeof(bool),\n             cudaMemcpyDeviceToHost);\n\n  for (size_t i = 0; i < numKeys; i++) {\n    assert(h_results[i] == expectedResults[i]);\n  }\n\n  cudaFree(d_keys);\n  cudaFree(d_results);\n  delete[] h_results;\n  CudaCheckError();\n  std::cout << \"All tests passed.\" << std::endl;\n}\n\nvoid testKeyOptions() {\n  for (int i = 0; i <= MAX_RESERVED_KEY_BIT; i++) {\n    CUDA_CHECK(init_reserved_keys(i));\n    uint64_t host_reclaim_key, host_locked_key;\n    cudaMemcpyFromSymbol(&host_reclaim_key, RECLAIM_KEY, sizeof(uint64_t));\n    cudaMemcpyFromSymbol(&host_locked_key, LOCKED_KEY, sizeof(uint64_t));\n\n    uint64_t testKeys[6] = {EMPTY_KEY_CPU, host_reclaim_key, host_locked_key,\n                            UINT64_C(0x0), UINT64_C(0x10),   DEFAULT_EMPTY_KEY};\n    bool expectedResults[6] = {true,  true,  true,\n                               false, false, (i == 0) ? true : false};\n    testReservedKeys(testKeys, expectedResults, 4);\n  }\n}\n\nTEST(ReservedKeysTest, testKeyOptions) {\n  testKeyOptions();\n  testCustomMemsetAsync();\n}"
  },
  {
    "path": "tests/save_and_load_test.cc.cu",
    "content": "/*\n * Copyright (c) 2022, NVIDIA CORPORATION.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n#include <gtest/gtest.h>\n#include <stdio.h>\n#include \"merlin/types.cuh\"\n#include \"merlin_hashtable.cuh\"\n#include \"merlin_localfile.hpp\"\n#include \"test_util.cuh\"\n\nconstexpr uint64_t DIM = 64;\nusing K = int64_t;\nusing S = uint64_t;\nusing V = float;\nusing EvictStrategy = nv::merlin::EvictStrategy;\nusing TableOptions = nv::merlin::HashTableOptions;\n\ntemplate <typename Table>\nvoid test_save_to_file() {\n  std::string prefix = \"checkpoint\";\n  size_t keynum = 1 * 1024 * 1024;\n  size_t capacity = 2 * 1024 * 1024;\n  size_t buffer_size = 1024 * 1024;\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  K* h_keys = nullptr;\n  V* h_vectors = nullptr;\n  S* h_scores = nullptr;\n  CUDA_CHECK(cudaMallocHost(&h_keys, keynum * sizeof(K)));\n  CUDA_CHECK(cudaMallocHost(&h_vectors, keynum * sizeof(V) * DIM));\n  CUDA_CHECK(cudaMallocHost(&h_scores, keynum * sizeof(S)));\n  memset(h_keys, 0, keynum * sizeof(K));\n  memset(h_vectors, 0, keynum * sizeof(V) * DIM);\n  memset(h_scores, 0, keynum * sizeof(S));\n  test_util::create_random_keys<K, S>(h_keys, h_scores, keynum);\n  printf(\"Pass create random keys.\\n\");\n\n  K* d_keys = nullptr;\n  V* d_vectors = nullptr;\n  S* d_scores = nullptr;\n  test_util::getBufferOnDevice(&d_keys, keynum * sizeof(K), stream);\n  test_util::getBufferOnDevice(&d_vectors, 
keynum * sizeof(V) * DIM, stream);\n  test_util::getBufferOnDevice(&d_scores, keynum * sizeof(S), stream);\n  CUDA_CHECK(cudaMemcpyAsync(d_keys, h_keys, keynum * sizeof(K),\n                             cudaMemcpyHostToDevice, stream));\n  CUDA_CHECK(cudaMemcpyAsync(d_vectors, h_vectors, keynum * sizeof(V) * DIM,\n                             cudaMemcpyHostToDevice, stream));\n  CUDA_CHECK(cudaMemcpyAsync(d_scores, h_scores, keynum * sizeof(S),\n                             cudaMemcpyHostToDevice, stream));\n  printf(\"Create buffers.\\n\");\n\n  TableOptions options;\n  options.init_capacity = capacity;\n  options.max_capacity = capacity;\n  options.dim = DIM;\n\n  std::unique_ptr<Table> table_0 = std::make_unique<Table>();\n  std::unique_ptr<Table> table_1 = std::make_unique<Table>();\n  table_0->init(options);\n  table_1->init(options);\n  printf(\"Init tables.\\n\");\n\n  S global_epoch = 101;\n  S* temp_score = (Table::evict_strategy == EvictStrategy::kLru ||\n                   Table::evict_strategy == EvictStrategy::kEpochLru)\n                      ? 
nullptr\n                      : d_scores;\n  table_0->set_global_epoch(global_epoch);\n  table_0->insert_or_assign(keynum, d_keys, d_vectors, temp_score, stream);\n  printf(\"Fill table_0.\\n\");\n  nv::merlin::LocalKVFile<K, V, S> file;\n  std::string keys_path = prefix + \".keys\";\n  std::string values_path = prefix + \".values\";\n  std::string scores_path = prefix + \".scores\";\n  file.open(keys_path, values_path, scores_path, \"wb\");\n  table_0->save(&file, buffer_size, stream);\n  file.close();\n  printf(\"table_0 saves.\\n\");\n  file.open(keys_path, values_path, scores_path, \"rb\");\n  table_1->load(&file, buffer_size, stream);\n  file.close();\n  printf(\"table_1 loads.\\n\");\n  bool check_score = !(Table::evict_strategy == EvictStrategy::kLru ||\n                       Table::evict_strategy == EvictStrategy::kEpochLru);\n  ASSERT_TRUE((test_util::tables_equal<K, V, S, Table>(\n      table_0.get(), table_1.get(), check_score, stream)));\n  printf(\"table_0 and table_1 are equal.\\n\");\n  CUDA_FREE_POINTERS(stream, d_keys, d_vectors, d_scores, h_keys, h_vectors,\n                     h_scores);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n}\n\nTEST(SaveAndLoadTest, test_save_and_load_on_lru) {\n  test_save_to_file<nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>>();\n}\nTEST(SaveAndLoadTest, test_save_and_load_on_lfu) {\n  test_save_to_file<nv::merlin::HashTable<K, V, S, EvictStrategy::kLfu>>();\n}\nTEST(SaveAndLoadTest, test_save_and_load_on_epochlru) {\n  test_save_to_file<nv::merlin::HashTable<K, V, S, EvictStrategy::kEpochLru>>();\n}\nTEST(SaveAndLoadTest, test_save_and_load_on_epochlfu) {\n  test_save_to_file<nv::merlin::HashTable<K, V, S, EvictStrategy::kEpochLfu>>();\n}\nTEST(SaveAndLoadTest, test_save_and_load_on_customized) {\n  test_save_to_file<\n      nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>>();\n}\n"
  },
  {
    "path": "tests/test_util.cuh",
    "content": "/*\n * Copyright (c) 2022, NVIDIA CORPORATION.\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n#pragma once\n\n#include <assert.h>\n#include <math.h>\n#include <stdio.h>\n#include <stdlib.h>\n#include <algorithm>\n#include <iomanip>\n#include <iostream>\n#include <random>\n#include <string>\n#include <thread>\n#include <unordered_map>\n#include <unordered_set>\n#include \"merlin/utils.cuh\"\n#include \"merlin_hashtable.cuh\"\n\n#define UNEQUAL_EXPR(expr1, expr2)                             \\\n  {                                                            \\\n    std::cout << __FILE__ << \":\" << __LINE__ << \":Unequal\\n\"   \\\n              << \"\\t\\t\" << #expr1 << \" != \" << #expr2 << \"\\n\"; \\\n  }\n\n#define MERLIN_EXPECT_TRUE(cond, msg)                                    \\\n  if ((cond) == false) {                                                 \\\n    fprintf(stderr, \"[ERROR] %s at %s : %d\\n\", msg, __FILE__, __LINE__); \\\n    exit(-1);                                                            \\\n  }\n\nnamespace test_util {\n\ntemplate <class S>\n__global__ void host_nano_kernel(S* d_clk) {\n  S mclk;\n  asm volatile(\"mov.u64 %0,%%globaltimer;\" : \"=l\"(mclk));\n  *d_clk = mclk;\n}\n\ntemplate <class S>\nS host_nano(cudaStream_t stream = 0) {\n  S h_clk = 0;\n  S* d_clk;\n\n  CUDA_CHECK(cudaMalloc((void**)&(d_clk), sizeof(S)));\n  host_nano_kernel<S><<<1, 1, 0, stream>>>(d_clk);\n  
CUDA_CHECK(cudaStreamSynchronize(stream));\n\n  CUDA_CHECK(cudaMemcpy(&h_clk, d_clk, sizeof(S), cudaMemcpyDeviceToHost));\n  CUDA_CHECK(cudaFree(d_clk));\n  return h_clk;\n}\n\n__global__ void all_true(const bool* conds, size_t n, int* nfalse) {\n  const size_t stripe =\n      (n + gridDim.x - 1) /\n      gridDim.x;  // number of elements assigned to each block.\n  size_t start = blockIdx.x * stripe + threadIdx.x;\n  size_t end = min(start + stripe, n);\n\n  __shared__ int local_nfalse;\n  if (threadIdx.x == 0) {\n    local_nfalse = 0;\n  }\n  __syncthreads();\n\n  for (size_t i = start; i < end; i += blockDim.x) {\n    if (!conds[i]) {\n      atomicAdd(&local_nfalse, 1);\n    }\n  }\n  __syncthreads();\n  if (threadIdx.x == 0) {\n    atomicAdd(nfalse, local_nfalse);\n  }\n}\n\ntemplate <typename T>\n__global__ void all_equal(T* a, T* b, size_t n, int* ndiff) {\n  const size_t stripe =\n      (n + gridDim.x - 1) /\n      gridDim.x;  // number of elements assigned to each block.\n  size_t start = blockIdx.x * stripe + threadIdx.x;\n  size_t end = min(start + stripe, n);\n\n  __shared__ int local_ndiff;\n  if (threadIdx.x == 0) {\n    local_ndiff = 0;\n  }\n  __syncthreads();\n\n  for (size_t i = start; i < end; i += blockDim.x) {\n    if (a[i] != b[i]) {\n      atomicAdd(&local_ndiff, 1);\n    }\n  }\n  __syncthreads();\n  if (threadIdx.x == 0) {\n    atomicAdd(ndiff, local_ndiff);\n  }\n}\n\nuint64_t getTimestamp() {\n  return std::chrono::duration_cast<std::chrono::milliseconds>(\n             std::chrono::system_clock::now().time_since_epoch())\n      .count();\n}\n\ntemplate <class K, class S>\nvoid create_random_keys(K* h_keys, S* h_scores, int KEY_NUM,\n                        int freq_range = 1000) {\n  std::unordered_set<K> numbers;\n  std::random_device rd;\n  std::mt19937_64 eng(rd());\n  std::uniform_int_distribution<K> distr;\n  int i = 0;\n\n  while (numbers.size() < KEY_NUM) {\n    numbers.insert(distr(eng));\n  }\n  for (const K num : numbers) {\n    
h_keys[i] = num;\n    h_scores[i] = num % freq_range;\n    i++;\n  }\n}\n\ntemplate <class K, class S, class V, size_t DIM = 16>\nvoid create_random_keys(K* h_keys, S* h_scores, V* h_vectors, int KEY_NUM,\n                        size_t range = std::numeric_limits<uint64_t>::max()) {\n  std::unordered_set<K> numbers;\n  std::random_device rd;\n  std::mt19937_64 eng(rd());\n  std::uniform_int_distribution<K> distr;\n  int i = 0;\n\n  while (numbers.size() < KEY_NUM) {\n    numbers.insert(distr(eng) % range);\n  }\n  for (const K num : numbers) {\n    h_keys[i] = num;\n    if (h_scores != nullptr) {\n      h_scores[i] = num;\n    }\n    if (h_vectors != nullptr) {\n      for (size_t j = 0; j < DIM; j++) {\n        h_vectors[i * DIM + j] = static_cast<float>(num * 0.00001);\n      }\n    }\n    i++;\n  }\n}\n\ntemplate <class K>\nvoid create_random_bools(bool* bools, int KEY_NUM, float true_ratio = 0.6) {\n  std::random_device rd;\n  std::mt19937_64 eng(rd());\n  std::uniform_int_distribution<K> distr;\n\n  for (int i = 0; i < KEY_NUM; i++) {\n    K bound = 1000 * true_ratio;\n    bools[i] = (distr(eng) % 1000 < bound);\n  }\n}\n\ntemplate <class K, class S, class V>\nvoid create_random_keys(size_t dim, K* h_keys, S* h_scores, V* h_vectors,\n                        int KEY_NUM,\n                        size_t range = std::numeric_limits<uint64_t>::max()) {\n  std::unordered_set<K> numbers;\n  std::random_device rd;\n  std::mt19937_64 eng(rd());\n  std::uniform_int_distribution<K> distr;\n  int i = 0;\n\n  while (numbers.size() < KEY_NUM) {\n    numbers.insert(distr(eng) % range);\n  }\n  for (const K num : numbers) {\n    h_keys[i] = num;\n    if (h_scores != nullptr) {\n      h_scores[i] = num;\n    }\n    if (h_vectors != nullptr) {\n      for (size_t j = 0; j < dim; j++) {\n        h_vectors[i * dim + j] = static_cast<V>(num * 0.00001);\n      }\n    }\n    i++;\n  }\n}\n\ntemplate <class K, class S, class V>\nvoid create_random_keys_advanced(\n    size_t dim, K* 
h_keys, S* h_scores, V* h_vectors, int KEY_NUM,\n    size_t range = std::numeric_limits<uint64_t>::max(), int freq_range = 10) {\n  std::unordered_set<K> numbers;\n  std::random_device rd;\n  std::mt19937_64 eng(rd());\n  std::uniform_int_distribution<K> distr;\n  int i = 0;\n\n  while (numbers.size() < KEY_NUM) {\n    numbers.insert(distr(eng) % range);\n  }\n  for (const K num : numbers) {\n    h_keys[i] = num;\n    if (h_scores != nullptr) {\n      h_scores[i] = num % freq_range;\n    }\n    if (h_vectors != nullptr) {\n      for (size_t j = 0; j < dim; j++) {\n        h_vectors[i * dim + j] = static_cast<float>(num * 0.00001);\n      }\n    }\n    i++;\n  }\n}\n\ntemplate <class K, class S, class V>\nvoid create_random_keys_advanced(\n    size_t dim, K* h_keys, K* pre_h_keys, S* h_scores, V* h_vectors,\n    int KEY_NUM, size_t range = std::numeric_limits<uint64_t>::max(),\n    int freq_range = 10, float repeat_rate = 0.9) {\n  std::unordered_set<K> numbers;\n  std::random_device rd;\n  std::mt19937_64 eng(rd());\n  std::uniform_int_distribution<K> distr;\n  std::mt19937_64 eng_switch(rd());\n  std::uniform_int_distribution<K> distr_switch;\n  int i = 0;\n  int pre_pos = 0;\n\n  while (numbers.size() < KEY_NUM) {\n    bool repeated = static_cast<K>(distr_switch(eng_switch) % 100000) <\n                    static_cast<K>(repeat_rate * 100000);\n    if (repeated) {\n      numbers.insert(pre_h_keys[pre_pos++]);\n    } else {\n      numbers.insert(distr(eng) % range);\n    }\n  }\n  for (const K num : numbers) {\n    h_keys[i] = num;\n    if (h_scores != nullptr) {\n      h_scores[i] = num % freq_range;\n    }\n    if (h_vectors != nullptr) {\n      for (size_t j = 0; j < dim; j++) {\n        h_vectors[i * dim + j] = static_cast<float>(num * 0.00001);\n      }\n    }\n    i++;\n  }\n}\n\ninline uint64_t Murmur3HashHost(const uint64_t& key) {\n  uint64_t k = key;\n  k ^= k >> 33;\n  k *= UINT64_C(0xff51afd7ed558ccd);\n  k ^= k >> 33;\n  k *= 
UINT64_C(0xc4ceb9fe1a85ec53);\n  k ^= k >> 33;\n  return k;\n}\n\ntemplate <class K, class S, class V, size_t DIM = 16>\nvoid create_continuous_keys(K* h_keys, S* h_scores, V* h_vectors, int KEY_NUM,\n                            K start = 1) {\n  for (K i = 0; i < KEY_NUM; i++) {\n    h_keys[i] = start + static_cast<K>(i);\n    h_scores[i] = h_keys[i];\n    if (h_vectors != nullptr) {\n      for (size_t j = 0; j < DIM; j++) {\n        h_vectors[i * DIM + j] = static_cast<V>(h_keys[i] * 0.00001);\n      }\n    }\n  }\n}\n\ntemplate <class K, class S, class V, size_t DIM = 16>\nvoid create_keys_in_one_buckets(K* h_keys, S* h_scores, V* h_vectors,\n                                int KEY_NUM, int capacity,\n                                int bucket_max_size = 128, int bucket_idx = 0,\n                                K min = 0,\n                                K max = static_cast<K>(0xFFFFFFFFFFFFFFFD)) {\n  std::unordered_set<K> numbers;\n  std::random_device rd;\n  std::mt19937_64 eng(rd());\n  std::uniform_int_distribution<K> distr;\n  K candidate;\n  K hashed_key;\n  size_t global_idx;\n  size_t bkt_idx;\n  int i = 0;\n\n  while (numbers.size() < KEY_NUM) {\n    candidate = (distr(eng) % (max - min)) + min;\n    hashed_key = Murmur3HashHost(candidate);\n    global_idx = hashed_key & (capacity - 1);\n    bkt_idx = global_idx / bucket_max_size;\n    if (bkt_idx == bucket_idx) {\n      numbers.insert(candidate);\n    }\n  }\n  for (const K num : numbers) {\n    h_keys[i] = num;\n    if (h_scores != nullptr) {\n      h_scores[i] = num;\n    }\n    for (size_t j = 0; j < DIM; j++) {\n      *(h_vectors + i * DIM + j) = static_cast<float>(num * 0.00001);\n    }\n    i++;\n  }\n}\n\ntemplate <class K, class S, class V, size_t DIM = 16>\nvoid create_keys_in_one_buckets_lfu(K* h_keys, S* h_scores, V* h_vectors,\n                                    int KEY_NUM, int capacity,\n                                    int bucket_max_size = 128,\n                                    
int bucket_idx = 0, K min = 0,\n                                    K max = static_cast<K>(0xFFFFFFFFFFFFFFFD),\n                                    int freq_range = 1000) {\n  std::unordered_set<K> numbers;\n  std::random_device rd;\n  std::mt19937_64 eng(rd());\n  std::uniform_int_distribution<K> distr;\n  K candidate;\n  K hashed_key;\n  size_t global_idx;\n  size_t bkt_idx;\n  int i = 0;\n\n  while (numbers.size() < KEY_NUM) {\n    candidate = (distr(eng) % (max - min)) + min;\n    hashed_key = Murmur3HashHost(candidate);\n    global_idx = hashed_key & (capacity - 1);\n    bkt_idx = global_idx / bucket_max_size;\n    if (bkt_idx == bucket_idx) {\n      numbers.insert(candidate);\n    }\n  }\n  for (const K num : numbers) {\n    h_keys[i] = num;\n    if (h_scores != nullptr) {\n      h_scores[i] = num % freq_range;\n    }\n    for (size_t j = 0; j < DIM; j++) {\n      *(h_vectors + i * DIM + j) = static_cast<float>(num * 0.00001);\n    }\n    i++;\n  }\n}\n\ntemplate <class S>\nS make_expected_score_for_epochlfu(S global_epoch, S original_score) {\n  bool if_overflow = (original_score >= static_cast<S>(0xFFFFFFFF));\n  return ((global_epoch << 32) | (if_overflow ? 
(static_cast<S>(0xFFFFFFFF))\n                                              : original_score & 0xFFFFFFFF));\n}\n\ntemplate <typename T>\nvoid getBufferOnDevice(T** ptr, size_t size, cudaStream_t stream) {\n  MERLIN_EXPECT_TRUE((*ptr == nullptr), \"Pointer is already assigned.\");\n  CUDA_CHECK(cudaMallocAsync(ptr, size, stream));\n  CUDA_CHECK(cudaMemsetAsync(*ptr, 0, size, stream));\n}\n\nvoid freeBufferOnDevice(void* ptr, cudaStream_t stream) {\n  CUDA_CHECK(cudaFreeAsync(ptr, stream));\n  ptr = nullptr;\n}\n\ntemplate <typename T, size_t DIM>\nstruct ValueArray {\n public:\n  T data[DIM];\n\n  __host__ __device__ T sum() {\n    T s = 0;\n    for (size_t i = 0; i < DIM; i++) {\n      s += data[i];\n    }\n  }\n\n  __host__ __device__ T operator[](size_t i) { return data[i]; }\n};\n\ntemplate <typename T>\nstruct HostAndDeviceBuffer {\n public:\n  void Alloc(size_t n, cudaStream_t stream = 0) {\n    if (d_data) {\n      CUDA_FREE_POINTERS(stream, d_data);\n    }\n    if (h_data) {\n      free(h_data);\n      h_data = nullptr;\n    }\n    if (d_data) {\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      d_data = nullptr;\n    }\n    getBufferOnDevice(&d_data, n * sizeof(T), stream);\n    h_data = (T*)malloc(n * sizeof(T));\n    size_ = n;\n  }\n\n  ~HostAndDeviceBuffer() {\n    CUDA_CHECK(cudaDeviceSynchronize());\n    Free();\n    CUDA_CHECK(cudaDeviceSynchronize());\n  }\n\n  void Free(cudaStream_t stream = 0) {\n    if (d_data) {\n      CUDA_FREE_POINTERS(stream, d_data);\n    }\n    if (h_data) {\n      free(h_data);\n      h_data = nullptr;\n    }\n    if (d_data) {\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n      d_data = nullptr;\n    }\n    size_ = 0;\n  }\n\n  void SetFromHost(const T* data, size_t n, cudaStream_t stream = 0) {\n    CUDA_CHECK(cudaMemcpyAsync(d_data, data, n * sizeof(T),\n                               cudaMemcpyHostToDevice, stream));\n    memcpy(h_data, data, n * sizeof(T));\n  }\n\n  void SetFromDevice(const T* data, 
size_t n, cudaStream_t stream = 0) {\n    CUDA_CHECK(cudaMemcpyAsync(d_data, data, n * sizeof(T),\n                               cudaMemcpyDeviceToDevice, stream));\n    CUDA_CHECK(cudaMemcpyAsync(h_data, data, n * sizeof(T),\n                               cudaMemcpyDeviceToHost, stream));\n  }\n\n  bool SetValueInRange(T start, T skip, size_t stripe,\n                       cudaStream_t stream = 0) {\n    if (!h_data || skip == 0 || stripe == 0 || size_ % stripe != 0) {\n      return false;\n    }\n\n    size_t n_stripe = size_ / stripe;\n    for (size_t i = 0; i < n_stripe; i++) {\n      T value = start + static_cast<T>(i) * skip;\n      for (size_t j = 0; j < stripe; j++) {\n        h_data[i * stripe + j] = value;\n      }\n    }\n    CUDA_CHECK(cudaMemcpyAsync(d_data, h_data, size_ * sizeof(T),\n                               cudaMemcpyHostToDevice, stream));\n    return true;\n  }\n\n  void ToZeros(cudaStream_t stream = 0) {\n    CUDA_CHECK(cudaMemsetAsync(d_data, 0, size_ * sizeof(T), stream));\n    memset(h_data, 0, size_ * sizeof(T));\n  }\n\n  void ToConst(const T val, cudaStream_t stream) {\n    for (size_t i = 0; i < size_; i++) {\n      h_data[i] = val;\n    }\n    CUDA_CHECK(cudaMemcpyAsync(d_data, h_data, size_ * sizeof(T),\n                               cudaMemcpyHostToDevice, stream));\n  }\n\n  void SyncData(bool h2d, cudaStream_t stream = 0) {\n    if (h2d) {\n      CUDA_CHECK(cudaMemcpyAsync(d_data, h_data, size_ * sizeof(T),\n                                 cudaMemcpyHostToDevice, stream));\n    } else {\n      CUDA_CHECK(cudaMemcpyAsync(h_data, d_data, size_ * sizeof(T),\n                                 cudaMemcpyDeviceToHost, stream));\n    }\n  }\n\n public:\n  T* h_data = nullptr;\n  T* d_data = nullptr;\n  size_t size_ = 0;\n};\n\ntemplate <typename K, typename V, typename S>\nstruct KVMSBuffer {\n public:\n  KVMSBuffer() : len_(0), dim_(0) {}\n\n  void Reserve(size_t n, size_t dim, cudaStream_t stream = 0) {\n    keys.Alloc(n, 
stream);\n    values.Alloc(n * dim, stream);\n    scores.Alloc(n, stream);\n    status.Alloc(n, stream);\n    len_ = n;\n    dim_ = dim;\n  }\n\n  ~KVMSBuffer() {\n    CUDA_CHECK(cudaDeviceSynchronize());\n    Free();\n    CUDA_CHECK(cudaDeviceSynchronize());\n  }\n\n  void Free(cudaStream_t stream = 0) {\n    keys.Free(stream);\n    values.Free(stream);\n    scores.Free(stream);\n    status.Free(stream);\n    len_ = 0;\n  }\n\n  size_t len() const { return len_; }\n  size_t dim() const { return dim_; }\n\n  void ToRange(size_t start, size_t skip = 1, cudaStream_t stream = 0) {\n    keys.SetValueInRange(static_cast<K>(start), static_cast<K>(skip), 1,\n                         stream);\n    values.SetValueInRange(static_cast<V>(start), static_cast<V>(skip), dim_,\n                           stream);\n    status.ToZeros(stream);\n  }\n\n  void ToZeros(cudaStream_t stream) {\n    keys.ToZeros(stream);\n    values.ToZeros(stream);\n    scores.ToZeros(stream);\n    status.ToZeros(stream);\n  }\n\n  void Setscore(const S score, cudaStream_t stream) {\n    scores.ToConst(score, stream);\n  }\n\n  K* keys_ptr(bool on_device = true) {\n    if (on_device) {\n      return keys.d_data;\n    }\n    return keys.h_data;\n  }\n\n  V* values_ptr(bool on_device = true) {\n    if (on_device) {\n      return values.d_data;\n    }\n    return values.h_data;\n  }\n\n  S* scores_ptr(bool on_device = true) {\n    if (on_device) {\n      return scores.d_data;\n    }\n    return scores.h_data;\n  }\n\n  bool* status_ptr(bool on_device = true) {\n    if (on_device) {\n      return status.d_data;\n    }\n    return status.h_data;\n  }\n\n  void SyncData(bool h2d, cudaStream_t stream = 0) {\n    keys.SyncData(h2d, stream);\n    values.SyncData(h2d, stream);\n    scores.SyncData(h2d, stream);\n    status.SyncData(h2d, stream);\n  }\n\n  void CopyFrom(KVMSBuffer<K, V, S>& src, cudaStream_t stream = 0) {\n    memcpy(keys_ptr(false), src.keys_ptr(false), sizeof(K) * len());\n    
memcpy(scores_ptr(false), src.scores_ptr(false), sizeof(S) * len());\n    memcpy(values_ptr(false), src.values_ptr(false), sizeof(V) * len() * dim());\n    keys.SyncData(true, stream);\n    values.SyncData(true, stream);\n    scores.SyncData(true, stream);\n    status.SyncData(true, stream);\n  }\n\n  void CopyFromByRate(KVMSBuffer<K, V, S>& src, float repeat_rate,\n                      cudaStream_t stream = 0) {\n    memcpy(keys_ptr(false), src.keys_ptr(false), sizeof(K) * len());\n    memcpy(scores_ptr(false), src.scores_ptr(false), sizeof(S) * len());\n    memcpy(values_ptr(false), src.values_ptr(false), sizeof(V) * len() * dim());\n    keys.SyncData(true, stream);\n    values.SyncData(true, stream);\n    scores.SyncData(true, stream);\n    status.SyncData(true, stream);\n  }\n\n public:\n  HostAndDeviceBuffer<K> keys;\n  HostAndDeviceBuffer<V> values;\n  HostAndDeviceBuffer<S> scores;\n  HostAndDeviceBuffer<bool> status;\n  size_t dim_;\n  size_t len_;\n};\n\nbool allTrueGpu(const bool* conds, size_t n, cudaStream_t stream) {\n  int nfalse = 0;\n  int* d_nfalse = nullptr;\n  getBufferOnDevice(&d_nfalse, sizeof(int), stream);\n  int block_size = 128;\n  int grid_size = (n + block_size - 1) / block_size;\n  all_true<<<grid_size, block_size, 0, stream>>>(conds, n, d_nfalse);\n  CUDA_CHECK(cudaMemcpyAsync(&nfalse, d_nfalse, sizeof(int),\n                             cudaMemcpyDeviceToHost, stream));\n  cudaStreamSynchronize(stream);\n  freeBufferOnDevice(d_nfalse, stream);\n  cudaStreamSynchronize(stream);\n  return nfalse == 0;\n}\n\ntemplate <typename T>\nbool allEqualGpu(T* a, T* b, size_t n, cudaStream_t stream) {\n  int ndiff = 0;\n  int* d_ndiff = nullptr;\n  getBufferOnDevice(&d_ndiff, sizeof(int), stream);\n  int block_size = 128;\n  int grid_size = (n + block_size - 1) / block_size;\n  all_equal<<<grid_size, block_size, 0, stream>>>(a, b, n, d_ndiff);\n  CUDA_CHECK(cudaMemcpyAsync(&ndiff, d_ndiff, sizeof(int),\n                             
cudaMemcpyDeviceToHost, stream));\n  freeBufferOnDevice(d_ndiff, stream);\n  cudaStreamSynchronize(stream);\n  return ndiff == 0;\n}\n\ntemplate <typename K, typename V, typename S, typename Table>\nbool tables_equal(Table* a, Table* b, bool check_score, cudaStream_t stream) {\n  size_t size = a->size(stream);\n  if (size != b->size(stream)) {\n    return false;\n  }\n\n  if (a->dim() != b->dim()) {\n    return false;\n  }\n\n  size_t* d_size = nullptr;\n  K* d_keys = nullptr;\n  V* d_vectors = nullptr;\n  S* d_scores = nullptr;\n  bool* d_founds_in_b = nullptr;\n  V* d_vectors_in_b = nullptr;\n  S* d_scores_in_b = nullptr;\n\n  getBufferOnDevice(&d_size, sizeof(size_t), stream);\n  getBufferOnDevice(&d_keys, sizeof(K) * size, stream);\n  getBufferOnDevice(&d_vectors, sizeof(V) * size * a->dim(), stream);\n  getBufferOnDevice(&d_scores, sizeof(S) * size, stream);\n  getBufferOnDevice(&d_founds_in_b, sizeof(bool) * size, stream);\n  getBufferOnDevice(&d_vectors_in_b, sizeof(V) * size * a->dim(), stream);\n  getBufferOnDevice(&d_scores_in_b, sizeof(S) * size, stream);\n\n  a->export_batch(a->capacity(), 0, d_size, d_keys, d_vectors, d_scores,\n                  stream);\n  b->find(size, d_keys, d_vectors_in_b, d_founds_in_b, d_scores_in_b, stream);\n  if (!allTrueGpu(d_founds_in_b, size, stream)) {\n    CUDA_FREE_POINTERS(stream, d_size, d_keys, d_vectors, d_scores,\n                       d_founds_in_b, d_vectors_in_b, d_scores_in_b);\n    return false;\n  }\n  if (check_score && !allEqualGpu<S>(d_scores, d_scores_in_b, size, stream)) {\n    CUDA_FREE_POINTERS(stream, d_size, d_keys, d_vectors, d_scores,\n                       d_founds_in_b, d_vectors_in_b, d_scores_in_b);\n    return false;\n  }\n  if (!allEqualGpu(d_vectors, d_vectors_in_b, size * a->dim(), stream)) {\n    CUDA_FREE_POINTERS(stream, d_size, d_keys, d_vectors, d_scores,\n                       d_founds_in_b, d_vectors_in_b, d_scores_in_b);\n    return false;\n  }\n  return true;\n}\n\ntemplate 
<typename T, std::size_t N>\nstd::array<T, N> range(const T start) {\n  std::array<T, N> result;\n  size_t i = 0;\n  while (i < N) {\n    result[i] = start + i;\n    i++;\n  }\n  return result;\n}\n\ntemplate <class T>\nclass HostBuffer {\n public:\n  HostBuffer(const size_t size = 1) : ptr_(nullptr) {\n    if (!ptr_) {\n      size_ = size;\n      ptr_ = reinterpret_cast<T*>(malloc(sizeof(T) * size_));\n    }\n  }\n  ~HostBuffer() {\n    try {\n      if (!ptr_) free(ptr_);\n    } catch (const nv::merlin::CudaException& e) {\n      cerr << \"[HierarchicalKV] Failed to free HostBuffer!\" << endl;\n    }\n  }\n\n  __inline__ T* alloc_or_reuse(const size_t size = 0) {\n    if (size > size_) {\n      free(ptr_);\n      size_ = size;\n      reinterpret_cast<T*>(malloc(sizeof(T) * size_));\n    }\n    return ptr_;\n  }\n\n  __inline__ T* ptr() { return ptr_; }\n\n private:\n  T* ptr_;\n  size_t size_;\n};\n\ntemplate <class V>\n__global__ void read_from_ptr_kernel(const V* const* __restrict src,\n                                     V* __restrict dst, const size_t dim,\n                                     size_t N) {\n  size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x;\n\n  for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) {\n    int vec_index = int(t / dim);\n    int dim_index = t % dim;\n    if (src[vec_index]) {\n      dst[vec_index * dim + dim_index] = src[vec_index][dim_index];\n    }\n  }\n}\n\ntemplate <class V>\nvoid read_from_ptr(const V* const* __restrict src, V* __restrict dst,\n                   const size_t dim, size_t n, cudaStream_t stream) {\n  const size_t block_size = 1024;\n  const size_t N = n * dim;\n  const size_t grid_size = nv::merlin::SAFE_GET_GRID_SIZE(N, block_size);\n\n  read_from_ptr_kernel<V>\n      <<<grid_size, block_size, 0, stream>>>(src, dst, dim, N);\n}\n\ntemplate <class V>\n__global__ void array2ptr_kernel(V** ptr, V* __restrict array, const size_t dim,\n                                 size_t N) {\n  size_t tid = 
(blockIdx.x * blockDim.x) + threadIdx.x;\n\n  for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) {\n    int vec_index = int(t);\n    ptr[vec_index] = array + vec_index * dim;\n  }\n}\n\ntemplate <class V>\nvoid array2ptr(V** ptr, V* __restrict array, const size_t dim, size_t n,\n               cudaStream_t stream) {\n  const size_t block_size = 1024;\n  const size_t N = n;\n  const size_t grid_size = nv::merlin::SAFE_GET_GRID_SIZE(N, block_size);\n\n  array2ptr_kernel<V><<<grid_size, block_size, 0, stream>>>(ptr, array, dim, N);\n}\n\ntemplate <class V>\n__global__ void read_or_write_ptr_kernel(V** __restrict src, V* __restrict dst,\n                                         bool* read_or_write, const size_t dim,\n                                         size_t N) {\n  size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x;\n\n  for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) {\n    int vec_index = int(t / dim);\n    int dim_index = t % dim;\n    if (!src[vec_index]) continue;\n    if (read_or_write[vec_index]) {\n      dst[vec_index * dim + dim_index] = src[vec_index][dim_index];\n    } else {\n      src[vec_index][dim_index] = dst[vec_index * dim + dim_index];\n    }\n  }\n}\n\ntemplate <class V>\nvoid read_or_write_ptr(V** __restrict src, V* __restrict dst,\n                       bool* read_or_write, const size_t dim, size_t n,\n                       cudaStream_t stream) {\n  const size_t block_size = 1024;\n  const size_t N = n * dim;\n  const size_t grid_size = nv::merlin::SAFE_GET_GRID_SIZE(N, block_size);\n\n  read_or_write_ptr_kernel<V>\n      <<<grid_size, block_size, 0, stream>>>(src, dst, read_or_write, dim, N);\n}\n\n}  // namespace test_util\n"
  },
  {
    "path": "tests/uint32_score_test.cc.cu",
    "content": "#include <gtest/gtest.h>\n#include <algorithm>\n#include <cstdint>\n#include <iostream>\n#include <limits>\n#include <memory>\n#include <unordered_map>\n#include \"merlin_hashtable.cuh\"\n#include \"test_util.cuh\"\n\nconstexpr size_t DIM = 8;\nconstexpr uint64_t CAPACITY = 1024;\nconstexpr uint64_t KEY_NUM = 256;\n\nusing K = uint64_t;\nusing V = float;\nusing S = uint32_t;\nusing TableOptions = nv::merlin::HashTableOptions;\nusing EvictStrategy = nv::merlin::EvictStrategy;\nusing Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;\n\nnamespace {\n\nTableOptions default_options() {\n  TableOptions options;\n  options.init_capacity = CAPACITY;\n  options.max_capacity = CAPACITY;\n  options.dim = DIM;\n  options.max_bucket_size = 128;\n  options.max_hbm_for_vectors = nv::merlin::GB(1);\n  return options;\n}\n\nvoid fill_sequential(test_util::KVMSBuffer<K, V, S>& buffer) {\n  for (size_t i = 0; i < buffer.len(); ++i) {\n    K key = static_cast<K>(i + 1);\n    buffer.keys.h_data[i] = key;\n    buffer.scores.h_data[i] = static_cast<S>(key);\n    for (size_t j = 0; j < buffer.dim(); ++j) {\n      buffer.values.h_data[i * buffer.dim() + j] =\n          static_cast<V>(key * 0.00001f);\n    }\n  }\n}\n\n}  // namespace\n\nTEST(Uint32ScoreTest, FindOrInsertAndFind) {\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  std::unique_ptr<Table> table = std::make_unique<Table>();\n  table->init(default_options());\n\n  test_util::KVMSBuffer<K, V, S> input;\n  input.Reserve(KEY_NUM, DIM, stream);\n  fill_sequential(input);\n  input.SyncData(true, stream);\n\n  table->find_or_insert(KEY_NUM, input.keys_ptr(), input.values_ptr(),\n                        input.scores_ptr(), stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n\n  size_t size = table->size(stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  ASSERT_EQ(size, KEY_NUM);\n\n  test_util::KVMSBuffer<K, V, S> output;\n  output.Reserve(KEY_NUM, DIM, stream);\n  
output.ToZeros(stream);\n\n  table->find(KEY_NUM, input.keys_ptr(), output.values_ptr(),\n              output.status_ptr(), output.scores_ptr(), stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  output.SyncData(false, stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n\n  for (size_t i = 0; i < KEY_NUM; ++i) {\n    ASSERT_TRUE(output.status.h_data[i]);\n    ASSERT_EQ(output.scores.h_data[i], input.scores.h_data[i]);\n    for (size_t j = 0; j < DIM; ++j) {\n      ASSERT_EQ(output.values.h_data[i * DIM + j],\n                input.values.h_data[i * DIM + j]);\n    }\n  }\n\n  constexpr size_t MISSING_NUM = 16;\n  test_util::KVMSBuffer<K, V, S> missing;\n  missing.Reserve(MISSING_NUM, DIM, stream);\n  missing.ToZeros(stream);\n  for (size_t i = 0; i < MISSING_NUM; ++i) {\n    missing.keys.h_data[i] = static_cast<K>(KEY_NUM + 1000 + i);\n  }\n  missing.SyncData(true, stream);\n\n  table->find(MISSING_NUM, missing.keys_ptr(), missing.values_ptr(),\n              missing.status_ptr(), missing.scores_ptr(), stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  missing.SyncData(false, stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n\n  for (size_t i = 0; i < MISSING_NUM; ++i) {\n    ASSERT_FALSE(missing.status.h_data[i]);\n  }\n\n  CUDA_CHECK(cudaStreamDestroy(stream));\n  CudaCheckError();\n}\n\nTEST(Uint32ScoreTest, AssignScoresAndExport) {\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  std::unique_ptr<Table> table = std::make_unique<Table>();\n  table->init(default_options());\n\n  test_util::KVMSBuffer<K, V, S> input;\n  input.Reserve(KEY_NUM, DIM, stream);\n  fill_sequential(input);\n  input.SyncData(true, stream);\n\n  table->find_or_insert(KEY_NUM, input.keys_ptr(), input.values_ptr(),\n                        input.scores_ptr(), stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n\n  for (size_t i = 0; i < KEY_NUM; ++i) {\n    input.scores.h_data[i] = static_cast<S>(1000 + i);\n  }\n  input.scores.h_data[0] = 
static_cast<S>(0);\n  input.scores.h_data[1] = std::numeric_limits<S>::max();\n  input.scores.h_data[2] = static_cast<S>(1);\n  input.scores.h_data[3] = std::numeric_limits<S>::max() - 1;\n  input.scores.SyncData(true, stream);\n\n  table->assign_scores(KEY_NUM, input.keys_ptr(), input.scores_ptr(), stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n\n  test_util::KVMSBuffer<K, V, S> verify;\n  verify.Reserve(KEY_NUM, DIM, stream);\n  verify.ToZeros(stream);\n\n  table->find(KEY_NUM, input.keys_ptr(), verify.values_ptr(),\n              verify.status_ptr(), verify.scores_ptr(), stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  verify.SyncData(false, stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n\n  for (size_t i = 0; i < KEY_NUM; ++i) {\n    ASSERT_TRUE(verify.status.h_data[i]);\n    ASSERT_EQ(verify.scores.h_data[i], input.scores.h_data[i]);\n  }\n\n  const size_t capacity = table->capacity();\n  test_util::KVMSBuffer<K, V, S> exported;\n  exported.Reserve(capacity, DIM, stream);\n  exported.ToZeros(stream);\n\n  size_t dumped =\n      table->export_batch(capacity, 0, exported.keys_ptr(),\n                          exported.values_ptr(), exported.scores_ptr(), stream);\n  ASSERT_EQ(dumped, KEY_NUM);\n\n  exported.SyncData(false, stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n\n  std::unordered_map<K, S> expected_scores;\n  expected_scores.reserve(KEY_NUM);\n  for (size_t i = 0; i < KEY_NUM; ++i) {\n    expected_scores.emplace(static_cast<K>(i + 1), input.scores.h_data[i]);\n  }\n\n  for (size_t i = 0; i < dumped; ++i) {\n    K key = exported.keys.h_data[i];\n    auto it = expected_scores.find(key);\n    ASSERT_NE(it, expected_scores.end());\n    ASSERT_EQ(exported.scores.h_data[i], it->second);\n    expected_scores.erase(it);\n    for (size_t j = 0; j < DIM; ++j) {\n      ASSERT_EQ(exported.values.h_data[i * DIM + j],\n                static_cast<V>(key * 0.00001f));\n    }\n  }\n  ASSERT_TRUE(expected_scores.empty());\n\n  
CUDA_CHECK(cudaStreamDestroy(stream));\n  CudaCheckError();\n}\n\nTEST(Uint32ScoreTest, EvictCustomizedCorrectRateFull) {\n  constexpr uint64_t BATCH_SIZE = 1024 * 1024ul;\n  constexpr uint64_t STEPS = 128;\n  constexpr uint64_t MAX_BUCKET_SIZE = 128;\n  constexpr uint64_t INIT_CAPACITY = BATCH_SIZE * STEPS;\n  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;\n  constexpr float EXPECTED_CORRECT_RATE = 0.964f;\n  const int rounds = 6;\n\n  TableOptions options;\n  options.init_capacity = INIT_CAPACITY;\n  options.max_capacity = MAX_CAPACITY;\n  options.dim = DIM;\n  options.reserved_key_start_bit = 17;\n  options.num_of_buckets_per_alloc = 128;\n  options.max_bucket_size = MAX_BUCKET_SIZE;\n  options.max_hbm_for_vectors = nv::merlin::GB(16);\n\n  // Keep the HostBuffer owners alive for the whole test: calling ptr() on a\n  // temporary yields a pointer whose owner is destroyed at the end of the\n  // statement.\n  test_util::HostBuffer<K> h_keys_base_buf(BATCH_SIZE);\n  test_util::HostBuffer<S> h_scores_base_buf(BATCH_SIZE);\n  test_util::HostBuffer<V> h_vectors_base_buf(BATCH_SIZE * options.dim);\n  K* h_keys_base = h_keys_base_buf.ptr();\n  S* h_scores_base = h_scores_base_buf.ptr();\n  V* h_vectors_base = h_vectors_base_buf.ptr();\n\n  test_util::HostBuffer<K> h_keys_temp_buf(MAX_CAPACITY);\n  test_util::HostBuffer<S> h_scores_temp_buf(MAX_CAPACITY);\n  test_util::HostBuffer<V> h_vectors_temp_buf(MAX_CAPACITY * options.dim);\n  K* h_keys_temp = h_keys_temp_buf.ptr();\n  S* h_scores_temp = h_scores_temp_buf.ptr();\n  V* h_vectors_temp = h_vectors_temp_buf.ptr();\n\n  K* d_keys_temp = nullptr;\n  S* d_scores_temp = nullptr;\n  V* d_vectors_temp = nullptr;\n\n  CUDA_CHECK(cudaMalloc(&d_keys_temp, MAX_CAPACITY * sizeof(K)));\n  CUDA_CHECK(cudaMalloc(&d_scores_temp, MAX_CAPACITY * sizeof(S)));\n  CUDA_CHECK(\n      cudaMalloc(&d_vectors_temp, MAX_CAPACITY * sizeof(V) * options.dim));\n\n  cudaStream_t stream;\n  CUDA_CHECK(cudaStreamCreate(&stream));\n\n  std::unique_ptr<Table> table = std::make_unique<Table>();\n  table->init(options);\n\n  size_t total_size = table->size(stream);\n  CUDA_CHECK(cudaStreamSynchronize(stream));\n  ASSERT_EQ(total_size, 0);\n\n  size_t global_start_key = 100000;\n  size_t start_key = global_start_key;\n\n  for (int r = 0; r < rounds; ++r) {\n    const K expected_min_key =\n        static_cast<K>(global_start_key + INIT_CAPACITY 
* r);\n    const K expected_max_key =\n        static_cast<K>(global_start_key + INIT_CAPACITY * (r + 1) - 1);\n    const size_t expected_table_size =\n        (r == 0) ? static_cast<size_t>(EXPECTED_CORRECT_RATE * INIT_CAPACITY)\n                 : INIT_CAPACITY;\n\n    for (int s = 0; s < STEPS; ++s) {\n      test_util::create_continuous_keys<K, S, V, DIM>(\n          h_keys_base, h_scores_base, h_vectors_base, BATCH_SIZE, start_key);\n      start_key += BATCH_SIZE;\n\n      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base, BATCH_SIZE * sizeof(K),\n                            cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base,\n                            BATCH_SIZE * sizeof(S), cudaMemcpyHostToDevice));\n      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base,\n                            BATCH_SIZE * sizeof(V) * options.dim,\n                            cudaMemcpyHostToDevice));\n      table->insert_or_assign(BATCH_SIZE, d_keys_temp, d_vectors_temp,\n                              d_scores_temp, stream);\n      CUDA_CHECK(cudaStreamSynchronize(stream));\n    }\n\n    total_size = table->size(stream);\n    CUDA_CHECK(cudaStreamSynchronize(stream));\n    ASSERT_GE(total_size, expected_table_size);\n    ASSERT_EQ(MAX_CAPACITY, table->capacity());\n\n    size_t dump_counter = table->export_batch(\n        MAX_CAPACITY, 0, d_keys_temp, d_vectors_temp, d_scores_temp, stream);\n\n    CUDA_CHECK(cudaMemcpy(h_keys_temp, d_keys_temp, MAX_CAPACITY * sizeof(K),\n                          cudaMemcpyDefault));\n    CUDA_CHECK(cudaMemcpy(h_scores_temp, d_scores_temp,\n                          MAX_CAPACITY * sizeof(S), cudaMemcpyDefault));\n    CUDA_CHECK(cudaMemcpy(h_vectors_temp, d_vectors_temp,\n                          MAX_CAPACITY * sizeof(V) * options.dim,\n                          cudaMemcpyDefault));\n\n    ASSERT_EQ(total_size, dump_counter);\n    size_t bigger_score_counter = 0;\n    K max_key = 0;\n\n    for (size_t i = 0; i < 
dump_counter; ++i) {\n      ASSERT_EQ(h_scores_temp[i], static_cast<S>(h_keys_temp[i]));\n      max_key = std::max(max_key, h_keys_temp[i]);\n      if (h_scores_temp[i] >= static_cast<S>(expected_min_key)) {\n        bigger_score_counter++;\n      }\n      for (size_t j = 0; j < options.dim; ++j) {\n        const V expected = static_cast<V>(h_keys_temp[i] * 0.00001);\n        ASSERT_EQ(h_vectors_temp[i * options.dim + j], expected);\n      }\n    }\n\n    float correct_rate =\n        (bigger_score_counter * 1.0f) / static_cast<float>(MAX_CAPACITY);\n    std::cout << \"[Round \" << r << \"] \"\n              << \"correct_rate=\" << correct_rate << std::endl;\n    ASSERT_GE(max_key, expected_max_key);\n    ASSERT_GE(correct_rate, EXPECTED_CORRECT_RATE);\n  }\n\n  CUDA_CHECK(cudaStreamDestroy(stream));\n\n  CUDA_CHECK(cudaFree(d_keys_temp));\n  CUDA_CHECK(cudaFree(d_scores_temp));\n  CUDA_CHECK(cudaFree(d_vectors_temp));\n\n  CUDA_CHECK(cudaDeviceSynchronize());\n\n  CudaCheckError();\n}\n"
  }
]